#! /local/bin/perl -s
#
# ms2html	--- convert (pseudo) troff -ms text to HTML
#
# Converts an annotated text file into HTML
# The annotations are based on troff -ms macros,
# but may be generated in a variety of ways.
# For example, a Framemaker template (doc2ms.fm)
# can be applied to an existing document to
# insert the annotations for every paragraph type.
#
# A Framemaker template that implements the annotations is in:
# cui.unige.ch:PUBLIC/oscar/doc2ms.fm.Z
#
# A Table of Contents and links to references and numbered sections
# are automatically generated.
#
# Each of the following commands should appear on a line by itself
# (although arbitrary white space is tolerated, to facilitate
# translation from Framemaker files):
#
# Standard troff -ms:
#	.TL	Title
#	.ST	Subtitle
#	.AU	Author
#	.AI	Author's Institution
#	.AB	Abstract
#	.NH1	Numbered Section
#	.NH2	Numbered Subsection
#	.NH3	Numbered Subsubsection
#	.NH4	Numbered Subsubsubsection
#	.SH1	Unnumbered Section
#	.SH2	Unnumbered Subsection
#	.SH3	Unnumbered Subsubsection
#	.SH4	Unnumbered Subsubsubsection
#	.LP	Left Paragraph
#	.PP	Indented Paragraph
#	.IP	Indented Paragraph
#	.QP	Quotation
#	.FS	Footnote
#	.DS	Display Start
#	.\"	Comment
# Non-standard:
#	.BC	Block - Centred
#	.BH	Block - Hang Indented
#	.BU1	Bullet Item (level 1)
#	.BU2	Bullet Item (level 2)
#	.BU3	Bullet Item (level 3)
#	.BU4	Bullet Item (level 4)
#	.LL	Left Label (Bold .LP)
#	.NS	Start Numbered Paragraph
#	.NN	Next Numbered Paragraph
#	.MD	Math Definition
#	.ML	Math Lemma
#	.MP	Math Proposition
#	.MT	Math Theorem
#	.PR	Proof
#	.UR	Unnumbered Reference
#	.RF:1	Reference (number 1; the target of [RF:1] cross references)
#
# In addition, the following are understood:
#	[RF:1]	A cross reference:
#	\fB	start bold text
#	\fI	start italic text
#	\fC	start typewriter text
#	\fR	return to Roman text (also \fP)
#	\.	dot (at beginning of line)
#
# NB: note that .B and .I are not valid commands since they would
# start a new "paragraph" (this script splits text at command lines).
# Just convert ".B text" to "\fBtext\fR".
#
# Author: Oscar Nierstrasz -- oscar@cui.unige.ch -- June 1993
#
#v = "ms2html v1.1"; # 6.93
#v = "ms2html v1.2"; # 1.7.93	-- changed .DA to .BU3
#				-- tweaked file URL recognition
#v = "ms2html v1.3"; # 2.7.93	-- fixed separate numbering for NH and SH
#				-- added named TOC anchor
#v = "ms2html v1.4"; # 8.7.93	-- added .LL
#v = "ms2html v1.5";
#v = "ms2html v1.6"; # 4.8.93	-- made url'href into a library
$v = "ms2html v1.7"; # 25.8.93	-- added \. escape at beginning of line
#				-- changed TOC to use <UL>

unshift(@INC,"/user/u1/oscar/Cmd/PerlLib");
require("button.pl");
require("url.pl");

# Usage message, printed when no input file is named on the command line.
$usg = 'Usage: ms2html [-<options>] <msfile.ms> ...
	-b -- create a single body page only
	-p -- use plain text instead of button to navigate
';

if ($#ARGV < 0) { die($usg); }

# Read the input one command at a time: the record separator is a
# newline followed by a dot, i.e. the start of the next command line.
$/ = "\n.";

# Signature appended to every generated page (today's date as dd.mm.yy):
$date = `date +%d.%m.%y`;
chop($date);	# drop the trailing newline
$omn = '<A HREF="http://cui_www.unige.ch/OSG/omn.html"><I>OMN</I></A><P>';
$sig = "<I>This document was translated by $v on $date.</I>\n"
	. $omn . "<P>\n";

# Translate each input file into its own set of HTML pages:
#   <base>-toc.html	title page, with the table of contents appended
#   <base>-<sec>.html	one page per section (unless -b was given)
#   <base>-refs.html	the collected references, if there are any
foreach $FILE (@ARGV) {
	# Every file is a separate document, so start from a clean state.
	# Without this the section and theorem counters (and the title)
	# of one file would carry over into the next one.
	$TL = 0; # TOC level (starts at 0)
	$toc = $refs = $title = "";
	$n1 = $n2 = $n3 = $n4 = 0;	# numbered section counters
	$s1 = $s2 = $s3 = $s4 = 0;	# unnumbered section counters
	$md = $mt = $ml = $mp = 0;	# definition/theorem/lemma/proposition
	open(FILE,$FILE) || die "Can't open $FILE\n";
	($BASE = $FILE) =~ s/\.ms$//;	# drop the .ms suffix
	$TOC = $BASE . "-toc.html";	# the title page
	$CURR = $PREV = $TOC;		# current and previous pages
	&newpage($TOC); $inbody = 0;
	# with -b everything lives on the title page, so links to the
	# table of contents are page-relative:
	if ($b) { $TOTOC = ""; }
	else { $TOTOC = $TOC; }

	# some useful strings:
	$REFS = $BASE . "-refs.html";
	$totoc = "<I>To <A HREF=\"$TOTOC#TOC\">Table of Contents</A></I><P>\n\n";
	$torefs = "<I>To <A HREF=\"$REFS\">References</A></I><P>\n\n";

	# translate, one record (command plus text) at a time:
	while(<FILE>) { &ms2html; }
	close(FILE);

	# gracefully close the last body page:
	&lastbody;

	# put the collected table of contents at the end
	# of the title page (closing any lists still open in it):
	while ($TL > 0) { $toc .= "</UL>\n"; $TL--; }
	open(TOC, ">>$TOC") || die "Can't append to $TOC\n";
	print TOC "<H1><A NAME=\"TOC\">Table of Contents</A></H1>\n$toc\n";
	if ($refs =~ /./) { print TOC $torefs; }
	print TOC $sig;
	close(TOC);

	# if there are references, print them out on their own page:
	if ($refs =~ /./) {
		&newpage($REFS);
		&printtitle("References");
		print "<H2>References</H2>\n";
		print "<OL>\n$refs\n</OL>\n\n";
		&up; print "<P>\n";
		print $sig;
		close(STDOUT);
	}
}

# Make the text in $_ safe for HTML, in place:
# first escape the HTML metacharacters & < and >, then turn the
# backslash "dead-key" accent sequences (e.g. \'e  \`a  \^o  \:u  \,c)
# into the corresponding HTML character entities.
# Takes no arguments; the result is left in $_.
sub accent2html {
	# escape & < and > (the & must come first, otherwise the & of
	# the &lt; and &gt; entities would itself be escaped):
	s/\&/\&amp;/g;
	s/</\&lt;/g;
	s/>/\&gt;/g;
	# convert dead-key accents to HTML
	s/\\AE/\&AElig;/g;
	s/\\'([AEIOUYaeiouy])/\&$1acute;/g;
	# A circumflex may be written \^ or \<.  By this point the < of
	# the second form has already been escaped to &lt; above, so that
	# is the spelling that has to be matched here:
	s/\\(\^|\&lt;)([AEIOUaeiou])/\&$2circ;/g;
	s/\\`([AEIOUaeiou])/\&$1grave;/g;
	s/\\o([Aa])/\&$1ring;/g;
	s/\\~([ANOano])/\&$1tilde;/g;
	s/\\[:"]([AEIOUYaeiouy])/\&$1uml;/g;
	s/\\,([Cc])/\&$1cedil;/g;
	s/\\\/([Oo])/\&$1slash;/g;
	s/\\ss/\&szlig;/g;
}

# Translate one input record, held in $_, and print the resulting HTML
# on the current output page (STDOUT).
#
# A record is one command name followed by its text, which may span
# several lines.  The record is first cleaned up (separator removed,
# accents, font changes, URLs and [RF:n] cross references expanded),
# then split into the command (left in $_) and its text ($text), and
# finally dispatched on the command.
#
# Side effects besides printing:
#   - section commands (.NH1-4, .SH1-4) bump the section counters, add
#     an entry to the table of contents in $toc and, unless -b was
#     given, start a new body page;
#   - reference commands (.RF:n, .UR) append to $refs instead of printing;
#   - any command that is not a list item closes all open lists.
# Unknown commands are reported on STDERR and treated like .PP.
sub ms2html {
	s/^\.//;	# delete initial "." (only needed for first record)
	s/\n+\.//;	# delete the record separator
	s/\s+\n/\n/g;	# delete trailing white space
	s/\n\\\./\n./g;	# unescape leading dots
	&accent2html;	# expand accents
	s/\\f([IB])([^\\]*)\\f[RP]/<$1>$2<\/$1>/g;	# italics & bold
	s/\\fC([^\\]*)\\f[RP]/<CODE>$1<\/CODE>/g;	# code
	&url'href;

	# expand references into HTML links:
	s/\[RF:(\d*)\]/<A HREF="${REFS}#RF:$1">[$1]<\/A>/g;

	if (/^$/) { return; }	# blank record!?
	# separate the text from the command:
	# (afterwards $_ holds just the command name, $text the rest)
	$text = "";
	s/^(\S+)[ \t]+/$1\n/;
	s/^(\S+)\n// && do { $text = $_; $_ = $1; };
	# NB: s/^(\S+)\s+(.*)// doesn't work since the text may contain newlines.

	# anything other than a list item ends all currently open lists:
	&popall unless
		/^[LI][PL]/ || /^N[SN]$/ || /^BU[1234]/;
	/^TL$/ && do { $title = $text; &printtitle("Title Page"); return; };
	/^ST$/ && do { print "<B>$text</B><P>\n\n"; return; };
	/^AI$/ && do { print "<I>$text</I><P>\n\n"; return; };
	/^AU$/ && do { print "<B>$text</B><P>\n\n"; return; };
	/^AB$/ && do { print "<B>Abstract</B><P>\n\n$text<P>\n"; return; };
	/^PP$/ && do { print "$text<P>\n\n"; return; };
	/^BH$/ && do { print "<DL><DT>$text</DL>\n\n"; return; };
	/^BC$/ && do { print "<DL><DD>$text</DL>\n\n"; return; };

	/^FS$/ && do { print "<DL><DD><I>$text</I></DL>\n\n"; return; };
	/^QP$/ && do { print "<DL><DD><I>$text</I></DL>\n\n"; return; };
	/^DS$/ && do { print "<PRE>\n$text\n</PRE>\n\n"; return; };

	# don't distinguish LP, LL & IP for nesting purposes:
	/^LP$/ && do { &listitem("LP"); print "<DT>$text\n\n"; return; };
	/^LL$/ && do { &listitem("LP"); print "<DT><B>$text</B>\n\n"; return; };
	/^IP$/ && do { &listitem("LP"); print "<DD>$text\n\n"; return; };

	(/^N[SN]$/ || /^BU[1234]$/)
		&& do { &listitem($_); print "<LI>$text\n\n"; return; };

	# math environments; each kind is numbered with its own counter:
	/^MD$/ && do {
		$md++;
		print "<DL><DT><B>Definition $md</B>\n<DD>$text</DL>\n\n";
		return; };
	/^MT$/ && do {
		$mt++;
		print "<DL><DT><B>Theorem $mt</B>\n<DD>$text</DL>\n\n";
		return; };
	/^ML$/ && do {
		$ml++;
		print "<DL><DT><B>Lemma $ml</B>\n<DD>$text</DL>\n\n";
		return; };
	/^MP$/ && do {
		$mp++;
		print "<DL><DT><B>Proposition $mp</B>\n<DD>$text</DL>\n\n";
		return; };
	/^PR$/ && do {
		print "<DL><DT><B>Proof</B>\n<DD>$text</DL>\n\n";
		return; };

	if (/^([NS])H(\d)$/) {
		# skip if this is the reference section:
		# (the references get a generated page of their own)
		if (($text eq "References") || ($text eq "Bibliography"))
			{ return; };
		$stype = $1;	# numbered or unnumbered sections
		$H = $2;	# the header level

		# bump the counter for this level, restart the deeper ones,
		# and build the dotted section id:
		if ($H == 1) {
			if ($stype =~ /N/) { $n1++; $n2 = $n3 = $n4 = 0; $id = "$n1"; }
			else { $s1++; $s2 = $s3 = $s4 = 0; $id = "$s1"; }
		}
		elsif ($H == 2) {
			if ($stype =~ /N/) { $n2++; $n3 = $n4 = 0; $id = "$n1.$n2"; }
			else { $s2++; $s3 = $s4 = 0; $id = "$s1.$s2"; }
		}
		elsif ($H == 3) {
			if ($stype =~ /N/) { $n3++; $n4 = 0; $id = "$n1.$n2.$n3"; }
			else { $s3++; $s4 = 0; $id = "$s1.$s2.$s3"; }
		}
		elsif ($H == 4) {
			if ($stype =~ /N/) { $n4++; $id = "$n1.$n2.$n3.$n4"; }
			else { $s4++; $id = "$s1.$s2.$s3.$s4"; }
		}

		# open or close TOC lists to reach this header's level:
		while ($TL < $H) { $toc .= "<UL>\n"; $TL++; }
		while ($TL > $H) { $toc .= "</UL>\n"; $TL--; }

		$name = "${stype}-$id";		# unique anchor name
		# only numbered sections show their number:
		if ($stype =~ /N/) { $num = "$id "; }
		else { $num = ""; }
		# start a new page unless -b option was selected:
		if (!$b) {
			$NEXT = "$BASE-$name.html" ;
			&popall;
			&newbody($NEXT);
			&printtitle("${num}$text");
		}
		$inbody = 1;
		print "<H$H><A NAME=\"$name\">${num}$text</A></H$H>\n\n";
		if ($b) { print $totoc; }
		$toc .= "<LI><A HREF=\"${CURR}#$name\">${num}$text</A>\n";
		return;
	}

	# references are collected and printed after the whole file is read:
	/^(RF:\d+)$/ && do {
		$refs .= "\n<LI><A NAME=\"$1\">$text</A>\n\n";
		return; };
	/^UR$/ && do {
		# unnumbered, hence no anchor (and no closing </A>):
		$refs .= "\n<DT>$text\n\n";
		return; };
	/^\\"$/ && do { return; };	# troff comment
	# these are ignored:
	(/^AE$/ || /^FE$/ || /^DE$/) && do { return; };
	print STDERR "Unrecognized command (assuming .PP): \"$_ $text\"\n";
	print "$text<P>\n\n";
}

# Finish the body page currently being written and redirect output
# to the page named by the single argument.
sub newbody {
	# $NEXT is deliberately dynamically scoped: &right reads it.
	local($NEXT) = @_;
	&popall();
	# navigation aids and signature go on body pages only:
	$inbody && do {
		&left();
		&up();
		&right();
		print "<P>\n";
		print $sig;
	};
	close(STDOUT);
	# the page just finished becomes the "previous" page:
	($PREV, $CURR) = ($CURR, $NEXT);
	&newpage($CURR);
}

# Finish the final body page of a document: close any open lists,
# print the navigation aids and the signature, and close the page.
sub lastbody {
	# dynamically scoped on purpose: &right reads $NEXT
	local($NEXT);
	&popall();
	&left();
	&up();
	# a "next" pointer is offered only when a references page follows:
	if ($refs =~ /./) {
		$NEXT = $REFS;
		&right();
	}
	print "<P>\n";
	# clean up:
	print $sig;
	close(STDOUT);
}

# Redirect STDOUT to a freshly created (or truncated) file, named by
# the single argument, and report the file name on STDERR.
# Dies if the file cannot be opened for writing.
sub newpage {
	local($PAGE) = @_;
	unless (open(STDOUT, ">$PAGE")) {
		die "Can't create $PAGE";
	}
	print STDERR "Created $PAGE\n";
}

# Called for every list item with the item's list type.  Adjusts the
# stack of open lists (@lstack) so that the item lands in a list of
# its own type:
#   - innermost open list already has this type: nothing to do;
#   - the list enclosing the innermost one has this type: close the
#     innermost list;
#   - otherwise (including when no list is open): open a new list.
sub listitem {
	local($ltype) = @_;	# type of the item about to be printed
	local($top) = $#lstack;	# index of the innermost list, -1 if none

	# already in the right kind of list:
	return if ($top >= 0 && $lstack[$top] eq $ltype);

	if ($top > 0 && $lstack[$top - 1] eq $ltype) {
		&poplist();
	}
	else {
		&newlist($ltype);
	}
}

# Open a list of the given type: print its opening tag and push the
# type on @lstack.  "NS" (start of a numbered list) is recorded as
# "NN", so that the following .NN items see a list of their own type.
# A type with no known tag is pushed without printing anything.
sub newlist {
	local($ltype) = @_;	# this list item type
	if ($ltype eq "NS" || $ltype eq "NN") {
		print "<OL>\n";
		$ltype = "NN";
	}
	elsif ($ltype =~ /BU[1234]/) {
		print "<UL>\n";
	}
	elsif ($ltype =~ /LP/) {
		print "<DL>\n";
	}
	push(@lstack, $ltype);
}

# Close the innermost open list: remove its type from @lstack and
# print the matching closing tag.  A type with no known closing tag
# is reported on STDERR (this should never happen).
sub poplist {
	local($ltype) = pop(@lstack);
	local($endtag) = "";

	if ($ltype eq "NN") { $endtag = "</OL>\n\n"; }
	elsif ($ltype =~ /BU[1234]/) { $endtag = "</UL>\n\n"; }
	elsif ($ltype =~ /LP/) { $endtag = "</DL>\n"; }

	if ($endtag ne "") {
		print $endtag;
	}
	else {
		print STDERR "poplist error: unknown list type \"$ltype\"\n";
	}
}

# Close every list that is still open, innermost first,
# leaving @lstack empty.
sub popall {
	until ($#lstack < 0) {
		&poplist();
	}
}

# Print the head of a page: a <TITLE> made of the document title
# ($title) and the page name given as argument, followed by the
# document title as a level 1 heading.
sub printtitle {
	local($name) = @_;	# name of this particular page
	print "<TITLE>$title -- $name</TITLE>\n\n",
		"<H1>$title</H1>\n\n";
}

# Navigation aids.  With -p ($p set) they are plain text links,
# otherwise they are produced by &button.

# link to the table of contents:
sub up {
	if (!$p) { &button("up", "$TOC#TOC"); }
	else { print $totoc; }
}
# link to the previous page ($PREV); plain text with -p, else a button:
sub left {
	if (!$p) {
		&button("left", "$PREV");
	}
	else {
		print "<I>To <A HREF=\"$PREV\">Previous Page</A></I><P>\n\n";
	}
}
# link to the next page ($NEXT, as set by the caller); plain text
# with -p, else a button:
sub right {
	if (!$p) {
		&button("right", "$NEXT");
	}
	else {
		print "<I>To <A HREF=\"$NEXT\">Next Page</A></I><P>\n\n";
	}
}

__END__

