#!/local/bin/perl -s
#
# catpar	--- extract HTML paragraphs for WWW catalogue
#
# Processes interesting HTML pages so that they can be
# searched using parscan.  (Each "paragraph" must be a
# standalone block of HTML text, and paragraphs must be separated
# by blank lines.)  Sources should be retrieved using hget -abs
# (to convert relative URLs to absolute ones).
#
# Oscar Nierstrasz oscar@cui.unige.ch 4/9/93
#
# Revised 8/10/93
# 24/11/93 -- fixed nwn() to drop <i>...</I> in date
# 21/01/94 -- made HREF and sig arguments

$usg = 'Usage: catpar -<style> <input> <URL> <signature>
	-nwn -- NCSA What\'s New
	-nsp -- Network Starting Points
	-isl -- Internet Services List
	-ihl -- Internet Hypertext List
	-cvl -- CERN Virtual Library
	-nmi -- NCSA Meta Index
	-msp -- Marcus Speh C++ Docs
	-mis -- Multimedia Information Sources
	-ali -- AliWeb ';

if ($#ARGV != 2) { die "$usg"; }

($input,$href,$key) = @ARGV;
$sig = "<A HREF=\"$href\">$key</A>";
open(INPUT,$input) || die "catpar: Can't open $input";

if ($nwn) { &whatsnew; }
elsif ($nsp) { &starting; }
elsif ($isl) { &isl; }
elsif ($ihl) { &ihl; }
elsif ($cvl) { &item; }
elsif ($cmc) { &cmc; }
elsif ($nmi) { &item; }
elsif ($msp) { &msp; }
elsif ($mis) { &mis; }
elsif ($ali) { &ali; }
else { die "$usg"; }

# CERN's Virtual Library Subject Catalog
# http://info.cern.ch./hypertext/DataSources/bySubject/Overview.html
# http://info.cern.ch./hypertext/DataSources/ByAccess.html

# More or-less generic routine to bundle up
# description and list items as signed paragraphs:
sub item {
	local($h,$count,$exit);
	while (<INPUT>) {
		/^<H[123]>(.*)<\/H[123]>/ && ($h = "<B>$1:</B> ") && next;
		# for CERN server list:
		/^NAME=.*">(.*)<\/A><\/H[123]>/ && ($h = "<B>$1:</B> ") && next;
		if (/^<DL>/) {
			$count = 0; $exit = 1;
			print "<DL>\n";
			while (<INPUT>) {
				/^$/ && next;
				/^<\/DL>/ && ($exit = 0, last);
				if ($count++ > 0) {
					s/^<DT>/($sig)\n<\/DL>\n\n<DL>\n<DT>$h/;
				}
				else { s/^<DT>/<DT>$h/; }
				print;
			}
			if ($count > 0) { print "($sig)\n</DL>\n\n"; }
			($exit == 1) && return;
		}
		elsif (/^<[UO]L>/) {
			$count = 0; $exit = 1;
			while (<INPUT>) {
				/^$/ && next;
				/^<\/[OU]L>/ && ($exit = 0, last);
				if ($count++ > 0) { s/^<LI>/($sig)<P>\n\n$h/i; }
				else { s/^<LI>/\n\n$h/i; }
				print;
			}
			if ($count > 0) { print "($sig)\n<P>\n\n"; }
			($exit == 1) && return;
		}
		elsif (/^<menu>/i) {
			$count = 0; $exit = 1;
			while (<INPUT>) {
				/^$/ && next;
				/^<\/menu>/i && ($exit = 0, last);
				if ($count++ > 0) { s/^<LI>/($sig)<P>\n\n$h/i; }
				else { s/^<LI>/\n\n$h/i; }
				print;
			}
			if ($count > 0) { print "($sig)\n<P>\n\n"; }
			($exit == 1) && return;
		}
	}
}

# Not presently used:
sub head {
	local($h1,$h2);
	/^<H1>(.*)<\/H1>/ && ($h1 = $1);
	/^<H2>(.*)<\/H2>/ && ($h2 = $1);
	if ($h1 =~ /./) {
		if ($h2 =~ /./) { $h = "<B>$h1 -- $h2:</B> "; }
		else { $h = "<B>$h1:</B> "; }
	}
	else { $h = "<B>$h2:</B> "; }
}

# NCSA What's New:
# http://www.ncsa.uiuc.edu/SDG/Software/Mosaic/Docs/whats-new.html
sub whatsnew {
	while (<INPUT>) {
		next unless /^<DL>/;
		$first = $new = 1;
		while (<INPUT>) {
			/<\/DL>\s*$/ && last;
			s/\s*<P>\s*\n$/\n/;
			s/^<DT>\s*(.*)\n//
				&& (($date = "<B>$1:</B> ") =~ s/<\/?[IiPp]>//g);
			s/^\s+//;
			s/^<DD>\s*//;
			if (/^$/) {
				$new = 1;
				next;
			}
			elsif ($new == 1) {
				s/^/$date/;
				s/^/($sig)<P>\n\n/ unless ($first);
				$new = $first = 0;
			}
			print;
		}
		print "($sig)<P>\n\n";
		return;
	}
}

# NCSA Starting Points:
# http://www.ncsa.uiuc.edu/SDG/Software/Mosaic/StartingPoints/NetworkStartingPoints.html
sub starting {
	while (<INPUT>) {
		next unless /^<UL>/;
		while (<INPUT>) {
			/^<\/UL>/ && last;
			s/ NAME=\d+//;
			if ($first++ == 0) {
				s/<LI>\s*//;
				print;
				next;
			}
			s/^\s+//;
			/^<UL>\s*$/ && next;
			/^<\/UL>\s*$/ && next;
			s/^<LI>\s*/($sig)<P>\n\n/;
			print;
		}
		print "($sig)<P>\n\n";
		return;
	}
}

# Stanford's Internet Services List:
# http://slacvx.slac.stanford.edu:80/misc/internet-services.html
# revised: 16/11/93
sub isl {
	local($h);
	while (<INPUT>) {
		next unless /^<H3>/;
		s/<\/?H3>//g && ($h = $_);
		print "<B>$h:</B>\n";
		$first = 1;
		while (<INPUT>) {
			next if /<\/H3>/;
			next if /<UL>/;
			if (s/<LI>//) {
				print "($sig)<P>\n\n<B>$h:</B>\n"
					unless $first;
				$first = 0;
			}
			last if (/<\/UL>/);
			print;
		}
		print "($sig)<P>\n\n";
	}
}

# Indiana Internet Hypertext List:
# NB: IHL is based on ISL and is partially redundant!
# http://cs.indiana.edu/internet/internet.html
sub ihl {
	while (<INPUT>) {
		if (s/^<LI>\s*//) {
			s/ NAME=\d+//;
			print "$_($sig)<P>\n\n";
		}
	}
}

# John December's 
# Information Sources: the Internet and Computer-Mediated Communication
# ftp://ftp.rpi.edu/pub/communications/internet-cmc.html
sub cmc {
	$/ = "<LI>";
	while (<INPUT>) {
		next unless /name=/;
		while (<INPUT>) {
			s/<[aA] name=[\w\W]*//;
			s/<[hH][\w\W]*//;
			s/<LI>//g;
			s/<\/?[OUD][LDT]>//g;
			s/\n+/\n/g;
			next unless (/<[aA]/);
			print "$_($sig)<P>\n";
		}
		return;
	}
}

# Marcus Speh's C++ documents
# http://info.desy.de/general/users.html
sub msp {
	local($title);
	$/ = "";
	while (<INPUT>) {
		if (/<TITLE>(.*)<\/TITLE>/) {
			$title = $1;
			next;
		}
		if (/^<DT>/) {
			s/<\/DL>[\w\W]*$//;
			s/___[\w\W]*$//;
			s/<DT>/$&<B>$title:<\/B> /;
			s/\n*$//;
			print "<DL>\n$_\n($sig)\n</DL>\n\n";
		}
	}
}

# Multimedia Information Sources
# http://cui_www.unige.ch/OSG/MultimediaInfo/index.html
sub mis {
	local($dt);
	local($first) = 1;
	while (<INPUT>) {
		next unless /<HR>/;
		while (<INPUT>) {
			if (/<DT>/) { $dt = $_; next; }
			next if /<HR>/;
			next if /^$/;
			last if /^<\/DL>/;
			if (/^<DD>/) {
				print "($sig)\n</DL>\n\n" unless $first;
				print "<DL>\n$dt";
				$first = 0;
			}
			print;
		}
		print "($sig)\n</DL>\n\n";
		return;
	}
}

# Aliweb
# http://web.nexor.co.uk/aliweb/doc/aliweb.html
sub ali {
	while (<INPUT>) {
		next unless /^<DL>/;
		$first = 1;
		while (<INPUT>) {
			/<\/DL>\s*$/ && last;
			if (/^<A/) {
				print "($sig)\n</DL>\n\n" unless ($first);
				print "<DL>\n";
				$first = 0;
			}
			print;
		}
		print "($sig)\n</DL>\n\n";
		return;
	}
}

