#
# parscan.pl	--- query a file of HTML paragraphs or a refer database
#
# Oscar Nierstrasz oscar@cui.unige.ch
#
# Simple package to query a file of HTML paragraphs.  Paragraphs can be
# arbitrary, standalone blocks of HTML text, separated by blank lines.
# To query a file, use the URL:
#
#	http://<site>/parscan/<file>?<query>
#
# where http://<site>/<file> is the file's real URL.  If <base> is the name
# without the .html extension, <base> can be used instead of <file>.  If
# a header file <base>.hdr exists, parscan will print that instead of the
# default header.  In addition, if <base>.query exists, it will be used
# whenever a non-empty query is given.  (Normally <base>.hdr will be a
# cover page with introductory information, whereas <base>.query will
# only contain the title and main headline.)
#
# NB: Note that you *must* include the tag <ISINDEX> in the header of
# your file, or the search engine will not be activated.
#
# If the file contents are not standalone HTML blocks but, for example,
# list items or pre-formatted text, parscan can be instructed to bracket
# the results of the search with <PRE> and </PRE>, <DL> and </DL>,
# <OL> and </OL> or <UL> and </UL>.  The URL to use is:
#
#	http://<site>/parscan/<flag>/<file>?<query>
#
# where <flag> is one of: -pre, -dl, -ol or -ul.
# Adjacent blocks of text must still be separated by a blank line, however.
#
# The flag -url will cause parscan to search for URLs and ftp pointers
# and convert them into hypertext links.  This is probably most interesting
# in combination with the -pre flag to query plain text files.
# An example is the Free Compilers List:
# http://cui_www.unige.ch/parscan/-pre-url/OSG/Langlist/free
#
# The same package can be used to query a database of refer(1) style
# bibliography entries.  Use the URL:
#
#	http://<site>/parscan/-r/<file>?<query>
#
# See, for example, the OO Bibliography Database at CUI:
# http://cui_www.unige.ch/bibrefs
#
# The -a flag is used internally by parscan and is automatically
# generated when a bibliography entry contains an abstract (%X field).
# The URL http://<site>/parscan/-a/<file>?<label> is then automatically
# generated where <label> is the value of the %L field.
#
# Finally, the -l flag causes parscan to retrieve refer records on a
# line-by-line basis, if fields are separated by ^A instead of a "\n".
# (This is mainly interesting for the CUI library database.)
#
# For use with the Plexus 3.0.
#
# Version 3.0:
# add the following to local.conf:
# load	parscan.pl
# map	parscan		parscan.pl	&do_parscan("$rest,$query")
#
# Similarly, aliases can be added as before:
# map	bibrefs		parscan.pl	&do_parscan("-r/OSG/Bib/main.bib,$query")
#
# A version for 2.2.1 is also available by ftp at
# cui.unige.ch:/PUBLIC/oscar/scripts/parscan.pl.2.2.1

# $map{'parscan'} = '&do_parscan("$rest,$query")';

package parscan;

# Versions:
#v = 'parscan v1.0'; # 2/9/93
#v = 'parscan v1.1'; # 7/9/93 -- added .query files; merged with bibrefs.pl
#v = 'parscan v1.2'; # 8/9/93 -- added -pre, -dl etc. flags and -url
#v = 'parscan v1.3'; # 8/9/93 -- version for plexus 3.0
#v = 'parscan v1.4'; # 10/9/93 -- added &hesc for -pre mode
#v = 'parscan v1.5'; # 14/9/93 -- process "+" in queries
#v = 'parscan v1.6'; # 15/9/93 -- use "eval" to error-check queries
#v = 'parscan v1.7'; # 6/10/93 -- absmode returns 1
#v = 'parscan v1.8'; # 21/10/93 -- altered handling of <HEAD> & <BODY> ...
#v = 'parscan v1.9'; # 23/10/93 -- added unescaping of special chars in queries
#v = 'parscan v1.10'; # 26/10/93 -- ftp links now point to directories
$v = 'parscan v1.11'; # 28/10/93 -- converted "die" to &main'error

# Footer fragments appended to every generated page (see $sig in
# do_parscan): a pointer back to the OMN page plus a version stamp.
$omn = '<A HREF="http://cui_www.unige.ch/OSG/omn.html"><I>OMN</I></A><P>';
$sig = "<I>This page was generated by $v.</I>\n$omn<P>\n";

$maxcount = 250; # MAX records to retrieve (hardwired ...)

# do_parscan --- entry point mapped to /parscan/ URLs by Plexus.
# $file is the path component after /parscan/, optionally prefixed by a
# flag group such as "-r", "-a", "-pre-url"; $query is the raw (still
# URL-encoded) search string.  Prints a complete HTML result page on
# the currently selected output handle.
sub main'do_parscan {
	local($file,$query) = @_;
	local($base);

	# Strip a leading "-flags/" path component; keep it in the global
	# $flags with a trailing "-" so each flag can be tested as /-name-/.
	($file =~ s/^(-[^\/]*)\///) && ($flags = "$1-");

	# splitquery decodes %## characters and splits on "+" (space)
	$query = join(" ", &main'splitquery($query));

	if ($file eq "") { &main'error('bad_request',
				"do_parscan: missing filename"); }

	# Resolve the file: as given, then with ".html" appended, then as
	# a directory index (so "<base>" works as well as "<base>.html").
	unless (-e $file) {
		$file = "$file.html";
		-e $file ||
		&main'error('not_found', "do_parscan: Can't find $file\n");
	}
	if (-d $file) { $file = "$file/index.html"; }
	unless (-f $file) {
		$file = "$file.html" unless ($file =~ /\.html$/);
		-f $file ||
		&main'error('not_found', "do_parscan: Can't find $file\n");
	}

	&main'MIME_header('ok', 'text/html');

	# look for a header file: <base>.query is preferred when a query
	# was given, otherwise <base>.hdr (see comments at top of file).
	($base = $file) =~ s/\.html$//;
	if (($query && &main'safeopen(parscan'HDR,"$base.query"))
		|| (&main'safeopen(parscan'HDR,"$base.hdr"))) {
			$/ = undef; # gobble the whole input
			print <HDR>;
		}
	else {
		# no header file, so construct one
		print "<HEAD>\n<TITLE>Scan of $file</TITLE>\n";
		unless ($flags =~ /-a-/) { print "<ISINDEX>\n"; }
		print "</HEAD>\n<BODY>\n<H1>Scan of $file</H1>\n";
	}

	# No query and not an internal abstract lookup: print usage hint.
	unless ($query || $flags =~ /-a-/) {
		print <<EOM;
Provide a
<A HREF=http://www.cis.ohio-state.edu:85/info/perl.info,Regular%20Expressions>
Perl regular expression</A> as a search pattern.
<P>
EOM
	}

	if ($query) {
		print "<B>Result of search for \"$query\":</B><P>\n";
		&main'safeopen(parscan'FILE,"$file.html")
			|| &main'safeopen(parscan'FILE,$file)
			|| &main'error('not_found',
				"do_parscan: \"$file\" $!");

		# -l records are single ^A-separated lines; otherwise read
		# one paragraph (blank-line separated block) at a time.
		if ($flags =~ /-l-/) { $/ = "\n"; }
		else { $/ = ""; }

		# Select the scan mode: refer records (-r), abstract
		# lookup (-a), or plain HTML paragraphs (default).
		if ($flags =~ /-r-/) { $mode = "&refmode"; }
		elsif ($flags =~ /-a-/) { $mode = "&absmode"; }
		else { $mode = "&parmode"; }

		# String eval so a malformed user regexp produces an error
		# message in the page instead of killing the server.
		eval "$mode(\$query)" ||
		print "$post\n<I>Garbled search pattern:</I> $@\n<P>\n",
			"Be sure to use a valid <A HREF=\"",
			$main'plexus{'perlexp'},
			"\">Perl regular expression</A>.<P>\n";
		$@ = undef;

		# $count is a global set by the mode subroutine above.
		if ($count == $maxcount) {
			print "<B>Too many matching records (> $maxcount)!\n",
			"Try a more restrictive pattern.</B><P>";
		}
		elsif ($count == 0) {
			print "<B>No matching entries found.</B><P>\n";
		}
		print "$sig\n";
		print "</BODY>";
		close(FILE);
	}
}

# style --- choose the bracketing markup for the result list from the
# request flags (-pre, -dl, -ol, -ul), setting the globals $pre/$post.
# First matching flag wins; with no style flag, $pre and $post are left
# untouched (refmode relies on this to keep its <OL> default).
sub style {
	local(%tag) = ('pre', 'PRE', 'dl', 'DL', 'ol', 'OL', 'ul', 'UL');
	foreach $s ('pre', 'dl', 'ol', 'ul') {
		next unless $flags =~ /-$s-/;
		$pre = "<$tag{$s}>\n";
		$post = "</$tag{$s}>\n";
		last;
	}
}

# parmode --- scan plain HTML paragraphs for the query pattern.
# Prints every record read from FILE that matches $query
# (case-insensitively), wrapped in the style chosen by &style.
# With -pre the record is HTML-escaped first; with -url any
# recognizable URLs/ftp pointers are turned into hypertext links.
# Sets the global $count for do_parscan.
sub parmode {
	local($query) = @_;
	&style;
	$count = 0;
	print $pre;
	while (<FILE>) {
		next unless /$query/oi;
		&hesc if $flags =~ /-pre-/;
		&href if $flags =~ /-url-/;
		print;
		# Stop at the hardwired record limit.
		last if ++$count == $maxcount;
	}
	print $post;
}

# refmode --- list matching refer(1) bibliography records.
# Reads records from FILE (one per paragraph, or one per line with -l)
# and prints each match as an <LI> item, formatted by &getref, followed
# by a link to the abstract (a /parscan/-a/ URL keyed on the %L label)
# and/or to the record's ftp directory.  Sets the global $count for
# do_parscan's post-processing.
sub refmode {
	local($query) = @_;
	&html_init;
	$count = 0;
	$pre = "<OL>\n"; $post = "</OL>\n"; # default style
	&style;
	print $pre;
	while (<FILE>) {
		/$query/oi || next;
		# -l records keep all fields on one line separated by ^A
		# (control-A, octal 001); expand them to newlines so getref
		# can parse the fields.  (The control character had been
		# lost from the original "tr//\n/", which had an empty
		# search list and so translated nothing.)
		tr/\001/\n/;
		&html_accent;
		&getref;
		print "\n<LI>$ref\n";
		if ($abstract ne "") {
			$what = "abstract";
			if ($ftp ne "") { $what .= "+ftp"; }
			print "<A HREF=\"/parscan/-a/$file?$lbl\"><I>$what</I></A>\n";
		}
		elsif ($ftp ne "") {
			# point to the directory rather than the file:
			$url =~ s|[^/]*$||;
			$ftp =~ s|([^/]*)$||;
			$ftpfile = $1;
			print "<I>ftp: <A HREF=\"$url\">$ftp</A>$ftpfile</I>\n";
		}
		last if (++$count == $maxcount);
	}
	print $post;
}

# absmode --- print the full record (with abstract) for a -a lookup.
# $query is normally the %L label embedded in the "abstract" link that
# refmode generates.  Returns 1 so the string eval in do_parscan does
# not mistake a successful run for a garbled pattern (see v1.7).
sub absmode {
	local($query) = @_;
	&html_init;
	# Reset the global match counter, as parmode and refmode do.
	# The script is load-ed persistently by Plexus, so without this a
	# stale $count from a previous request would corrupt do_parscan's
	# "no matches" / "too many matches" checks and could trigger the
	# record-limit "last" prematurely.
	$count = 0;
	print "_____________________________________",
		"_____________________________________<P>\n";
	while (<FILE>) {
		/$query/oi || next;
		&html_accent;
		&getref;
		print "<H2>$title</H2>\n";
		print "$ref<P>\n";
		if ($abstract ne "") {
			print "<H3>Abstract</H3>\n$abstract<P>\n";
		}
		if ($ftp ne "") {
			# point to the directory rather than the file:
			$url =~ s|[^/]*$||;
			$ftp =~ s|([^/]*)$||;
			$ftpfile = $1;
			print "<B>ftp:</B> ",
			"<A HREF=\"$url\"><I>$ftp</I></A>$ftpfile<P>\n";
		}
		print "_____________________________________",
			"_____________________________________<P>\n";
		last if (++$count == $maxcount);
	}
	1;
}

# ========================================================================

# The remaining procedures are mostly identical to those used in "bib",
# and are dedicated to handling refer bibliographies
# (should be separate libraries)

# build up a reference (set global vars):
# getref --- parse one refer(1) record held in $_ into globals.
# Consumes the %X-field lines out of $_ as it goes and sets:
#   $ref      -- the formatted HTML citation (ends with ".\n")
#   $lbl      -- %L label, $keys -- %K keywords
#   $auth/$ed -- authors (%A/%Q) and editors (%E)
#   $title    -- %T title, $abstract -- %X text
#   $ftp/$url -- "%% ftp:" pointer and its file:// form
#   $junk     -- leftover %% comment lines
# Caller must have set $I/$R/$LQ/$RQ via &html_init first.
# NB: field order in the substitutions below is significant -- each
# s/// removes the field it formats, so do not reorder.
sub getref {
	$ref = $lbl = $keys = $auth = $ed = $title = "";
	$abstract = $ftp = $url = $junk = "";
	# study; # strangely, this slows us down!
	s/\n\s+/\n/g;	# remove leading white space
	s/%L (.*)\n// && ($lbl = $1);			# label
	s/%K (.*)\n// && ($keys = $1);			# keywords
	# if ($lbl eq "") { print STDERR "Warning -- missing label:\n$_"; }
	# Collect authors:
	# First loop joins all but the last two with commas, then the
	# two-author case gets "and", then a single remaining author.
	while (s/%[AQ] (.*)\n(%[AQ] .*\n%[AQ])/$2/) { $auth .= "$1,\n"; }
	s/%[AQ] (.*)\n%[AQ] (.*)\n// && ($auth .= "$1 and\n$2");
	s/%[AQ] (.*)\n// && ($auth = $1);
	# Collect editors:
	while (s/%E (.*)\n(%E .*\n%E)/$2/) { $ed .= "$1,\n"; }
	s/%E (.*)\n%E (.*)\n// && ($ed .= "$1 and\n$2");
	s/%E (.*)\n// && ($ed = $1);
	# Check for missing authors:
	if ($auth eq "") {
		if ($ed ne "") { $auth = "$ed (Ed.)"; $ed = ""; }
		else {
			$auth = "(Anonymous)";
			# print STDERR "Warning ($lbl): missing author\n";
		}
	}
	else { $ref = "$auth,\n"; }
	# from this point on, ref ends without newline so commas
	# can be added incrementally.
	# grab the title:
	s/%T ([^%]*)\n// && ($title = $1);
	# determine kind of publication:
	if (/%J/) {				# Journal paper
		$ref .= "$LQ$title$RQ";
		s/%J ([^%]*)\n// && ($ref .= ",\n$I$1$R");
	}
	elsif(/%B/) {				# Article in book
		$ref .= "$LQ$title$RQ";
		s/%B ([^%]*)\n// && ($ref .= ",\n$I$1$R");
	}
	elsif(/%R/) {				# Technical report
		$ref .= "$LQ$title$RQ";
		s/%R ([^%]*)\n// && ($ref .= ",\n$1");
	}
	else { $ref .= "$I$title$R"; }		 # Book
	# If more than one of J, B or R, will show up as JUNK:
	# if (/(%[JBR])/) {
		# print STDERR "Warning ($lbl): type conflict [$1]\n";
	# }
	# add remaining fields in standard ord:
	if ($ed ne "") { $ref .= ",\n$ed (Ed.)"; }
	s/%S (.*)\n// && ($ref .= ",\n$1");		# series
	s/%V (.*)\n// && ($ref .= ",\nvol. $1");	# volume
	s/%N (.*)\n// && ($ref .= ", no. $1");		# number
	s/%I ([^%]*)\n// && ($ref .= ",\n$1");		# institution
	s/%C ([^%]*)\n// && ($ref .= ",\n$1");		# city
	s/%D (.*)\n// && ($ref .= ", $1");		# date
	s/%P (.*)\n// && ($ref .= ",\npp. $1");		# page numbers
	s/%O ([^%]*)\n// && ($ref .= ",\n$1");		# other (e.g. to appear)
	# these may not necessarily be printed:
	s/%X\s*([^%]*)\n// && do { $abstract = $1; };	# abstract
	s/%% ftp: (.*)\n// && ($ftp = $1);		# should build a list?
	# "site:path" becomes "ftp://site/path" for the HREF:
	($url = $ftp) =~ s!^([^:]+):(.*)$!ftp://$1/$2!;
	while(s/%% ([^%]*)\n//) { $junk .= $1; };	# comments
	$ref =~ s/$RQ,/,$RQ/go;				# fix commas
	$ref .= ".\n";
	# If anything is left, complain:
	# if ($_ =~ /./) { print STDERR "Warning ($lbl) -- extra fields:\n$_\n"; }
}

# escape characters in plain text:
# hesc --- HTML-escape $_ in place, replacing the metacharacters
# & < > with their entities.  A single left-to-right pass, so the
# ampersands introduced by the replacements are never re-escaped.
sub hesc {
	s/([&<>])/$1 eq "&" ? "&amp;" : $1 eq "<" ? "&lt;" : "&gt;"/ge;
}

# html_init --- set the markup fragments getref uses when assembling
# citations: $I/$R bracket italics, $LQ/$RQ bracket quoted titles.
sub html_init {
	($I, $R) = ('<I>', '</I>');
	($LQ, $RQ) = ('<B>``', "''</B>");
}

# convert dead-key and -ms accents to HTML
# html_accent --- rewrite accent escapes in $_ as HTML entities.
# Handles both dead-key notation (accent before the letter, e.g. \'e)
# and troff -ms string notation (accent after the letter, e.g. e\*').
# Fixes relative to the previous version:
#  - the -ms grave pattern repeated the acute pattern \*' (so it was
#    unreachable -- the acute rule above consumed every match); -ms
#    grave is the backtick string \*`
#  - the -ms circumflex pattern had an unescaped ^, which is a
#    beginning-of-string anchor mid-pattern and so could never match
sub html_accent {
	study;
	# dead-key accents:
	s/\\AE/\&AElig;/g;
	s/\\'([AEIOUYaeiouy])/\&$1acute;/g;
	s/\\[<^]([AEIOUaeiou])/\&$1circ;/g;
	s/\\`([AEIOUaeiou])/\&$1grave;/g;
	s/\\o([Aa])/\&$1ring;/g;
	s/\\~([ANOano])/\&$1tilde;/g;
	s/\\[:"]([AEIOUYaeiouy])/\&$1uml;/g;
	s/\\,([Cc])/\&$1cedil;/g;
	s/\\\/([Oo])/\&$1slash;/g;
	s/\\ss/\&szlig;/g;
	# -ms accents:
	s/([AEIOUYaeiouy])\\\*'/\&$1acute;/g;
	s/([AEIOUaeiou])\\\*`/\&$1grave;/g;
	s/([AEIOUaeiou])\\\*\^/\&$1circ;/g;
	s/([AEIOUYaeiouy])\\\*:/\&$1uml;/g;
	s/([ANOano])\\\*~/\&$1tilde;/g;
	s/([Cc])\\\*,/\&$1cedil;/g;
}

# Try to recognize URLs and ftp file identifiers and convert them into HREFs:
# This routine is evolving.  The patterns are not perfect.
# This is really a parsing problem, and not a job for perl ...
# It is also generally impossible to distinguish ftp site names
# from newsgroup names if the ":<directory>" is missing.
# An arbitrary file name ("runtime.pl") can also be confused.
#
# href --- rewrite $_ in place, wrapping recognized URLs, newsgroup
# names, ftp "site:path" pointers and bare site names / internet
# numbers in <A HREF=...> anchors.  Rule order is significant: the
# more specific patterns run first so the looser ones below do not
# re-match inside text already converted.
sub href {
	# study; # doesn't speed things up ...
	# to avoid special cases for beginning & end of line
	# (a "#" sentinel is glued onto each end and removed at the end)
	s|^|#|; s|$|#|;
	# URLS: <service>:<rest-of-url>
	s|(news:[\w.]+)|<A HREF="$&">$&</A>|g;
	s|(http:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(file:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(ftp:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(wais:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(gopher:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(telnet:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	# s|(\w+://[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	# catch some newsgroups to avoid confusion with sites:
	# (the leading [^\w\-/.:@>] guard keeps us from re-matching text
	# inside an anchor or a longer dotted name)
	s|([^\w\-/.:@>])(alt\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bionet\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bit\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(comp\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(gnu\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(misc\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(news\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(rec\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	# FTP locations (with directory):
	# anonymous@<site>:<path>
	s|(anonymous@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# ftp@<site>:<path>
	s|(ftp@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# <site>:<path>
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# NB: don't confuse an http server with a port number for
	# an FTP location!
	# internet number version: <internet-num>:<path>
	s|([^\w\-/.:@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w\d+\-/.]+)|$1<A HREF="file://$2/$3">$2:$3</A>|g;
	# just the site name (assume two dots): <site>
	s|([^\w\-/.:@>])([a-zA-Z][\w+\-]+\.[\w.+\-]+\.[a-zA-Z]{2,})([^\w\d\-/.:!])|$1<A HREF="file://$2">$2</A>$3|g;
	# NB: can be confused with newsgroup names!
	# <site>.com has only one dot:
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.com)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;
	# just internet numbers:
	s|([^\w\-/.:@])(\d+\.\d+\.\d+\.\d+)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;
	# unfortunately inet numbers can easily be confused with
	# european telephone numbers ...
	# remove the sentinels added at the top:
	s|^#||; s|#$||;
}

1;



