#
# parscan.pl.2.2.1 --- query a file of HTML paragraphs (Version for Plexus 2.2.1)
#
# Oscar Nierstrasz oscar@cui.unige.ch
#
# Simple package to query a file of HTML paragraphs.  Paragraphs can be
# arbitrary, standalone blocks of HTML text, separated by blank lines.
# To query a file, use the URL:
#
#	http://<site>/parscan/<file>?<query>
#
# where http://<site>/<file> is the file's real URL.  If <base> is the name
# without the .html extension, <base> can be used instead of <file>.  If
# a header file <base>.hdr exists, parscan will print that instead of the
# default header.  In addition, if <base>.query exists, it will be used
# whenever a non-empty query is given.  (Normally <base>.hdr will be a
# cover page with introductory information, whereas <base>.query will
# only contain the title and main headline.)
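#
# For example (hypothetical paths), if a file is normally reached as
#
#	http://<site>/OSG/news.html
#
# then
#
#	http://<site>/parscan/OSG/news?workshop
#
# prints the paragraphs of OSG/news.html that match "workshop", preceded
# by OSG/news.query if it exists (or OSG/news.hdr otherwise).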
#
# If the file contents are not standalone HTML blocks but, for example,
# list items or pre-formatted text, parscan can be instructed to bracket
# the results of the search with <PRE> and </PRE>, <DL> and </DL>,
# <OL> and </OL>, or <UL> and </UL>.  The URL to use is:
#
#	http://<site>/parscan/<flag>/<file>?<query>
#
# where <flag> is one of: -pre, -dl, -ol or -ul.
# Adjacent blocks of text must still be separated by a blank line, however.
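#
# For example (hypothetical file), if items.html consists of <LI> items
# separated by blank lines, then
#
#	http://<site>/parscan/-ul/items?printer
#
# wraps the matching items in <UL> and </UL>.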
#
# The flag -url will cause parscan to search for URLs and ftp pointers
# and convert them into hypertext links.  This is probably most interesting
# in combination with the -pre flag to query plain text files.
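#
# Since the flags are parsed from a single leading path segment, they can
# simply be concatenated, e.g. (hypothetical file):
#
#	http://<site>/parscan/-pre-url/README.txt?archive
#
# lists the matching blocks of README.txt as pre-formatted text with any
# URLs or ftp pointers turned into links.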
#
# The same package can be used to query a database of refer(1) style
# bibliography entries.  Use the URL:
#
#	http://<site>/parscan/-r/<file>?<query>
#
# The -a flag is used internally by parscan and is automatically
# generated when a bibliography entry contains an abstract (%X field).
# The URL http://<site>/parscan/-a/<file>?<label> is then automatically
# generated where <label> is the value of the %L field.
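#
# For example (hypothetical entry), a record in OSG/Bib/main.bib containing
#
#	%L auth93a
#	%X This note illustrates ...
#
# will be listed with an "abstract" link of the form:
#
#	http://<site>/parscan/-a/OSG/Bib/main.bib?auth93a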
#
# Finally, the -l flag causes parscan to retrieve refer records on a
# line-by-line basis, for files in which the fields of each record are
# separated by ^A characters instead of newlines.
# (This is mainly interesting for the CUI library database.)
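#
# In that format a whole record occupies a single line, e.g. (hypothetical):
#
#	%A A. Author^A%T An Example Title^A%D 1993^A%L auth93a
#
# and is queried with the -r and -l flags combined in the leading segment:
#
#	http://<site>/parscan/-r-l/<file>?<query>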
#
# For use with Plexus 2.2.1:
#
# Add the following line to plexus.conf:
# $map{'parscan'} =
#		'require "$http_sdir/parscan.pl"; &do_parscan("$rest?$query")';
# Add to @inclist: "$http_sdir/parscan.pl",
#
# You may also want to add some aliases in $map to hide the location of the
# database to be searched, for example:
#
# $oobib = "-r/OSG/Bib/main.bib";
# $map{'bibrefs'} = 'require "$http_sdir/parscan.pl"; &do_parscan("$oobib?$query")';
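#
# The bibliography can then be searched directly with a URL like:
#
#	http://<site>/bibrefs?<query>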
#
# NB: A version for Plexus 3.0 is also available by ftp from
# cui.unige.ch:/PUBLIC/oscar/scripts/parscan.pl

$map{'parscan'} = '&do_parscan("$rest?$query")';

package parscan;

# Versions:
#$v = 'parscan v1.0'; # 2/9/93
#$v = 'parscan v1.1'; # 7/9/93 -- added .query files; merged with bibrefs.pl
$v = 'parscan v1.2'; # 8/9/93 -- added -pre, -dl etc. flags and -url

$omn = '<A HREF="http://cui_www.unige.ch/OSG/omn.html"><I>OMN</I></A><P>';
$sig = "<I>This page was generated by $v.</I>\n$omn<P>\n";

$maxcount = 250; # MAX records to retrieve (hardwired ...)

sub main'do_parscan {
	local($file) = @_;
	($file =~ s/\?(.*)//) && ($query = $1);
	($file =~ s/^(-[^\/]*)\///) && ($flags = "$1-");
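	# e.g. "-pre-url/OSG/notes?lambda" yields $flags = "-pre-url-",
	# $file = "OSG/notes" and $query = "lambda"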

	if ($file eq "") { die "do_parscan: missing filename\n"; }

	unless (-e $file) {
		$file = "$file.html";
		-e $file || die "do_parscan: Can't find $file\n";
	}
	if (-d $file) { $file = "$file/index.html"; }
	unless (-f $file) {
		$file = "$file.html";
		-f $file || die "do_parscan: Can't find $file\n";
	}

	&main'MIME_header('ok', 'text/html');

	unless ($flags =~ /-a-/) { print "<HEAD>\n<ISINDEX>\n"; }

	# look for a header file:
	($base = $file) =~ s/\.html$//;
	if (($query && open(HDR,"$base.query"))
		|| (open(HDR,"$base.hdr"))) {
			$/ = undef; # gobble the whole input
			print <HDR>;
		}
	else {
		# no header file, so construct one
		print "<TITLE>Scan of $file</TITLE>\n",
		"</HEAD>\n<BODY>\n<H1>Scan of $file</H1>\n";
	}

	unless ($query || $flags =~ /-a-/) {
		print <<EOM;
___________________________________________________________________<P>
Provide a
<A HREF=http://www.cis.ohio-state.edu:85/info/perl.info,Regular%20Expressions>
Perl regular expression</A> as a search pattern.
<P>
EOM
	}

	if ($query) {
		print "<B>Result of search for \"$query\":</B><P>\n";
		open(FILE,"$file.html")
			|| open(FILE,$file)
			|| die "do_parscan: \"$file\" $!";

		if ($flags =~ /-l-/) { $/ = "\n"; }
		else { $/ = ""; }

		if ($flags =~ /-pre-/) { $pre = "<PRE>\n"; $post = "</PRE>\n"; }
		elsif ($flags =~ /-dl-/) { $pre = "<DL>\n"; $post = "</DL>\n"; }
		elsif ($flags =~ /-ol-/) { $pre = "<OL>\n"; $post = "</OL>\n"; }
		elsif ($flags =~ /-ul-/) { $pre = "<UL>\n"; $post = "</UL>\n"; }

		if ($flags =~ /-r-/) { &refmode($query); }
		elsif ($flags =~ /-a-/) { &absmode($query); }
		else { &parmode($query); }

		if ($count == $maxcount) {
			print "<B>Too many matching records (> $maxcount)!\n",
			"Try a more restrictive pattern.</B><P>";
		}
		elsif ($count == 0) {
			print "<B>No matching entries found.</B><P>\n";
		}
		print "$sig\n";
		close(FILE);
	}
}

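# parmode: print every matching paragraph (or line), bracketed by $pre/$post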
sub parmode {
	local($query) = @_;
	print $pre;
	while (<FILE>) {
		/$query/oi || next;
		if ($flags =~ /-url-/) { &href; }
		print;
		last if (++$count == $maxcount);
	}
	print $post;
}

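# refmode: list the matching refer entries as an ordered list, with links
# to abstracts and ftp'able files where available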
sub refmode {
	local($query) = @_;
	&html_init;
	$count = 0;
	print "<OL>\n";
	while (<FILE>) {
		/$query/oi || next;
		tr//\n/; # expand ^A to CR
		&html_accent;
		&getref;
		print "\n<LI>$ref\n";
		if ($abstract ne "") {
			$what = "abstract";
			if ($ftp ne "") { $what .= "+ftp"; }
			print "<A HREF=\"/parscan/-a/$file?$lbl\"><I>$what</I></A>\n";
		}
		elsif ($ftp ne "") {
			print "<I>ftp: <A HREF=\"$url\">$ftp</A></I>\n";
		}
		last if (++$count == $maxcount);
	}
	print "</OL>\n";
}

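# absmode: print the full reference, abstract and ftp pointer of each
# matching record (normally selected by its %L label via a -a URL)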
sub absmode {
	local($query) = @_;
	&html_init;
	print "_____________________________________",
		"_____________________________________<P>\n";
	while (<FILE>) {
		/$query/oi || next;
		&html_accent;
		&getref;
		print "<H2>$title</H2>\n";
		print "$ref<P>\n";
		if ($abstract ne "") {
			print "<H3>Abstract</H3>\n$abstract<P>\n";
		}
		if ($ftp ne "") {
			print "<B>ftp:</B> <A HREF=\"$url\"><I>$ftp</I></A><P>\n";
		}
		print "_____________________________________",
			"_____________________________________<P>\n";
		last if (++$count == $maxcount);
	}
}

# ========================================================================

# The remaining procedures are mostly identical to those used in "bib",
# and are dedicated to handling refer bibliographies
# (should be separate libraries)

# build up a reference:
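# For example, a (hypothetical) record such as
#
#	%A A. Author
#	%A B. Writer
#	%T An Example Title
#	%J Journal of Examples
#	%V 7
#	%D 1993
#	%P 1-10
#	%L auth93a
#
# ends up in $ref roughly as (modulo the <B>/<I> markup set by html_init):
#
#	A. Author and B. Writer,
#	``An Example Title,'' Journal of Examples, vol. 7, 1993, pp. 1-10.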
sub getref {
	$ref = $lbl = $keys = $auth = $ed = $title = "";
	$abstract = $ftp = $url = $junk = "";

	# study; # strangely, this slows us down!

	s/\n\s+/\n/g;	# remove leading white space
	s/%L (.*)\n// && ($lbl = $1);			# label
	s/%K (.*)\n// && ($keys = $1);			# keywords
	# if ($lbl eq "") { print STDERR "Warning -- missing label:\n$_"; }

	# Collect authors:
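	# While three or more author lines remain, peel off the first one and
	# append it to $auth with a comma; the last two are then joined with
	# "and", and a lone author is taken as-is.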
	while (s/%[AQ] (.*)\n(%[AQ] .*\n%[AQ])/$2/) { $auth .= "$1,\n"; }
	s/%[AQ] (.*)\n%[AQ] (.*)\n// && ($auth .= "$1 and\n$2");
	s/%[AQ] (.*)\n// && ($auth = $1);

	# Collect editors:
	while (s/%E (.*)\n(%E .*\n%E)/$2/) { $ed .= "$1,\n"; }
	s/%E (.*)\n%E (.*)\n// && ($ed .= "$1 and\n$2");
	s/%E (.*)\n// && ($ed = $1);

	# Check for missing authors:
	if ($auth eq "") {
		if ($ed ne "") { $auth = "$ed (Ed.)"; $ed = ""; }
		else {
			$auth = "(Anonymous)";
			# print STDERR "Warning ($lbl): missing author\n";
		}
	}
	$ref = "$auth,\n";

	# from this point on, ref ends without newline so commas
	# can be added incrementally.

	# grab the title:
	s/%T ([^%]*)\n// && ($title = $1);
	# determine kind of publication:
	if (/%J/) {				# Journal paper
		$ref .= "$LQ$title$RQ";
		s/%J ([^%]*)\n// && ($ref .= ",\n$I$1$R");
	}
	elsif(/%B/) {				# Article in book
		$ref .= "$LQ$title$RQ";
		s/%B ([^%]*)\n// && ($ref .= ",\n$I$1$R");
	}
	elsif(/%R/) {				# Technical report
		$ref .= "$LQ$title$RQ";
		s/%R ([^%]*)\n// && ($ref .= ",\n$1");
	}
	else { $ref .= "$I$title$R"; }		 # Book
	# If more than one of J, B or R, will show up as JUNK:
	# if (/(%[JBR])/) {
		# print STDERR "Warning ($lbl): type conflict [$1]\n";
	# }

	# add remaining fields in standard order:
	if ($ed ne "") { $ref .= ",\n$ed (Ed.)"; }
	s/%S (.*)\n// && ($ref .= ",\n$1");		# series
	s/%V (.*)\n// && ($ref .= ",\nvol. $1");	# volume
	s/%N (.*)\n// && ($ref .= ", no. $1");		# number
	s/%I ([^%]*)\n// && ($ref .= ",\n$1");		# institution
	s/%C ([^%]*)\n// && ($ref .= ",\n$1");		# city
	s/%D (.*)\n// && ($ref .= ", $1");		# date
	s/%P (.*)\n// && ($ref .= ",\npp. $1");		# page numbers
	s/%O ([^%]*)\n// && ($ref .= ",\n$1");		# other (e.g. to appear)

	# these may not necessarily be printed:
	s/%X ([^%]*)\n// && do { $abstract = $1; };	# abstract
	s/%% ftp: (.*)\n// && ($ftp = $1);		# should build a list?
	($url = $ftp) =~ s!^([^:]+):(.*)$!ftp://$1/$2!;

	while(s/%% ([^%]*)\n//) { $junk .= $1; };	# comments

	$ref =~ s/$RQ,/,$RQ/go;				# fix commas
	$ref .= ".\n";

	# If anything is left, complain:
	# if ($_ =~ /./) { print STDERR "Warning ($lbl) -- extra fields:\n$_\n"; }
}

sub html_init {
	$I = "<I>"; $R = "</I>";
	$LQ = "<B>``"; $RQ = "''</B>";
}

# convert dead-key and -ms accents to HTML
sub html_accent {
	study;

	# dead-key accents:
	s/\\AE/\&AElig;/g;
	s/\\'([AEIOUYaeiouy])/\&$1acute;/g;
	s/\\[<^]([AEIOUaeiou])/\&$1circ;/g;
	s/\\`([AEIOUaeiou])/\&$1grave;/g;
	s/\\o([Aa])/\&$1ring;/g;
	s/\\~([ANOano])/\&$1tilde;/g;
	s/\\[:"]([AEIOUYaeiouy])/\&$1uml;/g;
	s/\\,([Cc])/\&$1cedil;/g;
	s/\\\/([Oo])/\&$1slash;/g;
	s/\\ss/\&szlig;/g;

	# -ms accents:
	s/([AEIOUYaeiouy])\\\*'/\&$1acute;/g;
	s/([AEIOUaeiou])\\\*`/\&$1grave;/g;
	s/([AEIOUaeiou])\\\*^/\&$1circ;/g;
	s/([AEIOUYaeiouy])\\\*:/\&$1uml;/g;
	s/([ANOano])\\\*~/\&$1tilde;/g;
	s/([Cc])\\\*,/\&$1cedil;/g;
}

# Try to recognize URLs and ftp file identifiers and convert them into HREFs:
# This routine is evolving.  The patterns are not perfect.
# This is really a parsing problem, and not a job for perl ...
# It is also generally impossible to distinguish ftp site names
# from newsgroup names if the ":<directory>" is missing.
# An arbitrary file name ("runtime.pl") can also be confused.
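# For example, "see ftp.foo.edu:pub/paper.ps" (hypothetical host) becomes
#	see <A HREF="file://ftp.foo.edu/pub/paper.ps">ftp.foo.edu:pub/paper.ps</A>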
sub href {
	# study; # doesn't speed things up ...

	# to avoid special cases for beginning & end of line
	s|^|>>>|; s|$|<<<|;

	# URLS:
	s|(news:[\w.]+)|<A HREF="$&">$&</A>|g;
	s|(http:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(file:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(ftp:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(wais:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(gopher:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(telnet:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	# s|(\w+://[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;

	# catch some newsgroups to avoid confusion with sites:
	s|([^\w\-/.:@>])(alt\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bionet\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bit\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(comp\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(gnu\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(misc\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(news\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(rec\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;

	# FTP locations (with directory):
	s|(anonymous@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	s|(ftp@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# NB: don't mistake an http server address with a port number
	# for an FTP location!
	# internet number version:
	s|([^\w\-/.:@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w+\-/.]+)|$1<A HREF="file://$2/$3">$2:$3</A>|g;

	# just the site name (assume two dots):
	s|([^\w\-/.:@>])([a-zA-Z][\w+\-]+\.[\w.+\-]+\.[a-zA-Z]{2,})([^\w\-/.:!])|$1<A HREF="file://$2">$2</A>$3|g;
	# NB: can be confused with newsgroup names!
	# <site>.com has only one dot:
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.com)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;

	# just internet numbers:
	s|([^\w\-/.:@])(\d+\.\d+\.\d+\.\d+)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;
	# unfortunately inet numbers can easily be confused with
	# european telephone numbers ...

	s|^>>>||; s|<<<$||;
}

1;

