is the value of the %L field.
#
# Finally, the -l flag causes parscan to retrieve refer records on a
# line-by-line basis, if fields are separated by ^A instead of a "\n".
# (This is mainly interesting for the CUI library database.)
#
# For use with the Plexus 3.0.
#
# Version 3.0:
# add the following to local.conf:
# load parscan.pl
# map parscan parscan.pl &do_parscan("$rest,$query")
#
# Similarly, aliases can be added as before:
# map bibrefs parscan.pl &do_parscan("-r/OSG/Bib/main.bib,$query")
#
# A version for 2.2.1 is also available by ftp at
# cui.unige.ch:/PUBLIC/oscar/scripts/parscan.pl.2.2.1
# $map{'parscan'} = '&do_parscan("$rest,$query")';
# Everything below runs in package parscan; only &main'do_parscan is
# installed into main (see the "map parscan" line in the header).
package parscan;
# Versions:
#v = 'parscan v1.0'; # 2/9/93
#v = 'parscan v1.1'; # 7/9/93 -- added .query files; merged with bibrefs.pl
#v = 'parscan v1.2'; # 8/9/93 -- added -pre, -dl etc. flags and -url
#v = 'parscan v1.3'; # 8/9/93 -- version for plexus 3.0
#v = 'parscan v1.4'; # 10/9/93 -- added &hesc for -pre mode
#v = 'parscan v1.5'; # 14/9/93 -- process "+" in queries
#v = 'parscan v1.6'; # 15/9/93 -- use "eval" to error-check queries
#v = 'parscan v1.7'; # 6/10/93 -- absmode returns 1
#v = 'parscan v1.8'; # 21/10/93 -- altered handling of & ...
#v = 'parscan v1.9'; # 23/10/93 -- added unescaping of special chars in queries
#v = 'parscan v1.10'; # 26/10/93 -- ftp links now point to directories
$v = 'parscan v1.11'; # 28/10/93 -- converted "die" to &main'error
$omn = 'OMN '; # site tag folded into the page signature below
# $sig: signature emitted at the bottom of every generated page.
# NOTE(review): this literal looks as if HTML tags were stripped from it
# by a conversion step -- compare against an upstream copy.
$sig = "This page was generated by $v. \n$omn
\n";
$maxcount = 250; # MAX records to retrieve (hardwired ...)
# Server entry point (mapped in local.conf): scan a refer/HTML database
# for records matching a query and write an HTML result page to stdout.
#   $file  - database path, optionally prefixed with "-flags/" where the
#            flags pick the mode (-r refer, -a abstracts, -l line records),
#            the list style (-pre, -dl, -ol, -ul) and -url hyperlinking
#   $query - URL-encoded search pattern (a Perl regular expression)
# Uses/sets the package globals $flags, $mode, $count, $post, $sig, $maxcount.
# NOTE(review): several literal strings below look as if their HTML tags
# were stripped by a conversion step; they are left untouched here.
sub main'do_parscan {
local($file,$query) = @_;
local($base);
# peel a leading "-flags/" component off the path; the appended "-"
# lets the later tests match /-x-/ unambiguously
($file =~ s/^(-[^\/]*)\///) && ($flags = "$1-");
# splitquery decodes %## characters and splits on "+" (space)
$query = join(" ", &main'splitquery($query));
if ($file eq "") { &main'error('bad_request',
"do_parscan: missing filename"); }
# resolve the file: as given, with .html appended, or as a directory index
unless (-e $file) {
$file = "$file.html";
-e $file ||
&main'error('not_found', "do_parscan: Can't find $file\n");
}
if (-d $file) { $file = "$file/index.html"; }
unless (-f $file) {
$file = "$file.html" unless ($file =~ /\.html$/);
-f $file ||
&main'error('not_found', "do_parscan: Can't find $file\n");
}
&main'MIME_header('ok', 'text/html');
# look for a header file: "<base>.query" is preferred when a query was
# given, otherwise "<base>.hdr"; if found, emit it verbatim as the header
($base = $file) =~ s/\.html$//;
if (($query && &main'safeopen(parscan'HDR,"$base.query"))
|| (&main'safeopen(parscan'HDR,"$base.hdr"))) {
$/ = undef; # gobble the whole input
print <HDR>; # FIX: was a bare "print ;" -- slurp and emit the header file
close(HDR);
}
else {
# no header file, so construct one
print "\nScan of $file \n";
unless ($flags =~ /-a-/) { print " \n"; }
print "\n\nScan of $file \n";
}
unless ($query || $flags =~ /-a-/) {
# FIX: restored the here-document introducer (was a dangling "print <")
print <<EOM;
Perl regular expression as a search pattern.
EOM
}
if ($query) {
print "Result of search for \"$query\":
\n";
&main'safeopen(parscan'FILE,"$file.html")
|| &main'safeopen(parscan'FILE,$file)
|| &main'error('not_found',
"do_parscan: \"$file\" $!");
# -l: one record per line (^A-separated fields); default: paragraph mode
if ($flags =~ /-l-/) { $/ = "\n"; }
else { $/ = ""; }
# choose the record formatter for the string eval below
if ($flags =~ /-r-/) { $mode = "&refmode"; }
elsif ($flags =~ /-a-/) { $mode = "&absmode"; }
else { $mode = "&parmode"; }
# the string eval traps a malformed user regexp instead of dying
eval "$mode(\$query)" ||
print "$post\nGarbled search pattern: $@\n
\n",
"Be sure to use a valid Perl regular expression .
\n";
$@ = undef;
if ($count == $maxcount) {
print "Too many matching records (> $maxcount)!\n",
"Try a more restrictive pattern.
";
}
elsif ($count == 0) {
print "No matching entries found.
\n";
}
print "$sig\n";
print "";
close(FILE);
}
}
# Set the global $pre/$post wrappers according to the -pre/-dl/-ol/-ul
# flags; they bracket the list of matching records.
# NOTE(review): the string values below look as if their HTML tags
# (<PRE>, <DL>, <OL>, <UL> and their closers) were stripped by a
# conversion step -- compare with an upstream copy before trusting them.
# NOTE(review): the -ul branch sets $pre but never $post, so a stale
# $post from an earlier call can leak through.
sub style {
if ($flags =~ /-pre-/) { $pre = "
\n"; $post = " \n"; }
elsif ($flags =~ /-dl-/) { $pre = "\n"; $post = " \n"; }
elsif ($flags =~ /-ol-/) { $pre = "\n"; $post = " \n"; }
elsif ($flags =~ /-ul-/) { $pre = "\n"; }
}
# Paragraph mode (default): print every record from the FILE handle
# (opened by do_parscan; $/ already set there) that matches $query,
# wrapped in the $pre/$post style chosen by &style.  Sets the global
# $count, capped at $maxcount.
sub parmode {
local($query) = @_;
&style;
print $pre;
$count = 0;
while (<FILE>) { # FIX: restored the <FILE> read (was "while ()")
/$query/oi || next;
if ($flags =~ /-pre-/) { &hesc; } # escape &, <, > for preformatted output
if ($flags =~ /-url-/) { &href; } # hyperlink URLs and ftp locations
print;
last if (++$count == $maxcount); # hardwired retrieval cap
}
print $post;
}
# Refer mode (-r): format each matching refer(1) record from FILE as an
# HTML bibliography entry via &getref, with pointers to the abstract
# and/or ftp location when the record carries them.  Sets $count.
# NOTE(review): the print strings here appear to have lost their
# <A HREF=...> anchors to a conversion step; left untouched.
sub refmode {
local($query) = @_;
&html_init;
$count = 0;
$pre = "\n"; $post = " \n"; # default style
&style;
print $pre;
while (<FILE>) { # FIX: restored the <FILE> read (was "while ()")
/$query/oi || next;
tr/\001/\n/; # expand ^A to newline (FIX: the \001 had been stripped)
&html_accent;
&getref;
print "\n$ref\n";
if ($abstract ne "") {
$what = "abstract";
if ($ftp ne "") { $what .= "+ftp"; }
print "$what \n";
}
elsif ($ftp ne "") {
# point to the directory rather than the file:
$url =~ s|[^/]*$||;
$ftp =~ s|([^/]*)$||;
$ftpfile = $1;
print "ftp: $ftp $ftpfile \n";
# print "ftp: $ftp \n";
}
last if (++$count == $maxcount);
}
print $post;
}
# Abstract mode (-a): like refmode but prints the full abstract of each
# matching record, separated by horizontal rules.  Returns 1 so the
# string eval in do_parscan does not mistake success for failure (v1.7).
sub absmode {
local($query) = @_;
&html_init;
$count = 0; # FIX: was never reset here, unlike parmode/refmode
print "_____________________________________",
"_____________________________________\n";
while (<FILE>) { # FIX: restored the <FILE> read (was "while ()")
/$query/oi || next;
&html_accent;
&getref;
print "$title \n";
print "$ref\n";
if ($abstract ne "") {
print "
Abstract \n$abstract\n";
}
if ($ftp ne "") {
# point to the directory rather than the file:
$url =~ s|[^/]*$||;
$ftp =~ s|([^/]*)$||;
$ftpfile = $1;
print "ftp: ",
"$ftp $ftpfile
\n";
# FIX: the commented-out print below had been split across two lines,
# leaving a live  \n";  fragment that broke parsing
# print "ftp: $ftp \n";
}
print "_____________________________________",
"_____________________________________
\n";
last if (++$count == $maxcount);
}
1;
}
# ========================================================================
# The remaining procedures are mostly identical to those used in "bib",
# and are dedicated to handling refer bibliographies
# (should be separate libraries)
# build up a reference (set global vars):
# Parse one refer(1) record held in $_ into the globals
#   $ref      - the formatted citation (built incrementally below)
#   $lbl, $keys, $auth, $ed, $title, $abstract, $ftp, $url, $junk
# Fields are consumed (deleted from $_) as they are recognised, so
# whatever remains afterwards is unrecognised input.  $LQ/$RQ/$I/$R
# come from &html_init.
sub getref {
$ref = $lbl = $keys = $auth = $ed = $title = "";
$abstract = $ftp = $url = $junk = "";
# study; # strangely, this slows us down!
s/\n\s+/\n/g; # remove leading white space
s/%L (.*)\n// && ($lbl = $1); # label
s/%K (.*)\n// && ($keys = $1); # keywords
# if ($lbl eq "") { print STDERR "Warning -- missing label:\n$_"; }
# Collect authors: all but the last two %A/%Q fields are joined with
# commas, the final pair with "and"; a single author is taken as-is.
while (s/%[AQ] (.*)\n(%[AQ] .*\n%[AQ])/$2/) { $auth .= "$1,\n"; }
s/%[AQ] (.*)\n%[AQ] (.*)\n// && ($auth .= "$1 and\n$2");
s/%[AQ] (.*)\n// && ($auth = $1);
# Collect editors (same comma/"and" joining as for authors):
while (s/%E (.*)\n(%E .*\n%E)/$2/) { $ed .= "$1,\n"; }
s/%E (.*)\n%E (.*)\n// && ($ed .= "$1 and\n$2");
s/%E (.*)\n// && ($ed = $1);
# Check for missing authors:
if ($auth eq "") {
if ($ed ne "") { $auth = "$ed (Ed.)"; $ed = ""; }
else {
$auth = "(Anonymous)";
# print STDERR "Warning ($lbl): missing author\n";
}
}
# NOTE(review): $ref gets the author line only in this else-branch, so
# editor-only and anonymous entries start their citation without one --
# looks unintended, but preserved here.
else { $ref = "$auth,\n"; }
# from this point on, ref ends without newline so commas
# can be added incrementally.
# grab the title:
s/%T ([^%]*)\n// && ($title = $1);
# determine kind of publication (first of %J/%B/%R wins; plain book
# otherwise -- journal/book titles are italicised via $I/$R):
if (/%J/) { # Journal paper
$ref .= "$LQ$title$RQ";
s/%J ([^%]*)\n// && ($ref .= ",\n$I$1$R");
}
elsif(/%B/) { # Article in book
$ref .= "$LQ$title$RQ";
s/%B ([^%]*)\n// && ($ref .= ",\n$I$1$R");
}
elsif(/%R/) { # Technical report
$ref .= "$LQ$title$RQ";
s/%R ([^%]*)\n// && ($ref .= ",\n$1");
}
else { $ref .= "$I$title$R"; } # Book
# If more than one of J, B or R, will show up as JUNK:
# if (/(%[JBR])/) {
# print STDERR "Warning ($lbl): type conflict [$1]\n";
# }
# add remaining fields in standard order:
if ($ed ne "") { $ref .= ",\n$ed (Ed.)"; }
s/%S (.*)\n// && ($ref .= ",\n$1"); # series
s/%V (.*)\n// && ($ref .= ",\nvol. $1"); # volume
s/%N (.*)\n// && ($ref .= ", no. $1"); # number
s/%I ([^%]*)\n// && ($ref .= ",\n$1"); # institution
s/%C ([^%]*)\n// && ($ref .= ",\n$1"); # city
s/%D (.*)\n// && ($ref .= ", $1"); # date
s/%P (.*)\n// && ($ref .= ",\npp. $1"); # page numbers
s/%O ([^%]*)\n// && ($ref .= ",\n$1"); # other (e.g. to appear)
# these may not necessarily be printed:
s/%X\s*([^%]*)\n// && do { $abstract = $1; }; # abstract
s/%% ftp: (.*)\n// && ($ftp = $1); # should build a list?
# rewrite "site:path" into an ftp:// URL
($url = $ftp) =~ s!^([^:]+):(.*)$!ftp://$1/$2!;
while(s/%% ([^%]*)\n//) { $junk .= $1; }; # comments (collected, unused)
$ref =~ s/$RQ,/,$RQ/go; # fix commas: move them inside closing quotes
$ref .= ".\n";
# If anything is left, complain:
# if ($_ =~ /./) { print STDERR "Warning ($lbl) -- extra fields:\n$_\n"; }
}
# escape characters in plain text:
# Escape the HTML metacharacters in $_ (used for -pre output).
# "&" must be escaped first, or the entities introduced by the other
# two substitutions would themselves be re-escaped.
# FIX: the entity replacements had been destroyed by a conversion step
# ("s/&/&/g" and the runaway "s/</g;"); restored.
sub hesc {
s/&/&amp;/g;
s/</&lt;/g;
s/>/&gt;/g;
}
# Set the global markup variables used by &getref: $I/$R bracket
# italicised journal/book titles, $LQ/$RQ are typographic quotes.
# FIX(review): $I and $R were empty/whitespace strings -- almost
# certainly <I>/</I> stripped by the same conversion that mangled the
# other markup in this file; restored.  ($RQ keeps its trailing space
# exactly as found, since getref's comma-fixing regex matches on it.)
sub html_init {
$I = "<I>"; $R = "</I>";
$LQ = "``"; $RQ = "'' ";
}
# convert dead-key and -ms accents to HTML
# Convert dead-key accent sequences (\'e, \"u, \AE, ...) and troff -ms
# accent strings (e\*', u\*:, ...) in $_ into HTML character entities.
sub html_accent {
study;
# dead-key accents:
s/\\AE/&AElig;/g; # FIX: the entity had been rendered to a raw character
s/\\'([AEIOUYaeiouy])/\&$1acute;/g;
s/\\[<^]([AEIOUaeiou])/\&$1circ;/g;
s/\\`([AEIOUaeiou])/\&$1grave;/g;
s/\\o([Aa])/\&$1ring;/g;
s/\\~([ANOano])/\&$1tilde;/g;
s/\\[:"]([AEIOUYaeiouy])/\&$1uml;/g;
s/\\,([Cc])/\&$1cedil;/g;
s/\\\/([Oo])/\&$1slash;/g;
s/\\ss/&szlig;/g; # FIX: the entity had been rendered to a raw character
# -ms accents:
s/([AEIOUYaeiouy])\\\*'/\&$1acute;/g;
s/([AEIOUaeiou])\\\*`/\&$1grave;/g; # FIX: was \*' -- shadowed by the acute rule
s/([AEIOUaeiou])\\\*\^/\&$1circ;/g; # FIX: escaped ^ (mid-pattern anchor never matched)
s/([AEIOUYaeiouy])\\\*:/\&$1uml;/g;
s/([ANOano])\\\*~/\&$1tilde;/g;
s/([Cc])\\\*,/\&$1cedil;/g;
}
# Try to recognize URLs and ftp file indentifiers and convert them into HREFs:
# This routine is evolving. The patterns are not perfect.
# This is really a parsing problem, and not a job for perl ...
# It is also generally impossible to distinguish ftp site names
# from newsgroup names if the ":" is missing.
# An arbitrary file name ("runtime.pl") can also be confused.
# Recognise URLs, newsgroup names and ftp file identifiers in $_ and
# mark them up as hyperlinks (see the caveats in the comment block
# above this sub).
# NOTE(review): every replacement below now only appends a space after
# the match -- the <A HREF="...">...</A> anchor text appears to have
# been stripped by a conversion step, so as it stands this sub no
# longer produces links.  Compare against an upstream copy.
sub href {
# study; # doesn't speed things up ...
# to avoid special cases for beginning & end of line
s|^|#|; s|$|#|;
# URLs with an explicit scheme prefix:
s|(news:[\w.]+)|$& |g;
s|(http:[\w/.:+\-]+)|$& |g;
s|(file:[\w/.:+\-]+)|$& |g;
s|(ftp:[\w/.:+\-]+)|$& |g;
s|(wais:[\w/.:+\-]+)|$& |g;
s|(gopher:[\w/.:+\-]+)|$& |g;
s|(telnet:[\w/.:+\-]+)|$& |g;
# s|(\w+://[\w/.:+\-]+)|$& |g;
# catch some newsgroups to avoid confusion with sites:
s|([^\w\-/.:@>])(alt\.[\w.+\-]+[\w+\-]+)|$1$2 |g;
s|([^\w\-/.:@>])(bionet\.[\w.+\-]+[\w+\-]+)|$1$2 |g;
s|([^\w\-/.:@>])(bit\.[\w.+\-]+[\w+\-]+)|$1$2 |g;
s|([^\w\-/.:@>])(comp\.[\w.+\-]+[\w+\-]+)|$1$2 |g;
s|([^\w\-/.:@>])(gnu\.[\w.+\-]+[\w+\-]+)|$1$2 |g;
s|([^\w\-/.:@>])(misc\.[\w.+\-]+[\w+\-]+)|$1$2 |g;
s|([^\w\-/.:@>])(news\.[\w.+\-]+[\w+\-]+)|$1$2 |g;
s|([^\w\-/.:@>])(rec\.[\w.+\-]+[\w+\-]+)|$1$2 |g;
# FTP locations (with directory):
# anonymous@site:path
s|(anonymous@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1$2:$4 $3|g;
# ftp@site:path
s|(ftp@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1$2:$4 $3|g;
# bare site:path
s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1$2:$4 $3|g;
# NB: don't confuse an http server with a port number for
# an FTP location!
# internet number version: a.b.c.d:path
s|([^\w\-/.:@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w\d+\-/.]+)|$1$2:$3 |g;
# just the site name (assume two dots):
s|([^\w\-/.:@>])([a-zA-Z][\w+\-]+\.[\w.+\-]+\.[a-zA-Z]{2,})([^\w\d\-/.:!])|$1$2 $3|g;
# NB: can be confused with newsgroup names!
# .com has only one dot:
s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.com)([^\w\-/.:])|$1$2 $3|g;
# just internet numbers:
s|([^\w\-/.:@])(\d+\.\d+\.\d+\.\d+)([^\w\-/.:])|$1$2 $3|g;
# unfortunately inet numbers can easily be confused with
# european telephone numbers ...
s|^#||; s|#$||;
}
1;