#
# parscan.pl.2.2.1 --- query a file of HTML paragraphs (Version for Plexus 2.2.1)
#
# Oscar Nierstrasz oscar@cui.unige.ch
#
# Simple package to query a file of HTML paragraphs.  Paragraphs can be
# arbitrary, standalone blocks of HTML text, separated by blank lines.
# To query a file, use the URL:
#
#	http://<site>/parscan/<file>?<query>
#
# where http://<site>/<file> is the file's real URL.  If <base> is the name
# without the .html extension, <base> can be used instead of <file>.  If
# a header file <base>.hdr exists, parscan will print that instead of the
# default header.  In addition, if <base>.query exists, it will be used
# whenever a non-empty query is given.  (Normally <base>.hdr will be a
# cover page with introductory information, whereas <base>.query will
# only contain the title and main headline.)
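#
# For example (hypothetical paths), if a file is normally reached as
#
#	http://<site>/OSG/news.html
#
# then
#
#	http://<site>/parscan/OSG/news?workshop
#
# prints the paragraphs of OSG/news.html that match "workshop", preceded
# by OSG/news.query if it exists (or OSG/news.hdr otherwise).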
#
# If the file contents are not standalone HTML blocks but, for example,
# list items or pre-formatted text, parscan can be instructed to bracket
# the results of the search with <PRE> and </PRE>, <DL> and </DL>,
# <OL> and </OL>, or <UL> and </UL>.  The URL to use is:
#
#	http://<site>/parscan/<flag>/<file>?<query>
#
# where <flag> is one of: -pre, -dl, -ol or -ul.
# Adjacent blocks of text must still be separated by a blank line, however.
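#
# For example (hypothetical file), if items.html consists of <LI> items
# separated by blank lines, then
#
#	http://<site>/parscan/-ul/items?printer
#
# wraps the matching items in <UL> and </UL>.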
#
# The flag -url will cause parscan to search for URLs and ftp pointers
# and convert them into hypertext links.  This is probably most interesting
# in combination with the -pre flag to query plain text files.
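#
# Since the flags are parsed from a single leading path segment, they can
# simply be concatenated, e.g. (hypothetical file):
#
#	http://<site>/parscan/-pre-url/README.txt?archive
#
# lists the matching blocks of README.txt as pre-formatted text with any
# URLs or ftp pointers turned into links.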
#
# The same package can be used to query a database of refer(1) style
# bibliography entries.  Use the URL:
#
#	http://<site>/parscan/-r/<file>?<query>
#
# The -a flag is used internally by parscan and is automatically
# generated when a bibliography entry contains an abstract (%X field).
# The URL http://<site>/parscan/-a/<file>?<label> is then automatically
# generated where <label> is the value of the %L field.
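#
# For example (hypothetical entry), a record in OSG/Bib/main.bib containing
#
#	%L auth93a
#	%X This note illustrates ...
#
# will be listed with an "abstract" link of the form:
#
#	http://<site>/parscan/-a/OSG/Bib/main.bib?auth93a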
#
# Finally, the -l flag causes parscan to retrieve refer records on a
# line-by-line basis, for files in which the fields of each record are
# separated by ^A characters instead of newlines.
# (This is mainly interesting for the CUI library database.)
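#
# In that format a whole record occupies a single line, e.g. (hypothetical):
#
#	%A A. Author^A%T An Example Title^A%D 1993^A%L auth93a
#
# and is queried with the -r and -l flags combined in the leading segment:
#
#	http://<site>/parscan/-r-l/<file>?<query>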
#
# For use with Plexus 2.2.1:
#
# Add the following line to plexus.conf:
# $map{'parscan'} =
#		'require "$http_sdir/parscan.pl"; &do_parscan("$rest?$query")';
# Add to @inclist: "$http_sdir/parscan.pl",
#
# You may also want to add some aliases in $map to hide the location of the
# database to be searched, for example:
#
# $oobib = "-r/OSG/Bib/main.bib";
# $map{'bibrefs'} = 'require "$http_sdir/parscan.pl"; &do_parscan("$oobib?$query")';
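#
# The bibliography can then be searched directly with a URL like:
#
#	http://<site>/bibrefs?<query>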
#
# NB: A version for Plexus 3.0 is also available by ftp from
# cui.unige.ch:/PUBLIC/oscar/scripts/parscan.pl

$map{'parscan'} = '&do_parscan("$rest?$query")';

package parscan;

# Versions:
#$v = 'parscan v1.0'; # 2/9/93
#$v = 'parscan v1.1'; # 7/9/93 -- added .query files; merged with bibrefs.pl
$v = 'parscan v1.2'; # 8/9/93 -- added -pre, -dl etc. flags and -url

$omn = '<A HREF="http://cui_www.unige.ch/OSG/omn.html"><I>OMN</I></A><P>';
$sig = "<I>This page was generated by $v.</I>\n$omn<P>\n";

$maxcount = 250; # MAX records to retrieve (hardwired ...)

sub main'do_parscan {
	local($file) = @_;
	($file =~ s/\?(.*)//) && ($query = $1);
	($file =~ s/^(-[^\/]*)\///) && ($flags = "$1-");
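	# e.g. "-pre-url/OSG/notes?lambda" yields $flags = "-pre-url-",
	# $file = "OSG/notes" and $query = "lambda"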

	if ($file eq "") { die "do_parscan: missing filename\n"; }

	unless (-e $file) {
		$file = "$file.html";
		-e $file || die "do_parscan: Can't find $file\n";
	}
	if (-d $file) { $file = "$file/index.html"; }
	unless (-f $file) {
		$file = "$file.html";
		-f $file || die "do_parscan: Can't find $file\n";
	}

	&main'MIME_header('ok', 'text/html');

	unless ($flags =~ /-a-/) { print "<HEAD>\n<ISINDEX>\n"; }

	# look for a header file:
	($base = $file) =~ s/\.html$//;
	if (($query && open(HDR,"$base.query"))
		|| (open(HDR,"$base.hdr"))) {
			$/ = undef; # gobble the whole input
			print <HDR>;
		}
	else {
		# no header file, so construct one
		print "<TITLE>Scan of $file</TITLE>\n",
		"</HEAD>\n<BODY>\n<H1>Scan of $file</H1>\n";
	}

	unless ($query || $flags =~ /-a-/) {
		print <<EOM;
___________________________________________________________________<P>
Provide a
<A HREF=http://www.cis.ohio-state.edu:85/info/perl.info,Regular%20Expressions>
Perl regular expression</A> as a search pattern.
<P>
EOM
	}

	if ($query) {
		print "<B>Result of search for \"$query\":</B><P>\n";
		open(FILE,"$file.html")
			|| open(FILE,$file)
			|| die "do_parscan: \"$file\" $!";

		if ($flags =~ /-l-/) { $/ = "\n"; }
		else { $/ = ""; }

		if ($flags =~ /-pre-/) { $pre = "<PRE>\n"; $post = "</PRE>\n"; }
		elsif ($flags =~ /-dl-/) { $pre = "<DL>\n"; $post = "</DL>\n"; }
		elsif ($flags =~ /-ol-/) { $pre = "<OL>\n"; $post = "</OL>\n"; }
		elsif ($flags =~ /-ul-/) { $pre = "<UL>\n"; $post = "</UL>\n"; }

		if ($flags =~ /-r-/) { &refmode($query); }
		elsif ($flags =~ /-a-/) { &absmode($query); }
		else { &parmode($query); }

		if ($count == $maxcount) {
			print "<B>Too many matching records (> $maxcount)!\n",
			"Try a more restrictive pattern.</B><P>";
		}
		elsif ($count == 0) {
			print "<B>No matching entries found.</B><P>\n";
		}
		print "$sig\n";
		close(FILE);
	}
}

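# parmode: print every matching paragraph (or line), bracketed by $pre/$post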
sub parmode {
	local($query) = @_;
	print $pre;
	while (<FILE>) {
		/$query/oi || next;
		if ($flags =~ /-url-/) { &href; }
		print;
		last if (++$count == $maxcount);
	}
	print $post;
}

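# refmode: list the matching refer entries as an ordered list, with links
# to abstracts and ftp'able files where available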
sub refmode {
	local($query) = @_;
	&html_init;
	$count = 0;
	print "<OL>\n";
	while (<FILE>) {
		/$query/oi || next;
		tr//\n/; # expand ^A to CR
		&html_accent;
		&getref;
		print "\n<LI>$ref\n";
		if ($abstract ne "") {
			$what = "abstract";
			if ($ftp ne "") { $what .= "+ftp"; }
			print "<A HREF=\"/parscan/-a/$file?$lbl\"><I>$what</I></A>\n";
		}
		elsif ($ftp ne "") {
			print "<I>ftp: <A HREF=\"$url\">$ftp</A></I>\n";
		}
		last if (++$count == $maxcount);
	}
	print "</OL>\n";
}

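# absmode: print the full reference, abstract and ftp pointer of each
# matching record (normally selected by its %L label via a -a URL)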
sub absmode {
	local($query) = @_;
	&html_init;
	print "_____________________________________",
		"_____________________________________<P>\n";
	while (<FILE>) {
		/$query/oi || next;
		&html_accent;
		&getref;
		print "<H2>$title</H2>\n";
		print "$ref<P>\n";
		if ($abstract ne "") {
			print "<H3>Abstract</H3>\n$abstract<P>\n";
		}
		if ($ftp ne "") {
			print "<B>ftp:</B> <A HREF=\"$url\"><I>$ftp</I></A><P>\n";
		}
		print "_____________________________________",
			"_____________________________________<P>\n";
		last if (++$count == $maxcount);
	}
}

# ========================================================================

# The remaining procedures are mostly identical to those used in "bib",
# and are dedicated to handling refer bibliographies
# (should be separate libraries)

# build up a reference:
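# For example, a (hypothetical) record such as
#
#	%A A. Author
#	%A B. Writer
#	%T An Example Title
#	%J Journal of Examples
#	%V 7
#	%D 1993
#	%P 1-10
#	%L auth93a
#
# ends up in $ref roughly as (modulo the <B>/<I> markup set by html_init):
#
#	A. Author and B. Writer,
#	``An Example Title,'' Journal of Examples, vol. 7, 1993, pp. 1-10.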
sub getref {
	$ref = $lbl = $keys = $auth = $ed = $title = "";
	$abstract = $ftp = $url = $junk = "";

	# study; # strangely, this slows us down!

	s/\n\s+/\n/g;	# remove leading white space
	s/%L (.*)\n// && ($lbl = $1);			# label
	s/%K (.*)\n// && ($keys = $1);			# keywords
	# if ($lbl eq "") { print STDERR "Warning -- missing label:\n$_"; }

	# Collect authors:
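	# While three or more author lines remain, peel off the first one and
	# append it to $auth with a comma; the last two are then joined with
	# "and", and a lone author is taken as-is.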
	while (s/%[AQ] (.*)\n(%[AQ] .*\n%[AQ])/$2/) { $auth .= "$1,\n"; }
	s/%[AQ] (.*)\n%[AQ] (.*)\n// && ($auth .= "$1 and\n$2");
	s/%[AQ] (.*)\n// && ($auth = $1);

	# Collect editors:
	while (s/%E (.*)\n(%E .*\n%E)/$2/) { $ed .= "$1,\n"; }
	s/%E (.*)\n%E (.*)\n// && ($ed .= "$1 and\n$2");
	s/%E (.*)\n// && ($ed = $1);

	# Check for missing authors:
	if ($auth eq "") {
		if ($ed ne "") { $auth = "$ed (Ed.)"; $ed = ""; }
		else {
			$auth = "(Anonymous)";
			# print STDERR "Warning ($lbl): missing author\n";
		}
	}
	$ref = "$auth,\n";

	# from this point on, ref ends without newline so commas
	# can be added incrementally.

	# grab the title:
	s/%T ([^%]*)\n// && ($title = $1);
	# determine kind of publication:
	if (/%J/) {				# Journal paper
		$ref .= "$LQ$title$RQ";
		s/%J ([^%]*)\n// && ($ref .= ",\n$I$1$R");
	}
	elsif(/%B/) {				# Article in book
		$ref .= "$LQ$title$RQ";
		s/%B ([^%]*)\n// && ($ref .= ",\n$I$1$R");
	}
	elsif(/%R/) {				# Technical report
		$ref .= "$LQ$title$RQ";
		s/%R ([^%]*)\n// && ($ref .= ",\n$1");
	}
	else { $ref .= "$I$title$R"; }		 # Book
	# If more than one of J, B or R, will show up as JUNK:
	# if (/(%[JBR])/) {
		# print STDERR "Warning ($lbl): type conflict [$1]\n";
	# }

	# add remaining fields in standard order:
	if ($ed ne "") { $ref .= ",\n$ed (Ed.)"; }
	s/%S (.*)\n// && ($ref .= ",\n$1");		# series
	s/%V (.*)\n// && ($ref .= ",\nvol. $1");	# volume
	s/%N (.*)\n// && ($ref .= ", no. $1");		# number
	s/%I ([^%]*)\n// && ($ref .= ",\n$1");		# institution
	s/%C ([^%]*)\n// && ($ref .= ",\n$1");		# city
	s/%D (.*)\n// && ($ref .= ", $1");		# date
	s/%P (.*)\n// && ($ref .= ",\npp. $1");		# page numbers
	s/%O ([^%]*)\n// && ($ref .= ",\n$1");		# other (e.g. to appear)

	# these may not necessarily be printed:
	s/%X ([^%]*)\n// && do { $abstract = $1; };	# abstract
	s/%% ftp: (.*)\n// && ($ftp = $1);		# should build a list?
	($url = $ftp) =~ s!^([^:]+):(.*)$!ftp://$1/$2!;

	while(s/%% ([^%]*)\n//) { $junk .= $1; };	# comments

	$ref =~ s/$RQ,/,$RQ/go;				# fix commas
	$ref .= ".\n";

	# If anything is left, complain:
	# if ($_ =~ /./) { print STDERR "Warning ($lbl) -- extra fields:\n$_\n"; }
}

sub html_init {
	$I = "<I>"; $R = "</I>";
	$LQ = "<B>``"; $RQ = "''</B>";
}

# convert dead-key and -ms accents to HTML
sub html_accent {
	study;

	# dead-key accents:
	s/\\AE/\&AElig;/g;
	s/\\'([AEIOUYaeiouy])/\&$1acute;/g;
	s/\\[<^]([AEIOUaeiou])/\&$1circ;/g;
	s/\\`([AEIOUaeiou])/\&$1grave;/g;
	s/\\o([Aa])/\&$1ring;/g;
	s/\\~([ANOano])/\&$1tilde;/g;
	s/\\[:"]([AEIOUYaeiouy])/\&$1uml;/g;
	s/\\,([Cc])/\&$1cedil;/g;
	s/\\\/([Oo])/\&$1slash;/g;
	s/\\ss/\&szlig;/g;

	# -ms accents:
	s/([AEIOUYaeiouy])\\\*'/\&$1acute;/g;
	s/([AEIOUaeiou])\\\*`/\&$1grave;/g;
	s/([AEIOUaeiou])\\\*^/\&$1circ;/g;
	s/([AEIOUYaeiouy])\\\*:/\&$1uml;/g;
	s/([ANOano])\\\*~/\&$1tilde;/g;
	s/([Cc])\\\*,/\&$1cedil;/g;
}

# Try to recognize URLs and ftp file identifiers and convert them into HREFs:
# This routine is evolving.  The patterns are not perfect.
# This is really a parsing problem, and not a job for perl ...
# It is also generally impossible to distinguish ftp site names
# from newsgroup names if the ":<directory>" is missing.
# An arbitrary file name ("runtime.pl") can also be confused.
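# For example, "see ftp.foo.edu:pub/paper.ps" (hypothetical host) becomes
#	see <A HREF="file://ftp.foo.edu/pub/paper.ps">ftp.foo.edu:pub/paper.ps</A>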
sub href {
	# study; # doesn't speed things up ...

	# to avoid special cases for beginning & end of line
	s|^|>>>|; s|$|<<<|;

	# URLS:
	s|(news:[\w.]+)|<A HREF="$&">$&</A>|g;
	s|(http:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(file:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(ftp:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(wais:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(gopher:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(telnet:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	# s|(\w+://[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;

	# catch some newsgroups to avoid confusion with sites:
	s|([^\w\-/.:@>])(alt\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bionet\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bit\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(comp\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(gnu\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(misc\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(news\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(rec\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;

	# FTP locations (with directory):
	s|(anonymous@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	s|(ftp@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# NB: don't mistake an http server address with a port number
	# for an FTP location!
	# internet number version:
	s|([^\w\-/.:@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w+\-/.]+)|$1<A HREF="file://$2/$3">$2:$3</A>|g;

	# just the site name (assume two dots):
	s|([^\w\-/.:@>])([a-zA-Z][\w+\-]+\.[\w.+\-]+\.[a-zA-Z]{2,})([^\w\-/.:!])|$1<A HREF="file://$2">$2</A>$3|g;
	# NB: can be confused with newsgroup names!
	# <site>.com has only one dot:
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.com)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;

	# just internet numbers:
	s|([^\w\-/.:@])(\d+\.\d+\.\d+\.\d+)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;
	# unfortunately inet numbers can easily be confused with
	# european telephone numbers ...

	s|^>>>||; s|<<<$||;
}

1;

