#! /bin/perl
#
# url.pl	--- recognize, parse and retrieve URLs
#
# This package contains:
#
# url'href:	identify URLs and turn them into hypertext links
# url'get:	parse a URL and perform an hget
# url'parse:	parse a URL and return ($host,$port,$path,$request)
# url'abs:	convert relative URLs to absolute ones
# url'hget:	perform an http request and return the result
# 
# Oscar Nierstrasz 26/8/93 oscar@cui.unige.ch
#
# BUGS: relative paths work only if directories are always
# terminated with a "/" -- otherwise the directory is assumed
# to be just a filename and the parent directory is remembered
# as the current path.
#
# Can't get $! to return error messages properly.
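#
# A minimal usage sketch (assuming this file is on your Perl library
# path; the URL below is illustrative, not guaranteed to exist):
#
#	require "url.pl";
#	$page = &url'get("http://cui.unige.ch/index.html");
#	defined($page) || die "url'get failed\n";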

package url;

require "sys/socket.ph";

# package globals (remember state between calls):
$host = undef;
$port = undef;
$path = undef;
$request = undef;

$sockaddr = 'S n a4 x8';	# struct sockaddr_in: family, port, address, padding
chop($thishost = `hostname`);
($name, $aliases, $proto) = getprotobyname("tcp");
($name, $aliases, $type, $len, $thisaddr) = gethostbyname($thishost);
$this = pack($sockaddr, &AF_INET, 0, $thisaddr);	# local end, any port

# Try to recognize URLs and ftp file identifiers and convert them into HREFs:
# This routine is evolving.  The patterns are not perfect.
# This is really a parsing problem, and not a job for perl ...
# It is also generally impossible to distinguish ftp site names
# from newsgroup names if the ":<directory>" is missing.
# An arbitrary file name ("runtime.pl") can also be confused with a site name.
sub href {
	# study; # doesn't speed things up ...

	# to avoid special cases for beginning & end of line
	s|^|>>>|; s|$|<<<|;

	# URLS:
	s|(news:[\w.]+)|<A HREF="$&">$&</A>|g;
	s|(http:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(file:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(ftp:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(wais:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(gopher:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(telnet:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	# s|(\w+://[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;

	# catch some newsgroups to avoid confusion with sites:
	s|([^\w\-/.:@>])(alt\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bionet\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bit\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(comp\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(gnu\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(misc\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(news\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(rec\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;

	# FTP locations (with directory):
	s|(anonymous@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	s|(ftp@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# NB: don't confuse an http server with a port number for
	# an FTP location!
	# internet number version:
	s|([^\w\-/.:@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w+\-/.]+)|$1<A HREF="file://$2/$3">$2:$3</A>|g;

	# just the site name (assume two dots):
	s|([^\w\-/.:@>])([a-zA-Z][\w+\-]+\.[\w.+\-]+\.[a-zA-Z]{2,})([^\w\-/.:!])|$1<A HREF="file://$2">$2</A>$3|g;
	# NB: can be confused with newsgroup names!
	# <site>.com has only one dot:
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.com)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;

	# just internet numbers:
	s|([^\w\-/.:@])(\d+\.\d+\.\d+\.\d+)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;
	# unfortunately inet numbers can easily be confused with
	# european telephone numbers ...

	s|^>>>||; s|<<<$||;
}
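
# For example, to mark up the URLs on every line of standard input
# (a sketch; &href works on $_ in place):
#
#	while (<STDIN>) { &url'href; print; }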

# parse a URL, issue the request and return the result
sub get {
	local($url) = @_;
	($host,$port,$path,$request) = &parse($host,$port,$path,$url);
	if ($host) { &hget($host,$port,$request); }
	else { undef; }
}

# convert a URL to ($host,$port,$path,$request);
# given the previous host, port and path, will handle relative URLs
sub parse {
	local($host,$port,$path,$url) = @_;
	if ($url =~ m|^http://(.*)|) {
		$host = $1;
		$port = 80;	# default
		$request = "/";	# default
		($host =~ s|^([^/]+)(/.*)$|$1|) && ($request = $2);
		($host =~ s/:(\d+)$//) && ($port = $1);
		($path = $request) =~ s|[^/]*$||;
	}
	else {
		# relative URL of form "http:<request>"
		if ($url =~ /^http:(.*)/) { $request = $1; }
		# other URL of form "service:<request>"
		elsif ($url =~ /^\w+:/) {
			# $! = "url'parse: $url is not an http URL\n";
			print STDERR "url'parse: $url is not an http URL\n";
			return (undef,undef,undef,undef);
		}
		# relative URL of form "<request>"
		else { $request = $url; }
		$request =~ s|^$|/|;
		$request =~ s|^([^/])|$path$1|; # relative path
		$request =~ s|/\./|/|g;
		while ($request =~ m|/\.\./|) {
			$request =~ s|[^/]*/\.\./||;
		}
		# assume previous host & port:
		unless ($host) {
			# $! = "url'parse: no host for $url\n";
			print STDERR "url'parse: no host for $url\n";
			return (undef,undef,undef,undef);
		}
	}
	($host,$port,$path,$request);
}
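
# For example (a sketch; the URL is illustrative):
#
#	($host,$port,$path,$request) =
#		&url'parse(undef,undef,undef,"http://cui.unige.ch:8001/w3catalog");
#	# yields ("cui.unige.ch", 8001, "/", "/w3catalog")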

# convert relative http URLs to absolute ones:
sub abs {
	local($url,$page) = @_;
	($host,$port,$path,$request) = &parse(undef,undef,undef,$url);
	$host || return $page;	# parse failed; return the page unchanged
	$root = "http://$host:$port";
	@hrefs = split(/<[Aa]/,$page);
	$n = $[;	# array base: element 0 is the text before the first <A
	while (++$n <= $#hrefs) {
		# absolute URLs ok:
		($hrefs[$n] =~ m|href\s*=\s*"http://|i) && next;
		($hrefs[$n] =~ m|href\s*=\s*"\w+:|i) && next;
		# relative URL from root:
		($hrefs[$n] =~ s|href\s*=\s*"/([^"]*)"|HREF="$root/$1"|i) && next;
		($hrefs[$n] =~ s|href\s*=\s*/([^>]*)>|HREF=$root/$1>|i) && next;
		# relative from $path:
		$hrefs[$n] =~ s|href\s*=\s*"([^/"][^"]*)"|HREF="$root$path$1"|i;
		$hrefs[$n] =~ s|href\s*=\s*([^/">][^>]*)>|HREF=$root$path$1>|i;
		# collapse relative paths:
		$hrefs[$n] =~ s|/\./|/|g;
		while ($hrefs[$n] =~ m|/\.\./|) {
			$hrefs[$n] =~ s|[^/]*/\.\./||;
		}
	}
	join("<A",@hrefs);
}
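
# For example, to absolutize the links in a freshly fetched page
# (a sketch; the URL is illustrative):
#
#	$base = "http://cui.unige.ch/";
#	$page = &url'abs($base, &url'get($base));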

# perform an http request and return the result
# Code adapted from Marc van Heyningen
sub hget {
	local($host,$port,$request) = @_;
	($fqdn, $aliases, $type, $len, $thataddr) = gethostbyname($host);
	defined($thataddr) || return undef;	# unknown host
	$that = pack($sockaddr, &AF_INET, $port, $thataddr);
	socket(FS, &AF_INET, &SOCK_STREAM, $proto) || return undef;
	bind(FS, $this) || return undef;
	local($/);	# undefine the input separator so <FS> slurps the whole reply
	unless (eval q!
		$SIG{'ALRM'} = "url'timeout";
		alarm(30);
		connect(FS, $that) || return undef;
		select(FS); $| = 1; select(STDOUT);
		# The following would be needed for HTTP/1.0
		# NB: Need extra \n to terminate MIME header:
		# print FS "GET $request HTTP/1.0\r\n\n";
		print FS "GET $request\r\n";
		$page = <FS>;
		alarm(0);	# cancel the pending timeout
		$SIG{'ALRM'} = "IGNORE";
		!) {
			return undef;
		}
	close(FS);
	# With HTTP/1.0 the reply would also include a MIME header
	$page;
}

sub timeout { die "Timeout\n"; }

1;