#! /bin/perl
#
# url.pl	--- recognize, parse and retrieve URLs
#
# This package contains:
#
# url'href:	identify URLs and turn them into hypertext links
# url'get:	parse a URL and perform an http get
# url'parse:	parse a URL and return ($type,$host,$port,$path,$request)
# url'abs:	convert relative URLs to absolute ones
# url'http:	perform an http request and return the result
# url'gopher:	perform a gopher request and return the result
# url'ftp:	perform an ftp request and return the result
#
# Oscar Nierstrasz 26/8/93 oscar@cui.unige.ch
#
# 14/9/93 -- added url'gopher (not 100% stable) and url'ftp
# 15/01/94 -- fixed url'abs to handle HREFs without surrounding quotes
# 09/02/94 -- fixed url'abs to handle images as well
#
# BUGS: relative paths work only if directories are always
# terminated with a "/" -- otherwise the last component is taken
# to be a filename, and its parent directory is remembered as the
# current path.
#
# Can't get $! to return error messages properly.
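#
# A minimal usage sketch (illustrative only -- the host and URL
# below are assumptions, not part of this package):
#
#	require "url.pl";
#	$page = &url'get("http://cui.unige.ch/index.html");
#	$_ = "see http://cui.unige.ch/ for details";
#	&url'href;	# $_ now contains the marked-up hypertext link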

package url;

require "sys/socket.ph";

# unshift(@INC, "/homes/spaf/lib/perl");
unshift(@INC, "/user/u1/oscar/Cmd/PerlLib");

# Gene Spafford's ftp package (and using the chat package).
# Added ftp'grab -- a variant of ftp'get that returns its result
# rather than writing to a local file.
require "ftplib.pl";

$user = getlogin;

# package globals (state carried between calls):
$type = undef;
$host = undef;
$port = undef;
$path = undef;
$request = undef;

$sockaddr = 'S n a4 x8';
chop($thishost = `hostname`);
($name, $aliases, $proto) = getprotobyname("tcp");
($name, $aliases, $type, $len, $thisaddr) = gethostbyname($thishost);
$thissock = pack($sockaddr, &AF_INET, 0, $thisaddr);

# Try to recognize URLs and ftp file identifiers and convert them into HREFs:
# This routine is evolving.  The patterns are not perfect.
# This is really a parsing problem, and not a job for perl ...
# It is also generally impossible to distinguish ftp site names
# from newsgroup names if the ":<directory>" is missing.
# An arbitrary file name ("runtime.pl") can also be confused.
sub href {
	# study; # doesn't speed things up ...

	# to avoid special cases for beginning & end of line
	s|^|#|; s|$|#|;

	# URLs: <service>:<rest-of-url>
	s|(news:[\w.]+)|<A HREF="$&">$&</A>|g;
	s|(http:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(file:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(ftp:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(wais:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(gopher:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(telnet:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	# s|(\w+://[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;

	# catch some newsgroups to avoid confusion with sites:
	s|([^\w\-/.:@>])(alt\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bionet\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bit\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(comp\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(gnu\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(misc\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(news\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(rec\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;

	# FTP locations (with directory):
	# anonymous@<site>:<path>
	s|(anonymous@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# ftp@<site>:<path>
	s|(ftp@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# <site>:<path>
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# NB: don't confuse an http server with a port number for
	# an FTP location!
	# internet number version: <internet-num>:<path>
	s|([^\w\-/.:@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w\d+\-/.]+)|$1<A HREF="file://$2/$3">$2:$3</A>|g;

	# just the site name (assume two dots): <site>
	s|([^\w\-/.:@>])([a-zA-Z][\w+\-]+\.[\w.+\-]+\.[a-zA-Z]{2,})([^\w\d\-/.:!])|$1<A HREF="file://$2">$2</A>$3|g;
	# NB: can be confused with newsgroup names!
	# <site>.com has only one dot:
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.com)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;

	# just internet numbers:
	s|([^\w\-/.:@])(\d+\.\d+\.\d+\.\d+)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;
	# unfortunately inet numbers can easily be confused with
	# European telephone numbers ...

	s|^#||; s|#$||;
}
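
# For example (a hypothetical input string):
#
#	$_ = "archived at ftp.cs.widener.edu:pub/zen";
#	&url'href;
#	# $_ now reads: archived at
#	#   <A HREF="file://ftp.cs.widener.edu/pub/zen">ftp.cs.widener.edu:pub/zen</A>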

# parse a URL, issue the request and return the result
sub get {
	local($url,$version) = @_;
	($type,$host,$port,$path,$request) = &parse($type,$host,$port,$path,$url);
	if ($host) {
		if ($type eq "http") { &http($host,$port,$request,$version); }
		elsif ($type eq "gopher") { &gopher($host,$port,$request); }
		elsif ($type eq "ftp") { &ftp($host,$request); }
		else { print STDERR "url'get: $type requests unimplemented\n"; }
	}
	else {
		undef;
	}
}
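
# E.g. (hypothetical URLs):
#	$page = &url'get("http://cui.unige.ch/pub/index.html");
# Relative URLs work on subsequent calls, since the previous
# $type, $host, $port and $path are kept in package globals:
#	$page = &url'get("guide.html");	# fetches /pub/guide.html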

# convert a URL to ($type,$host,$port,$path,$request)
# given previous type, host, port and path, will handle relative URLs
# NB: May need special processing for different service types (e.g., news)
sub parse {
	local($type,$host,$port,$path,$url) = @_;
	if ($url =~ m|^(\w+)://(.*)|) {
		$type = $1;
		$host = $2;
		$port = &defport($type);
		$request = "/";	# default
		($host =~ s|^([^/]+)(/.*)$|$1|) && ($request = $2);
		($host =~ s/:(\d+)$//) && ($port = $1);
		($path = $request) =~ s|[^/]*$||;
	}
	else {
		# relative URL of form "<type>:<request>"
		if ($url =~ /^(\w+):(.*)/) {
			$type = $1;
			$request = $2;
		}
		# relative URL of form "<request>"
		else { $request = $url; }
		$request =~ s|^$|/|;
		$request =~ s|^([^/])|$path$1|; # relative path
		$request =~ s|/\./|/|g;
		while ($request =~ m|/\.\./|) {
			$request =~ s|[^/]*/\.\./||;
		}
		# assume previous host & port:
		unless ($host) {
			# $! = "url'parse: no host for $url\n";
			print STDERR "url'parse: no host for $url\n";
			return (undef,undef,undef,undef,undef);
		}
	}
	($type,$host,$port,$path,$request);
}
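
# For instance (hypothetical URL):
#	($type,$host,$port,$path,$request) =
#		&url'parse(undef,undef,undef,undef,
#			"http://cui.unige.ch:8001/pub/index.html");
# yields ("http", "cui.unige.ch", 8001, "/pub/", "/pub/index.html").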

# convert relative http URLs to absolute ones:
# handles both HREFs and inlined images (IMG SRC)
sub abs {
	local($url,$page) = @_;
	($type,$host,$port,$path,$request) = &parse(undef,undef,undef,undef,$url);
	$root = "http://$host:$port";
	@hrefs = split(/<[Aa]/,$page);
	$n = $[;
	while (++$n <= $#hrefs) {
		# absolute URLs ok:
		($hrefs[$n] =~ m|href\s*=\s*"?http://|i) && next;
		($hrefs[$n] =~ m|href\s*=\s*"?\w+:|i) && next;
		# relative URL from root:
		($hrefs[$n] =~ s|href\s*=\s*"?/([^"]*)"?|HREF="$root/$1"|i) && next;
		# relative from $path:
		$hrefs[$n] =~ s|href\s*=\s*"?([^/"][^"]*)"?|HREF="$root$path$1"|i;
		# collapse relative paths:
		$hrefs[$n] =~ s|/\./|/|g;
		while ($hrefs[$n] =~ m|/\.\./|) {
			$hrefs[$n] =~ s|[^/]*/\.\./||;
		}
	}
	$page = join("<A",@hrefs);
	# duplicate code could be merged into a subroutine ...
	@hrefs = split(/<IMG/,$page);
	$n = $[;
	while (++$n <= $#hrefs) {
		# absolute URLs ok:
		($hrefs[$n] =~ m|src\s*=\s*"?http://|i) && next;
		($hrefs[$n] =~ m|src\s*=\s*"?\w+:|i) && next;
		# relative URL from root:
		($hrefs[$n] =~ s|src\s*=\s*"?/([^"]*)"?|SRC="$root/$1"|i) && next;
		# relative from $path:
		$hrefs[$n] =~ s|src\s*=\s*"?([^/"][^"]*)"?|SRC="$root$path$1"|i;
		# collapse relative paths:
		$hrefs[$n] =~ s|/\./|/|g;
		while ($hrefs[$n] =~ m|/\.\./|) {
			$hrefs[$n] =~ s|[^/]*/\.\./||;
		}
	}
	join("<IMG",@hrefs);

}
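
# Sketch (hypothetical page fragment):
#	$page = '<A HREF="doc/guide.html">guide</A>';
#	$abs = &url'abs("http://cui.unige.ch/pub/index.html", $page);
#	# $abs: <A HREF="http://cui.unige.ch:80/pub/doc/guide.html">guide</A>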

# perform an http request and return the result
# Code adapted from Marc van Heyningen
sub http {
	local($host,$port,$request,$version) = @_;
	($fqdn, $aliases, $type, $len, $thataddr) = gethostbyname($host);
	$that = pack($sockaddr, &AF_INET, $port, $thataddr);
	socket(FS, &AF_INET, &SOCK_STREAM, $proto) || return undef;
	bind(FS, $thissock) || return undef;
	local($/);
	unless (eval q!
		$SIG{'ALRM'} = "url'timeout";
		alarm(30);
		connect(FS, $that) || return undef;
		select(FS); $| = 1; select(STDOUT);
		# NB: a blank line (CRLF CRLF) terminates the request header:
		if ($version) { print FS "GET $request HTTP/1.0\r\n\r\n"; }
		else { print FS "GET $request\r\n"; }
		$page = <FS>;
		alarm(0);	# cancel the pending timeout
		$SIG{'ALRM'} = "IGNORE";
		!) {
			return undef;
		}
	close(FS);
	# With HTTP/1.0 the result includes the MIME header
	$page;
}
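
# E.g. (hypothetical host and path):
#	$page = &url'http("cui.unige.ch", 80, "/pub/index.html", 1);
# With $version set, the result starts with the MIME header,
# separated from the body by a blank line.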

# This doesn't always work -- gopher URLs sometimes contain
# a leading file type in the pathname which must be stripped off.
# Needs work.  URLs may also contain blanks, tabs and other nasties.
# IS THIS THE RIGHT PROTOCOL FOR GOPHER???
sub gopher {
	local($host,$port,$request) = @_;
	($fqdn, $aliases, $type, $len, $thataddr) = gethostbyname($host);
	$that = pack($sockaddr, &AF_INET, $port, $thataddr);
	socket(FS, &AF_INET, &SOCK_STREAM, $proto) || return undef;
	bind(FS, $thissock) || return undef;
	# gopher doesn't need leading "/":
	$request =~ s|^/||;
	# try to strip off the gopher type ...
	($request =~ s|^([I]?\d+)/||) && ($gtype = $1);
	local($/);
	unless (eval q!
		$SIG{'ALRM'} = "url'timeout";
		alarm(30);
		connect(FS, $that) || return undef;
		select(FS); $| = 1; select(STDOUT);
		print FS "$request\r\n";
		$page = <FS>;
		alarm(0);	# cancel the pending timeout
		$SIG{'ALRM'} = "IGNORE";
		!) {
			return undef;
		}
	close(FS);
	# This return value will also contain a leading type field.
	# Should be stripped off by the calling routine ...
	$page;
}

# ftp'grab is a version of ftp'get that returns the page
# retrieved rather than writing it to a local file.
# Perhaps not so nice for big files, but what the heck.
sub ftp {
	local($host,$file) = @_;
	&ftp'open($host, "ftp", "$user\@$thishost") || &fail;
	&ftp'type("i") || &fail;
	$page = &ftp'grab($file) || &fail;
	&ftp'close;
	$page;
}
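
# E.g. (hypothetical site and file):
#	$page = &url'ftp("ftp.cs.widener.edu", "pub/zen/zen-1.0.tar.Z");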

sub fail {
	$save = &ftp'error;
	&ftp'close;
	die $save;
}

sub timeout { die "Timeout\n"; }

# default ports
sub defport {
	local($type) = @_;
	if ($type eq "http") { 80; }
	elsif ($type eq "gopher") { 70; }
	else { undef; }
}

1;

