#! /bin/perl
#
# url.pl	--- recognize, parse and retrieve URLs
#
# This package contains:
#
# url'href:	identify URLs and turn them into hypertext links
# url'get:	parse a URL and perform an hget
# url'parse:	parse a URL and return ($host,$port,$path,$request)
# url'abs:	convert relative URLs to absolute ones
# url'hget:	perform an http request and return the result
# 
# Oscar Nierstrasz 26/8/93 oscar@cui.unige.ch
#
# BUGS: relative paths work only if directories are always
# terminated with a "/" -- otherwise the directory is assumed
# to be just a filename and the parent directory is remembered
# as the current path.
#
# Can't get $! to return error messages properly.
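#
# A minimal usage sketch (assuming this file is on your Perl library
# path; the URL below is illustrative, not guaranteed to exist):
#
#	require "url.pl";
#	$page = &url'get("http://cui.unige.ch/index.html");
#	defined($page) || die "url'get failed\n";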

package url;

require "sys/socket.ph";

# package globals (remember state between calls):
$host = undef;
$port = undef;
$path = undef;
$request = undef;

$sockaddr = 'S n a4 x8';	# struct sockaddr_in: family, port, address, padding
chop($thishost = `hostname`);
($name, $aliases, $proto) = getprotobyname("tcp");
($name, $aliases, $type, $len, $thisaddr) = gethostbyname($thishost);
$this = pack($sockaddr, &AF_INET, 0, $thisaddr);	# local end, any port

# Try to recognize URLs and ftp file identifiers and convert them into HREFs:
# This routine is evolving.  The patterns are not perfect.
# This is really a parsing problem, and not a job for perl ...
# It is also generally impossible to distinguish ftp site names
# from newsgroup names if the ":<directory>" is missing.
# An arbitrary file name ("runtime.pl") can also be confused with a site name.
sub href {
	# study; # doesn't speed things up ...

	# to avoid special cases for beginning & end of line
	s|^|>>>|; s|$|<<<|;

	# URLS:
	s|(news:[\w.]+)|<A HREF="$&">$&</A>|g;
	s|(http:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(file:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(ftp:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(wais:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(gopher:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(telnet:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	# s|(\w+://[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;

	# catch some newsgroups to avoid confusion with sites:
	s|([^\w\-/.:@>])(alt\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bionet\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bit\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(comp\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(gnu\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(misc\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(news\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(rec\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;

	# FTP locations (with directory):
	s|(anonymous@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	s|(ftp@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# NB: don't confuse an http server with a port number for
	# an FTP location!
	# internet number version:
	s|([^\w\-/.:@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w+\-/.]+)|$1<A HREF="file://$2/$3">$2:$3</A>|g;

	# just the site name (assume two dots):
	s|([^\w\-/.:@>])([a-zA-Z][\w+\-]+\.[\w.+\-]+\.[a-zA-Z]{2,})([^\w\-/.:!])|$1<A HREF="file://$2">$2</A>$3|g;
	# NB: can be confused with newsgroup names!
	# <site>.com has only one dot:
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.com)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;

	# just internet numbers:
	s|([^\w\-/.:@])(\d+\.\d+\.\d+\.\d+)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;
	# unfortunately inet numbers can easily be confused with
	# european telephone numbers ...

	s|^>>>||; s|<<<$||;
}
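
# For example, to mark up the URLs on every line of standard input
# (a sketch; &href works on $_ in place):
#
#	while (<STDIN>) { &url'href; print; }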

# parse a URL, issue the request and return the result
sub get {
	local($url) = @_;
	($host,$port,$path,$request) = &parse($host,$port,$path,$url);
	if ($host) { &hget($host,$port,$request); }
	else { undef; }
}

# convert a URL to ($host,$port,$path,$request);
# given the previous host, port and path, will handle relative URLs
sub parse {
	local($host,$port,$path,$url) = @_;
	if ($url =~ m|^http://(.*)|) {
		$host = $1;
		$port = 80;	# default
		$request = "/";	# default
		($host =~ s|^([^/]+)(/.*)$|$1|) && ($request = $2);
		($host =~ s/:(\d+)$//) && ($port = $1);
		($path = $request) =~ s|[^/]*$||;
	}
	else {
		# relative URL of form "http:<request>"
		if ($url =~ /^http:(.*)/) { $request = $1; }
		# other URL of form "service:<request>"
		elsif ($url =~ /^\w+:/) {
			# $! = "url'parse: $url is not an http URL\n";
			print STDERR "url'parse: $url is not an http URL\n";
			return (undef,undef,undef,undef);
		}
		# relative URL of form "<request>"
		else { $request = $url; }
		$request =~ s|^$|/|;
		$request =~ s|^([^/])|$path$1|; # relative path
		$request =~ s|/\./|/|g;
		while ($request =~ m|/\.\./|) {
			$request =~ s|[^/]*/\.\./||;
		}
		# assume previous host & port:
		unless ($host) {
			# $! = "url'parse: no host for $url\n";
			print STDERR "url'parse: no host for $url\n";
			return (undef,undef,undef,undef);
		}
	}
	($host,$port,$path,$request);
}
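
# For example (a sketch; the URL is illustrative):
#
#	($host,$port,$path,$request) =
#		&url'parse(undef,undef,undef,"http://cui.unige.ch:8001/w3catalog");
#	# yields ("cui.unige.ch", 8001, "/", "/w3catalog")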

# convert relative http URLs to absolute ones:
sub abs {
	local($url,$page) = @_;
	($host,$port,$path,$request) = &parse(undef,undef,undef,$url);
	$host || return $page;	# parse failed; return the page unchanged
	$root = "http://$host:$port";
	@hrefs = split(/<[Aa]/,$page);
	$n = $[;	# array base: element 0 is the text before the first <A
	while (++$n <= $#hrefs) {
		# absolute URLs ok:
		($hrefs[$n] =~ m|href\s*=\s*"http://|i) && next;
		($hrefs[$n] =~ m|href\s*=\s*"\w+:|i) && next;
		# relative URL from root:
		($hrefs[$n] =~ s|href\s*=\s*"/([^"]*)"|HREF="$root/$1"|i) && next;
		($hrefs[$n] =~ s|href\s*=\s*/([^>]*)>|HREF=$root/$1>|i) && next;
		# relative from $path:
		$hrefs[$n] =~ s|href\s*=\s*"([^/"][^"]*)"|HREF="$root$path$1"|i;
		$hrefs[$n] =~ s|href\s*=\s*([^/">][^>]*)>|HREF=$root$path$1>|i;
		# collapse relative paths:
		$hrefs[$n] =~ s|/\./|/|g;
		while ($hrefs[$n] =~ m|/\.\./|) {
			$hrefs[$n] =~ s|[^/]*/\.\./||;
		}
	}
	join("<A",@hrefs);
}
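
# For example, to absolutize the links in a freshly fetched page
# (a sketch; the URL is illustrative):
#
#	$base = "http://cui.unige.ch/";
#	$page = &url'abs($base, &url'get($base));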

# perform an http request and return the result
# Code adapted from Marc van Heyningen
sub hget {
	local($host,$port,$request) = @_;
	($fqdn, $aliases, $type, $len, $thataddr) = gethostbyname($host);
	defined($thataddr) || return undef;	# unknown host
	$that = pack($sockaddr, &AF_INET, $port, $thataddr);
	socket(FS, &AF_INET, &SOCK_STREAM, $proto) || return undef;
	bind(FS, $this) || return undef;
	local($/);	# undefine the input separator so <FS> slurps the whole reply
	unless (eval q!
		$SIG{'ALRM'} = "url'timeout";
		alarm(30);
		connect(FS, $that) || return undef;
		select(FS); $| = 1; select(STDOUT);
		# The following would be needed for HTTP/1.0
		# NB: Need extra \n to terminate MIME header:
		# print FS "GET $request HTTP/1.0\r\n\n";
		print FS "GET $request\r\n";
		$page = <FS>;
		alarm(0);	# cancel the pending timeout
		$SIG{'ALRM'} = "IGNORE";
		!) {
			return undef;
		}
	close(FS);
	# With HTTP/1.0 the reply would also include a MIME header
	$page;
}

sub timeout { die "Timeout\n"; }

1;