#!/local/bin/perl
#
# explore		--- explore the WWW starting at a given URL
#
# Given a starting URL will generate "explore.html" -- a table of titles
# and hosts.  Tries to find as many new hosts as possible, and strictly
# limits the number of pages it will request from any one server.
#
# You can monitor "explore.html", or you can give as an optional
# argument the process id of an xmosaic client on which to
# display the current page.
#
# explore will stop when $maxtotal pages are retrieved (default 100;
# 1000 with -ls), or when the list of pages to check is exhausted,
# or when SIGINT is received (^C) and "q" is entered at the prompt.
#
# NB: to get a recursive listing of all pages at your site, try:
#	explore -ls -d <home-page>
#
# (Not advised for remote hosts!)
#
# Author: Oscar Nierstrasz oscar@cui.unige.ch
#
# Strange bug: if Xmosaic can't connect to an information server,
# it becomes dead to the world.  Happens after about 30 pages ...
# Any further requests are rejected (e.g., to go to the home page),
# as if all servers were inaccessible.  Even "open local" is broken.
# Probably due to Xmosaic getting too many signals while it is
# trying to connect to a server, and getting its internal data
# structures corrupted.  (Should use sockets/streams instead of files.)
# If this happens, send explore a ^C, kill Xmosaic, start a new one,
# and tell explore the PID of the new Xmosaic.
#
# Version history, oldest first.  Only the last line is live code; the
# earlier assignments are kept, commented out, as a change log.
#v = '(v1.0)'; # August 30, 1993
#v = '(v1.1)'; # August 31 -- added triggering of xmosaic
#v = '(v1.2)'; # Sept 1 -- added various options; SIGINT handling
#v = '(v1.3)'; # Oct 21 -- fixed counting of hosts; added -d
#v = '(v1.4)'; # Oct 23 -- fixed sigint to allow <CR> to continue
		#	-- fixed printing of $hostsig
# $v is interpolated into $sig, the signature line of the generated page.
$v = '(v1.5)'; # Nov 15 -- made sleeptime valid for all requests

unshift(@INC,"/user/u1/oscar/Cmd/PerlLib");
require "url.pl";

# Usage text; it is the message of every die() that rejects the command
# line.  The synopsis names all positional arguments the parser accepts:
# the URL is required, the output file and the Xmosaic PID are optional.
$usg = 'Usage: explore [<options>] <http-url> [<output file>] [<xmosaic-pid>]
	<http-url>	-- URL to start with (no default)
	<output file>	-- default is explore.html
	<xmosaic-pid>	-- to monitor current page (no default)
	-m <maxpages>	-- max pages to get per site (default 5)
	-t <maxtotal>	-- max total pages to get (default 100)
	-h <maxhosts>	-- max hosts to explore (default unlimited)
	-s <sleeptime>	-- time to sleep between requests
	-ls		-- list all pages at starting site (use with care!)
	-d		-- also look at directories (URLs ending in "/")
';

# Limits; each one can be overridden from the command line.
$maxpages = 5;		# max pages to retrieve per site (-m)
$maxtotal = 100;	# max pages to retrieve in total (-t)
$maxhosts = undef;	# max hosts to visit (-h); undef means no limit

# Seconds to sleep before each request (-s).  Raised to 15 automatically
# when an Xmosaic PID is supplied and no sleep time was chosen.
$sleeptime = 0;

$hosts = 1;		# hosts visited (always at least 1: the starting host)

$xmpid = undef;		# PID of the Xmosaic to notify; undef means none

# Signature appended to the generated page: version, date and author link.
chop($date = `date +%d.%m.%y`);
$omn = '<A HREF="http://cui_www.unige.ch/OSG/omn.html"><I>OMN</I></A>';
$sig = "<I>This page was generated by explore $v on $date.</I>\n$omn<P>\n";

# A good default starting point:
# $whatsnew = "http://www.ncsa.uiuc.edu/SDG/Software/Mosaic/Docs/whats-new.html";
# $start = "http://www.ncsa.uiuc.edu/SDG/Software/Mosaic/StartingPoints/NetworkStartingPoints.html";

# default log file:
$deflog = "explore.html";

# Fetch and validate the numeric operand of option $flag.
# Consumes the next element of @ARGV and returns it; dies with the usage
# text if it is missing or is not a plain non-negative integer.
sub numarg {
	local($flag) = @_;
	local($val) = shift(@ARGV);
	unless ($val =~ /^\d+$/) { die "Bad arg for $flag\n$usg"; }
	$val;
}

# Parse the command line in @ARGV (which is consumed).
#
# Options:      -m <n>  sets $maxpages     -t <n>  sets $maxtotal
#               -h <n>  sets $maxhosts     -s <n>  sets $sleeptime
#               -ls     one host only, 1000 pages     -d  sets $d
#               -h      (no numeric operand) prints the usage text
# Positionals:  all digits    -> $xmpid (and a 15 second sleep if none set)
#               http:...      -> $start (at most one)
#               anything else -> $log   (at most one)
#
# Every rejected argument dies with a message that ends in $usg.
sub parseargs {
	local($arg,$val);
	while (@ARGV) {
		$arg = shift(@ARGV);

		if ($arg eq "-m") {
			$val = &numarg($arg);
			print STDERR "maxpages = $val (was $maxpages)\n";
			$maxpages = $val;
		}
		elsif ($arg eq "-t") {
			$val = &numarg($arg);
			print STDERR "maxtotal = $val (was $maxtotal)\n";
			$maxtotal = $val;
		}
		elsif ($arg eq "-h") {
			# -h doubles as the help flag: without a numeric operand
			# it is a request for the usage text, not a bad option.
			unless (@ARGV && $ARGV[0] =~ /^\d+$/) { die "$usg"; }
			$val = &numarg($arg);
			print STDERR "maxhosts = $val (was $maxhosts)\n";
			$maxhosts = $val;
		}
		elsif ($arg eq "-s") {
			$val = &numarg($arg);
			print STDERR "sleeptime = $val (was $sleeptime)\n";
			$sleeptime = $val;
		}
		elsif ($arg eq "-ls") {
			# Recursive listing of one site: a single host, and
			# both page limits raised to 1000.
			$maxhosts = 1;
			$maxtotal = $maxpages = 1000;
		}
		elsif ($arg eq "-d") {
			$d = 1;
		}
		elsif ($arg =~ /^-/) {
			die "Invalid flag\n$usg";
		}
		elsif ($arg =~ /^\d+$/) {
			# A bare number is the PID of an Xmosaic to notify.
			$xmpid = $arg;
			if ($sleeptime == 0) { $sleeptime = 15; }
		}
		elsif ($arg =~ /^http:/) {
			if ($start) { die "Please give only one URL\n$usg"; }
			$start = $arg;
		}
		else {
			if ($log) { die "Please give only one output file\n$usg"; }
			$log = $arg;
		}
	}
}

&parseargs();

# Main program: a starting URL is mandatory; the report goes to $log
# (explore.html unless an output file was named on the command line).
die "$usg" unless $start;
$log = $deflog unless $log;
# From here on STDOUT *is* the report; all chatter goes to STDERR.
open(STDOUT,">$log") || die "Can't create $log\n";
$| = 1;		# unbuffered, so the report can be watched while it grows
print STDERR "Writing output to $log\n";
&explore($start);

# Explore the web, starting at $url, and write the report to STDOUT.
#
# The work list @tocheck is used as a double-ended queue: links to hosts
# not met before are put at the front, links to hosts already known are
# appended at the back, so that new hosts are reached as early as possible.
#
# Globals read:    $maxpages, $maxtotal, $maxhosts, $sleeptime, $d,
#                  $sig, $log
# Globals updated: @tocheck, %seen (URLs already met), %seenhost (hosts
#                  already met), %count (pages charged to each host),
#                  $hosts, $entries (pages logged so far)
#
# Progress and errors are reported on STDERR.  The loop ends when @tocheck
# is empty or when $maxtotal pages have been logged; while it runs, SIGINT
# is routed to &sigint.
sub explore {
	local($url) = @_;
	if ($maxhosts == 1) {
		print "<TITLE>Summary of HTML pages at $url</TITLE>\n",
			"<H1>Summary of HTML pages at $url</H1>\n",
			"<OL>\n\n";
	}
	else {
		print "<TITLE>Explore the WWW</TITLE>\n",
			"<H1>Explore the WWW</H1>\n",
			"Starting at $url<P>\n";
		if ($maxhosts) {
			print "Maximum hosts to visit = $maxhosts.\n";
		}
		print "Max pages per site = $maxpages.\n",
			"Max total pages = $maxtotal.<P>\n",
			"<OL>\n\n";
	}

	$SIG{'INT'} = 'sigint';	# Stop when SIGINT is received
	push(@tocheck,$url);	# Initialize
	$seen{$url} = 1;	# Remember that we've seen it
	($thistype,$thishost, $thisport, $thispath, $request) =
		&url'parse(undef,undef,undef,undef,$url);
	$seenhost{$thishost} = 1;
	# The starting page is itself a request to its host, so charge it
	# against the per-host limit; otherwise the starting host would be
	# asked for $maxpages + 1 pages and every other host for $maxpages.
	$count{$thishost}++;

	while (@tocheck) {
		if ($sleeptime > 0) {
			print STDERR "Sleeping $sleeptime seconds\n";
			sleep($sleeptime);
		}
		$url = shift @tocheck;
		# Remember current host, port and path; relative links found
		# on this page are resolved against them below.
		($thistype,$thishost, $thisport, $thispath, $request) =
			&url'parse(undef,undef,undef,undef,$url);
		print STDERR "Requesting $url\n";
		unless ($page = &url'get($url)) {
			print STDERR "Can't get $url\n";
			next;
		}
		# The host is only worth showing when several hosts are listed:
		unless ($maxhosts == 1) { $hostsig = " ($thishost)"; }
		# Extract the title:
		if ($page =~ m|<title>([^<]+)</title>|i) { $title = $1; }
		else { $title = "UNTITLED: $url"; }
		# Skip pages whose title shows they are server error reports:
		if (($title =~ /Server Error:/)
			|| ($title =~ /Bad File Request/)) {
			print STDERR "Invalid page: $url\n";
			next;
		}
		$title =~ s/\n/ /g;
		# This page is ok, so log it:
		print "<LI><A HREF=\"$url\"><B>$title</B></A>$hostsig\n";
		&xmget($url);	# display the current URL in Xmosaic, if any
		if (++$entries >= $maxtotal) { last; }

		foreach $href (&hrefs($page)) {
			# Parse the URL, if possible:
			($type,$host,$port,$path,$request) =
				&url'parse($thistype,$thishost,
					$thisport,$thispath,$href);
			next unless ($type eq "http");
			# Skip this host if invalid:
			next unless ($host);
			# Convert from relative to absolute URL:
			$href = "http://$host:$port$request";
			# Skip if seen already:
			next if ($seen{$href});
			$seen{$href} = 1;
			# Only look at guaranteed .html files (and, with -d,
			# at directories):
			unless ($request =~ /\.html$/) {
				next unless ($d && ($request =~ /\/$/));
			}
			# Don't ask too many pages from a given host:
			next unless (++$count{$host} <= $maxpages);
			if ($seenhost{$host}) {
				# Seen this host, so add to end of queue:
				print STDERR "Pushing $href\n";
				push(@tocheck,$href);
			}
			else {
				if ($maxhosts) {
					next if (++$hosts > $maxhosts);
				}
				# New host, so add to front:
				print STDERR "Queueing $href\n";
				unshift(@tocheck,$href);
				$seenhost{$host} = 1;
			}
		}
	}

	print "\n</OL>\nSearch completed.<P>\n$sig";
	close(STDOUT);
	print STDERR "Result of exploration in $log\n";
}

# SIGINT handler: pause and ask the operator, on STDIN, what to do next.
#
#   <CR>    -- carry on exactly as before
#   q       -- finish the report, close it and exit
#   digits  -- PID of an Xmosaic to notify of each retrieved page from now
#              on (a 15 second sleep is switched on if none was set)
#
# Any other reply is reported as invalid and changes nothing.  The reply
# is read into a private variable, so that neither a bare <CR> nor a typo
# discards the PID of an Xmosaic that is already being driven.
sub sigint {
	# Read one newline-terminated line whatever $/ was set to when the
	# signal arrived; local() restores the previous value on return.
	local($/) = "\n";
	local($answer);
	print STDERR "Enter <CR> to continue, \"q\" to quit, or ",
		"Xmosaic PID to monitor retrieved pages\n";
	$answer = <STDIN>;
	# Strip the line terminator only; chop() would eat a real character
	# when the input ends without a newline.
	$answer =~ s/\r?\n$//;
	return if ($answer eq "");	# also the case at end of file
	if ($answer ne "q") {
		if ($answer =~ /^\d+$/) {
			$xmpid = $answer;
			if ($sleeptime == 0) { $sleeptime = 15; }
		}
		else {
			print STDERR "Invalid PID -- ignored\n";
		}
		return;
	}
	print STDERR "Quitting\n";
	# should dump @tocheck in a file?
	print "\n</OL>\nInterrupted!<P>\n$sig";
	close(STDOUT);
	print STDERR "Result of exploration in $log\n";
	exit(0);
}

# Ask the Xmosaic whose PID is in $xmpid (if any) to display $url.
#
# The request is the two lines "goto" and the URL, written to
# /tmp/xmosaic.<pid>; Xmosaic is then told to read it with SIGUSR1.
# If the file cannot be written or the signal cannot be sent, the PID
# is dropped ($xmpid becomes undef) and later calls do nothing.
#
# Note that the page is thereby fetched twice (url'get has retrieved it
# already).  Saving the retrieved page in a file and having Xmosaic open
# that instead is no alternative: none of the relative HREFs or IMG
# files would work.
sub xmget {
	local($url) = @_;
	return unless $xmpid;		# nobody to notify

	$tmp = "/tmp/xmosaic.$xmpid";
	if (open(XM,">$tmp")) {
		print XM "goto\n$url\n";
		close(XM);
		return if kill('USR1',$xmpid);
		print STDERR "xmget failed -- dropping PID $xmpid\n";
	}
	else {
		print STDERR "Can't open $tmp -- dropping PID $xmpid\n";
	}
	# Either step failed: stop trying to drive this Xmosaic.
	$xmpid = undef;
	return;
}

# Return a list of all the hrefs in a page, in document order.
#
# An href is the value of a double-quoted HREF attribute inside an anchor
# tag, <A ... HREF="target" ...>; matching is case-insensitive and the tag
# may span several lines.  Targets are returned exactly as written (they
# may still be relative) and duplicates are not removed.
#
# The targets are picked out of the tags directly.  Stripping everything
# that is not an anchor and returning what is left would also return any
# text that follows the last tag, or that stands in a page without tags,
# as if it were a link.
sub hrefs {
	local($page) = @_;
	local(@found);
	# In scalar context m//g resumes after the previous match, so each
	# turn of the loop delivers the next anchor.
	while ($page =~ /<a\s[^>]*href\s*=\s*"([^"]+)"[^>]*>/gi) {
		push(@found,$1);
	}
	@found;
}

__END__

