#!/usr/local/bin/perl

# Robert Morris' CS245 web crawler.

# TODO / known shortcomings:
#  - robot exclusion
#  - fix weblinks to stop at a newline
#  - suppress md5-identical duplicates
#  - ignore pages with suspect contents (e.g. .ps.Z)

$min = 60; # minimum interval (seconds) between hits to the same server.

if($#ARGV < 0){
    print STDERR "Usage: crawler prefix [start-url] [domain]\n";
    exit(1);
}

$prefix = $ARGV[0];   # basename for the crawl's output files
$starturl = $ARGV[1]; # optional seed URL
$domain = $ARGV[2];   # optional: only follow links within this domain
$| = 1;               # unbuffered STDOUT so progress output appears immediately

# $dbfile maps url -> "" (pending), "failed", or "offset length" into $bigfile.
# $bigfile holds the concatenated page bodies.
$dbfile = "$prefix.db";
$bigfile = "$prefix.big";

# fix: both opens were unchecked; a failure here silently lost the crawl.
dbmopen(%urls, $dbfile, 0666) || die "crawler: can't open $dbfile: $!\n";
open(BIG, ">>$bigfile") || die "crawler: can't append to $bigfile: $!\n";

# Request a clean shutdown on SIGINT/SIGHUP; the loops below poll the flag
# so the dbm file is closed properly instead of dying mid-update.
$pleasequit = 0;
sub handler {
    $pleasequit = 1;
}
$SIG{"INT"} = "handler";
$SIG{"HUP"} = "handler";

if($starturl ne ""){
    gotlink($starturl);
}

# Sweep the url table until a full pass discovers nothing new.  $waiting
# counts urls skipped only because their server was contacted too recently;
# if a pass found nothing new but some urls are waiting, sleep and retry.
$changed = 1;
while($changed != 0 && $pleasequit == 0){
    $changed = 0;
    $waiting = 0;
    foreach $i (keys(%urls)){
	last if $pleasequit;
	one($i);
    }
    if($waiting && $changed == 0 && $pleasequit == 0){
	print "sleep($min)";
	sleep($min);
	print "\n";
	$changed = 1;
    }
}

dbmclose(%urls);
close(BIG);

# one(url): fetch a single pending url, store its body in BIG, record the
# outcome in %urls, and queue any links found on the page.
sub one {
    local($url) = @_;

    # Only fetch urls still marked pending ("" in the dbm).
    $status = $urls{$url};
    if($status ne ""){
	return;
    }

    # Per-server rate limit: if this url's server was contacted less than
    # $min seconds ago, skip it and let the main loop's sleep pass retry.
    $srvr = "";
    ($srvr) = ($url =~ m;http://([^/]*););
    $now = time();
    if($now - $times{$srvr} < $min){
	$waiting += 1;
	return;
    }
    $times{$srvr} = $now;

    print "$url ";

    # Fetch status line + headers + body into a temp file.  $url cannot
    # contain a single quote (gotlink rejects those), so quoting is safe.
    $tmpfile = "/tmp/z$$";
    unlink($tmpfile);
    $cmd = "webget -r -h -t 20 '$url' > $tmpfile";
    $status = system($cmd);
    if($status != 0){
	$status /= 256;		# system() returns exit status << 8
	unlink($tmpfile);
	if($status >= 2 && $status <= 5){
	    # temporary failure? back off this server for half an hour.
	    $times{$srvr} += 1800;
	    print "temporary failure\n";
	} else {
	    $urls{$url} = "failed";
	    print "failed $status\n";
	}
	return;
    }
    # fix: this open was unchecked; a failure silently parsed undef.
    unless(open(P, $tmpfile)){
	$urls{$url} = "failed";
	print "can't read $tmpfile: $!\n";
	unlink($tmpfile);
	return;
    }

    # Total file length; body length = $plen minus header bytes consumed.
    seek(P, 0, 2);
    $plen = tell(P);
    seek(P, 0, 0);

    # First line is the HTTP status line: "HTTP/1.x NNN reason".
    $first = <P>;
    $first =~ s/[\r\n]//g;
    ($junk1, $status, $junk2) = split(/ /, $first);

    # consume headers, keeping content-type and location.
    $type = "";
    $location = "";
    while(<P>){
	$l = $_;
	$l =~ s/[\r\n]//g;
	if($l eq ""){
	    last;		# blank line ends the header section
	}
	$l =~ tr/A-Z/a-z/;
	($h, $v) = ($l =~ /^([^:]*): *(.*)/);
	if($h eq "content-type"){
	    $type = $v;
	} elsif($h eq "location"){
	    $location = $v;
	}
    }

    if($status != 200){
	$urls{$url} = "failed";
	print "bad status $status ";
    } elsif($status == 200 && $type =~ /^text/){
	# Append the body to BIG and record "offset length" in the dbm.
	$xlen = $plen - tell(P);	# expected body size from file size
	print BIG "\nURL: $url\n$xlen\n";
	$start = tell(BIG);
	$len = 0;
	while(<P>){
	    $l = $_;
	    $len += length($l);
	    print BIG "$l";
	}
	$urls{$url} = "$start $len";
	# Sanity check: bytes written and bytes read must match $xlen.
	if(tell(BIG) - $start != $xlen || $xlen != $len){
	    print "\nOOPS\n";
	}
    } else {
	$urls{$url} = "failed";
	print "odd type $type ";
    }

    # Queue every link weblinks extracts from html pages.
    if($type eq "text/html"){
	open(L, "weblinks '$url' < $tmpfile |");
	while(<L>){
	    $link = $_;
	    gotlink($link);
	}
	close(L);
    }

    # Follow redirects via the Location header.
    if(($status == 302 || $status == 301) && $location ne ""){
	gotlink($location);
    }

    print "\n";
    close(P);
    unlink($tmpfile);	# fix: don't leave the last temp file behind
}

# gotlink(link): canonicalize a candidate url and, if it passes the
# filters, add it to %urls as pending ("") and bump $changed.
sub gotlink {
    local($link) = @_;

    $link =~ s/[\n\r]//g;	# weblinks output is newline-terminated
    $link =~ s/#.*//;		# strip the fragment

    # Canonicalize "." and ".." path segments.  Collapse "./" first so a
    # "./" can't be mistaken for a removable segment by the ".." pass.
    # Loop: a single substitution only fixes one segment (the old code
    # left "a/b/../../c" half-resolved).  The lookahead stops "../" (or
    # "./") itself being taken as the segment that a following "../"
    # removes.
    1 while $link =~ s{^(http://.*?/)\./}{$1};
    1 while $link =~ s{^(http://.*?/)(?!\.\.?/)[^/]+/\.\./}{$1};

    return if $urls{$link} ne "";	# already fetched or failed
    return if $link =~ /'/;	# a quote would break webget's shell command
    return if $link !~ m|^http://|;
    return if $link =~ m/\.Z$/;		# skip compressed files
    return if $link =~ m/\.gz$/;
    # Optional domain restriction.  \Q...\E so the dots in $domain are
    # matched literally instead of as regex wildcards.
    return if $domain ne "" && $link !~ m|\Q$domain\E/| && $link !~ m|\Q$domain\E$|;
    $urls{$link} = "";	# mark pending; one() will fetch it
    $changed += 1;
}
