#!/usr/local/bin/perl

# Robert Morris' CS245 web crawler.

# TODO / known shortcomings:
#  - robot exclusion
#  - fix weblinks to stop at a newline
#  - suppress md5-identical duplicates
#  - ignore pages with suspect contents (e.g. .ps.Z)

$min = 60; # minimum interval (seconds) between hits to the same server.

if($#ARGV < 0){
    print STDERR "Usage: crawler prefix [start-url] [domain]\n";
    exit(1);
}

$prefix = $ARGV[0];   # basename for the crawl's output files
$starturl = $ARGV[1]; # optional seed URL
$domain = $ARGV[2];   # optional: only follow links within this domain
$| = 1;               # unbuffered STDOUT so progress output appears immediately

# $dbfile maps url -> "" (pending), "failed", or "offset length" into $bigfile.
# $bigfile holds the concatenated page bodies.
$dbfile = "$prefix.db";
$bigfile = "$prefix.big";

# fix: both opens were unchecked; a failure here silently lost the crawl.
dbmopen(%urls, $dbfile, 0666) || die "crawler: can't open $dbfile: $!\n";
open(BIG, ">>$bigfile") || die "crawler: can't append to $bigfile: $!\n";

# Request a clean shutdown on SIGINT/SIGHUP; the loops below poll the flag
# so the dbm file is closed properly instead of dying mid-update.
$pleasequit = 0;
sub handler {
    $pleasequit = 1;
}
$SIG{"INT"} = "handler";
$SIG{"HUP"} = "handler";

if($starturl ne ""){
    gotlink($starturl);
}

# Sweep the url table until a full pass discovers nothing new.  $waiting
# counts urls skipped only because their server was contacted too recently;
# if a pass found nothing new but some urls are waiting, sleep and retry.
$changed = 1;
while($changed != 0 && $pleasequit == 0){
    $changed = 0;
    $waiting = 0;
    foreach $i (keys(%urls)){
	last if $pleasequit;
	one($i);
    }
    if($waiting && $changed == 0 && $pleasequit == 0){
	print "sleep($min)";
	sleep($min);
	print "\n";
	$changed = 1;
    }
}

dbmclose(%urls);
close(BIG);

# one(url): fetch a single pending url, store its body in BIG, record the
# outcome in %urls, and queue any links found on the page.
sub one {
    local($url) = @_;

    # Only fetch urls still marked pending ("" in the dbm).
    $status = $urls{$url};
    if($status ne ""){
	return;
    }

    # Per-server rate limit: if this url's server was contacted less than
    # $min seconds ago, skip it and let the main loop's sleep pass retry.
    $srvr = "";
    ($srvr) = ($url =~ m;http://([^/]*););
    $now = time();
    if($now - $times{$srvr} < $min){
	$waiting += 1;
	return;
    }
    $times{$srvr} = $now;

    print "$url ";

    # Fetch status line + headers + body into a temp file.  $url cannot
    # contain a single quote (gotlink rejects those), so quoting is safe.
    $tmpfile = "/tmp/z$$";
    unlink($tmpfile);
    $cmd = "webget -r -h -t 20 '$url' > $tmpfile";
    $status = system($cmd);
    if($status != 0){
	$status /= 256;		# system() returns exit status << 8
	unlink($tmpfile);
	if($status >= 2 && $status <= 5){
	    # temporary failure? back off this server for half an hour.
	    $times{$srvr} += 1800;
	    print "temporary failure\n";
	} else {
	    $urls{$url} = "failed";
	    print "failed $status\n";
	}
	return;
    }
    # fix: this open was unchecked; a failure silently parsed undef.
    unless(open(P, $tmpfile)){
	$urls{$url} = "failed";
	print "can't read $tmpfile: $!\n";
	unlink($tmpfile);
	return;
    }

    # Total file length; body length = $plen minus header bytes consumed.
    seek(P, 0, 2);
    $plen = tell(P);
    seek(P, 0, 0);

    # First line is the HTTP status line: "HTTP/1.x NNN reason".
    $first = <P>;
    $first =~ s/[\r\n]//g;
    ($junk1, $status, $junk2) = split(/ /, $first);

    # consume headers, keeping content-type and location.
    $type = "";
    $location = "";
    while(<P>){
	$l = $_;
	$l =~ s/[\r\n]//g;
	if($l eq ""){
	    last;		# blank line ends the header section
	}
	$l =~ tr/A-Z/a-z/;
	($h, $v) = ($l =~ /^([^:]*): *(.*)/);
	if($h eq "content-type"){
	    $type = $v;
	} elsif($h eq "location"){
	    $location = $v;
	}
    }

    if($status != 200){
	$urls{$url} = "failed";
	print "bad status $status ";
    } elsif($status == 200 && $type =~ /^text/){
	# Append the body to BIG and record "offset length" in the dbm.
	$xlen = $plen - tell(P);	# expected body size from file size
	print BIG "\nURL: $url\n$xlen\n";
	$start = tell(BIG);
	$len = 0;
	while(<P>){
	    $l = $_;
	    $len += length($l);
	    print BIG "$l";
	}
	$urls{$url} = "$start $len";
	# Sanity check: bytes written and bytes read must match $xlen.
	if(tell(BIG) - $start != $xlen || $xlen != $len){
	    print "\nOOPS\n";
	}
    } else {
	$urls{$url} = "failed";
	print "odd type $type ";
    }

    # Queue every link weblinks extracts from html pages.
    if($type eq "text/html"){
	open(L, "weblinks '$url' < $tmpfile |");
	while(<L>){
	    $link = $_;
	    gotlink($link);
	}
	close(L);
    }

    # Follow redirects via the Location header.
    if(($status == 302 || $status == 301) && $location ne ""){
	gotlink($location);
    }

    print "\n";
    close(P);
    unlink($tmpfile);	# fix: don't leave the last temp file behind
}

# gotlink(link): canonicalize a candidate url and, if it passes the
# filters, add it to %urls as pending ("") and bump $changed.
sub gotlink {
    local($link) = @_;

    $link =~ s/[\n\r]//g;	# weblinks output is newline-terminated
    $link =~ s/#.*//;		# strip the fragment

    # Canonicalize "." and ".." path segments.  Collapse "./" first so a
    # "./" can't be mistaken for a removable segment by the ".." pass.
    # Loop: a single substitution only fixes one segment (the old code
    # left "a/b/../../c" half-resolved).  The lookahead stops "../" (or
    # "./") itself being taken as the segment that a following "../"
    # removes.
    1 while $link =~ s{^(http://.*?/)\./}{$1};
    1 while $link =~ s{^(http://.*?/)(?!\.\.?/)[^/]+/\.\./}{$1};

    return if $urls{$link} ne "";	# already fetched or failed
    return if $link =~ /'/;	# a quote would break webget's shell command
    return if $link !~ m|^http://|;
    return if $link =~ m/\.Z$/;		# skip compressed files
    return if $link =~ m/\.gz$/;
    # Optional domain restriction.  \Q...\E so the dots in $domain are
    # matched literally instead of as regex wildcards.
    return if $domain ne "" && $link !~ m|\Q$domain\E/| && $link !~ m|\Q$domain\E$|;
    $urls{$link} = "";	# mark pending; one() will fetch it
    $changed += 1;
}
