#TDM 2005
#
# Two-level link spider.
#
# Reads seed URLs (one per line) from searchdata.txt, fetches each seed page
# (Level I), extracts every <a> link on it and prints the absolute form of
# each link (Level II).  Deliberate sleeps pace the crawl so it can be run
# through a slow proxy such as TOR and watched on the console.
#
use strict;
use warnings;

#----------------------------- Set up Agent ------------------------------------
use LWP::UserAgent;
use HTTP::Request;
use HTML::LinkExtor;
use URI::URL;

# Pacing, in seconds.
use constant {
    PAGE_DELAY  => 1,     # before each Level I request (slow down for TOR)
    FETCH_DELAY => 2,     # after a successful fetch    (slow down for TOR)
    LINK_DELAY  => 10,    # per Level II link, so the operator can watch the run
};

my $file = "searchdata.txt";    # DOT.COMS from searchdata.txt file

my @harvested_urls = ();        # seed URLs read from $file        (Level I)
my @links          = ();        # links collected by callback()    (Level II)

my $parser = HTML::LinkExtor->new(\&callback);

my $ua = LWP::UserAgent->new;
$ua->timeout(5);                # not sure of this number. Ex. code had 5
$ua->agent('Mozilla/4.75');
# $ua->proxy(http => 'http://127.0.0.1:8118'); # TOR TOR TOR
# NOTE(review): the From header is normally an e-mail address; the value is
# kept exactly as the original author set it -- confirm what is intended.
$ua->from('www.xxxxx.com');

#----------------Load URL's array with links --------------------
print "\n\n******** Loading URL's *******\n\n";

open(my $seed_fh, '<', $file) or do {
    my $reason = $!;            # capture before print can clobber $!
    print "\n\n\nSHIT !!! Cannot open the file :( \n\n\n";
    warn "Cannot open '$file': $reason\n";
    exit(-1);
};

# Read exactly one line per iteration, so no seed is skipped.
while (my $line = <$seed_fh>) {
    chomp $line;
    $line =~ s/\r\z//;          # tolerate CRLF seed files
    next if $line eq '';        # a blank line is not a URL
    push @harvested_urls, $line;
}
close($seed_fh);

my $seed_count = scalar @harvested_urls;
print "Seed URL's = $seed_count\n\n";

########################### Begin Spider ###############################
print "\n\n Begin Spider run .....\n\n";

for my $url (@harvested_urls) {    # Level I: one pass per seed URL
    print "-- Home Page -- Level I -- $url\n\n";
    sleep(PAGE_DELAY);

    # Start every page with a genuinely empty list so that links from a
    # previous page (or from a failed fetch) can never leak into this one.
    @links = ();

    my $request  = HTTP::Request->new(GET => $url);
    my $response = $ua->request($request);

    if (!$response->is_success) {
        warn "Fetch failed for $url: " . $response->status_line . "\n";
        next;
    }

    sleep(FETCH_DELAY);

    # ** LINK STRIPPING **
    my $base = $response->base;
    $parser->parse($response->content);
    $parser->eof;               # flush and reset the parser for the next page
    @links = map { url($_, $base)->abs } @links;
    # ** End LINK STRIPPING **

    #
    # Here is where you set up for a run on home page
    #

    #****************** LVL 2 - BEGIN ******************************************
    for my $url2 (@links) {
        print "$url2\n";
        print "Level 2 STRIPPED URL\n\n";
        sleep(LINK_DELAY);

        # Enter into level 3
        #
        # ***
        # Exiting Level 3
        #
        # Here is where you set up for a run on Level 2
        #
    }
    #******************** LVL 2 - END ******************************************
}
######################### END Spider ###########################################

#----------------------Link Stripping Sub-Routine-------------------------------
# HTML::LinkExtor callback.  Called once per link-bearing tag with the tag
# name followed by attribute-name => link-value pairs.  Only anchor tags are
# harvested; their link values are appended to the file-level @links array.
# To harvest other tags (e.g. img, link), adjust the tag test below.
sub callback {
    my ($tag, %attr) = @_;
    return if $tag ne 'a';
    push(@links, values %attr);
    return;
}
#-------------------------------------------------------------------------------
# TDM 2005
# Updated Feb. 01, 2008 -- Triad
# Update Apr.29.2010 - Triad
# Updated June 19 2010 - Triad
################################################################################