The following code was written to crawl a fairly large Fortune 500 company website in order to create a site map and document the content. This isn’t an example of great, well-refactored code. This is an example of a brute force, high-yield technique that can be applied by a modestly talented developer in a few hours.
We acquired a lot of information with this – much more than is presented by the document markup itself*. If you’re interested in Mechanize, I recommend this cheat sheet: WWW-Mechanize-1.60.
#!/usr/bin/perl
use strict;
use warnings;
use WWW::Mechanize;
use HTTP::Cookies;

# spider.pl -- brute-force site crawler built on WWW::Mechanize.
#
# Two independent actions are available (see "MAJOR ACTIONS" below):
#   runURLSearch()    - starting from a seed URL, follow every link whose URL
#                       matches the site pattern, and write each distinct URL
#                       that was found to spidered_urls.cvs.
#   runContentParse() - fetch every URL listed in @urls and write one
#                       tab-separated row per page (URL, page title, one
#                       column per Omniture variable) to spidered_pages.cvs.
#
# All subs take no arguments and communicate through the file-scoped
# variables declared below; they must be declared before the action calls
# so that they exist (and are initialised) when the subs run.
#
# NOTE(review): the ".cvs" extension on both output files is presumably a
# typo for ".csv"; the paths are left untouched so existing consumers of the
# files keep working.

# Once this many URLs are waiting in the queue, the search stops queuing more.
use constant MAX_QUEUED_URLS => 100;

#MECHANIZE
my $mech = WWW::Mechanize->new();
$mech->agent_alias('Windows IE 6');
$mech->cookie_jar( HTTP::Cookies->new() );

#URLS and SORT
my @urls = ("http://www.chrismaynard.net");    # Fill with a list of URLs
@urls = sort @urls;                            # Turn on/off sorting of @urls

# Shared crawler state.
my @omniture_vars;      # names of the JavaScript variables to extract per page
my @searchable_urls;    # queue of URLs still to be fetched by the URL search
my %storedurls;         # every URL seen so far => " " . URL (as written to file)
my @urllinks;           # WWW::Mechanize::Link objects found on the current page
my $end_search = 0;     # set to 1 once the queue limit has been reached
my $url;                # URL currently being fetched; read by getURL()
my $out_fh;             # output handle used while runContentParse() is active

setOmnitureVars();

#------- MAJOR ACTIONS -------#
runURLSearch();
#runContentParse();

# RUN THE URL SEARCH
# Seeds the queue, crawls until the queue is empty, then dumps the URLs found.
sub runURLSearch {
    $end_search      = 0;
    @searchable_urls = ("http://chrismaynard.net");
    doNextURL();           # uses additional subs
    storeLinksToFile();
    return;
}

# RUN THE CONTENT PARSE
# Writes the header row, then one row per entry of @urls, and exits the
# program when done. Dies if the output file cannot be opened or closed.
sub runContentParse {
    my $file_out = 'C:/perl_searches/spidered_pages.cvs';
    open( $out_fh, '>', $file_out )
        or die "Cannot open the specified file. ($file_out: $!)";

    # Every column, including the last one, is followed by a tab.
    print {$out_fh} join( "\t", 'URL', 'PAGE TITLE', @omniture_vars ), "\t\n";

    processURLs();

    # Buffered write errors only surface at close time.
    close $out_fh or die "Cannot close $file_out: $!";
    exit;
}

#------- SUBROUTINES -------#

# URL scraping below

# Drains the queue: fetches the most recently queued URL and collects its
# links, until nothing is left. A loop is used instead of self-recursion so
# that a long crawl does not build one stack frame per page.
sub doNextURL {
    while (@searchable_urls) {
        print "SEARCHABLE URLS REMAINING: " . scalar(@searchable_urls) . "\n";
        $url = pop @searchable_urls;
        findAllLinks();
    }
    return;
}

# Fetches $url and records every link on the page whose URL starts with the
# site prefix. Prints a notice and does nothing else if the fetch failed.
sub findAllLinks {
    if ( !defined getURL() ) {
        print "NO PAGE FOUND\n";
        return;
    }

    # find_all_links returns a flat list of link objects; it must be kept as
    # a list (assigning it to a hash would pair the links up as key/value
    # and lose every other one).
    @urllinks = $mech->find_all_links( url_regex => qr{^http://chrismaynard} );
    storeLinksToHash();
    return;
}

# Registers each link in @urllinks in %storedurls. A URL seen for the first
# time is also queued for crawling, unless the queue already holds
# MAX_QUEUED_URLS entries, in which case the search is flagged as ended and
# no further URLs are queued (they are still recorded).
sub storeLinksToHash {
    for my $link (@urllinks) {
        my $link_url = $link->url;

        if ( exists $storedurls{$link_url} ) {
            print "Existing Searchable URL: " . $storedurls{$link_url} . "\n";
            next;
        }

        # The stored value carries a leading space; that is the exact form
        # written to the output file and shown in the progress messages.
        $storedurls{$link_url} = " " . $link_url;
        next if $end_search;

        if ( scalar(@searchable_urls) < MAX_QUEUED_URLS ) {
            print "Pushing Searchable URL: " . $storedurls{$link_url} . "\n";

            # Queue the clean URL rather than the space-prefixed display form.
            push @searchable_urls, $link_url;
        }
        else {
            print "\n\n\n------------------ SEARCH ENDED \n----------------------\n\n\n";
            $end_search = 1;
        }
    }
    return;
}

# Writes every recorded URL, one per line, to spidered_urls.cvs.
# Lines are emitted in sorted key order so repeated runs produce the same
# file. Dies if the file cannot be opened or closed.
sub storeLinksToFile {
    my $file_out = 'C:/perl_searches/spidered_urls.cvs';
    open( my $fh, '>', $file_out )
        or die "Cannot open the specified file. ($file_out: $!)";

    for my $stored_key ( sort keys %storedurls ) {
        print {$fh} $storedurls{$stored_key} . "\n";
    }

    close $fh or die "Cannot close $file_out: $!";
    return;
}

# Scraping page name functions below

# Processes every entry of @urls in turn. Only meaningful while
# runContentParse() has the output file open.
sub processURLs {
    for my $next_url (@urls) {

        # Copy into the shared $url explicitly: a foreach loop variable is
        # only visible inside the loop body, not in the subs called from it.
        $url = $next_url;
        print $url . "\n";

        #doGuessMultipleLanguages();
        doKnowMultipleLanguages();
    }
    return;
}

# Writes one output row for the current $url.
sub doKnowMultipleLanguages {
    print {$out_fh} $url . "\t";
    getPageName();
    return;
}

# Writes four output rows for the current $url, calling updateURL() after
# each one so that it can move $url on to the next variant.
sub doGuessMultipleLanguages {
    for ( 1 .. 4 ) {
        print {$out_fh} $url . "\t";
        getPageName();
        updateURL();
    }
    return;
}

# Fetches $url with the shared Mechanize object.
# Returns the value of $mech->get(), or undef if the request threw an
# exception (the exception text is left in $@).
sub getURL {
    my $response = eval { $mech->get($url) };
    return $response;
}

# Fetches $url and finishes the current output row: the page title, a tab,
# then the value of each Omniture variable followed by a tab, then a newline.
# A field is left empty when its pattern is not found. Writes
# "NO PAGE FOUND" instead if the fetch failed.
sub getPageName {
    if ( !defined getURL() ) {
        print {$out_fh} "NO PAGE FOUND\n";
        return;
    }

    my $content = $mech->content();

    #PAGE TITLE
    # Non-greedy so that the capture stops at the first closing tag.
    if ( $content =~ m{<title>(.*?)</title>}i ) {
        print {$out_fh} $1;
    }
    print {$out_fh} "\t";

    #OMNITURE VARS
    for my $omniture_var (@omniture_vars) {

        # \Q...\E keeps the variable name literal; the non-greedy capture
        # stops at the first closing '";' on the line.
        if ( $content =~ m{var \Q$omniture_var\E="(.*?)";}i ) {
            print {$out_fh} $1;
        }
        print {$out_fh} "\t";
    }

    # NEW LINE
    print {$out_fh} "\n";
    return;
}

# Placeholder: do some URL manipulation on $url.
sub updateURL {
    return;
}

# Declares which Omniture variables are extracted from each page.
sub setOmnitureVars {
    @omniture_vars = ("s_prop1");
    return;
}
spider.pl
*Some of the content-parsing functionality has been removed. The host has been changed. An init function which controlled execution order has been deleted. However, the essential crawl functions are still present.

