#!/usr/bin/perl -s
#
# searchstat - record at which position a site shows up on web search engines.
#
# For every engine in @engines and every query in @searches the script pages
# through the engine's results (via getpage(), which this file does not
# define and which is expected to come from the required "<engine>.pl") until
# a hit matches $matchregex.  The 1-based hit position (0 when nothing
# matched) is stored in %engines under the start time of this run.  Earlier
# results are loaded from results.xml at start-up and the whole table is
# written back by save().
#
# Usage: searchstat [-engine=NAME]
#   The -s switch on the #! line turns -engine=NAME into $main::engine.

package SearchStat;

use strict;
use warnings;

use XML::Parser;
require LWP::UserAgent;
use HTTP::Request::Common;

my $BASE_DIR     = "/root/searchstat";
my $RESULTS_FILE = "$BASE_DIR/results.xml";

# Shared state is kept in package globals (not lexicals) on purpose: the
# engine scripts loaded with require below are called through getpage() and
# signal readiness through $valid, neither of which is defined in this file.
# NOTE(review): presumably those scripts live in package SearchStat and may
# read any of these names -- confirm before turning them into lexicals.
our ($E_RANGE, $E_NOANSWER, $E_NOTFOUND);
our ($ua, $res, $fetchcount, $goodfetchcount);
our (@engines, %engines, @searches, $matchregex, $searchtime);
our ($engine, $search, $searchstr, @hits, $hitnum, $page, $valid);

# State of the XML handlers while results.xml is being parsed.
our ($enginename, $querystr, $resultdate, $gatherstring);
$resultdate   = 0;
$gatherstring = "";

chdir $BASE_DIR or warn "Cannot chdir to $BASE_DIR: $!\n";

# "." is no longer part of @INC on perl >= 5.26, so a bare
# require "<engine>.pl" would not find the engine scripts there any more.
push @INC, $BASE_DIR;

# Load the result history.  With Style => 'Subs' the parser calls the sub
# named after each element (ENGINE, SEARCH, RESULT) on its start tag and the
# same name with a trailing underscore on its end tag.
my $parser = XML::Parser->new(
    Style         => 'Subs',
    ErrorContext  => 2,
    ParseParamEnt => 1,
    Handlers      => { Char => \&handle_char },
);
# A missing file simply means there is no history yet; a malformed file still
# aborts the run here, before save() could overwrite it.
$parser->parsefile($RESULTS_FILE) if -e $RESULTS_FILE;

$ua = LWP::UserAgent->new;
$ua->agent('Mozilla/4.7 [en] (WinNT; I)');
$fetchcount     = 0;
$goodfetchcount = 0;

# Package Error Values
$E_RANGE    = -10001;
$E_NOANSWER = -10002;
$E_NOTFOUND = -10003;

@engines = (
    "altavista",
    "google",
    "yahoo_sites",
    "yahoo_pages",
    "directhit",
    "excite",
    "hotbot",
    "infoseek",
    "lycos",
    "msn",
    "snap",
    # "snap_pages",
    # "snap_directory",
    "webcrawler",
);

if (defined $main::engine) {
    my $idx = in($main::engine, @engines);
    die "Invalid -engine= value!" if $idx < 0;
    # in() compares case-insensitively; use the spelling from the list so
    # that the require below asks for the right file name.
    @engines = ($engines[$idx]);
}

# Queries to run.  The list is empty here, so nothing is searched until
# entries are added.
@searches = (
);

# Pattern a hit must match; its first capture group is reported as the
# name that was found.
$matchregex = "[/.](###########)";

$| = 1;    # unbuffered output so the progress dots appear immediately
$searchtime = time();

foreach $engine (@engines) {
    require "$engine.pl";
    next unless $valid;
    print "\nSearching $engine\n";

    foreach $search (@searches) {
        print "Searching for '$search'";
        $searchstr = urlize($search);
        @hits      = ();
        $hitnum    = 1;
        my $found   = 0;
        my $matched = '';

      PAGE:
        for ($page = 1; ; $page++) {
            print ".";
            # As used here, getpage() yields either a reference to an array
            # of hit strings or one of the $E_* codes.
            my ($pagehits) = getpage($page, $searchstr);
            last PAGE if _is_code($pagehits, $E_RANGE);
            # One retry when the engine did not answer.
            $pagehits = getpage($page, $searchstr)
                if _is_code($pagehits, $E_NOANSWER);
            last PAGE if error($pagehits);
            # Anything else that is not a reference carries no hits; stop
            # instead of requesting pages forever.
            last PAGE unless ref $pagehits;

            push @hits, @$pagehits;
            # $hitnum persists across pages, so only new hits are examined.
            for (; $hitnum <= @hits; $hitnum++) {
                if ($hits[$hitnum - 1] =~ m!$matchregex!is) {
                    $found   = 1;
                    $matched = defined $1 ? $1 : '';
                    last PAGE;
                }
            }
        }
        print "\n";

        if ($found) {
            print "Found $matched at #$hitnum on page $page!\n";
            $engines{$engine}->{$search}->{$searchtime} = $hitnum;
        }
        else {
            # $page is the page on which the loop gave up.
            $page--;
            print "Not found in $page pages, ", scalar(@hits), " hits.\n";
            $engines{$engine}->{$search}->{$searchtime} = 0;
        }
    }
}

save();
printdata();
print "$fetchcount pages fetched, $goodfetchcount successfully.\n";

# urlize($string)
# Returns $string with the listed punctuation characters replaced by %xx
# escapes and every space replaced by "+" (the inverse of unurlize).
sub urlize {
    my ($s) = @_;
    return $s unless defined $s;
    $s =~ s!(\`|\@|\!|\#|\$|\%|\^|\&|\(|\)|\=|\\|\+|\||\[|\]|\{|\}|\;|\'|\:|\"|\,|\/|\<|\>|\?)!sprintf("%%%2.2x", ord($1))!eg;
    # Must run after the escaping above, which turns literal "+" into %2b.
    $s =~ tr/ /+/;
    return $s;
}

# unurlize($string)
# Returns $string with "+" turned back into spaces and %xx escapes decoded
# to single bytes.
sub unurlize {
    my ($todecode) = @_;
    $todecode =~ tr/+/ /;    # pluses become spaces
    # "C" (unsigned) rather than "c": same byte, no wrap for values >= 0x80.
    $todecode =~ s/%([0-9a-fA-F]{2})/pack("C", hex($1))/ge;
    return $todecode;
}

# _is_code($value, $code)
# True when $value is a plain scalar numerically equal to $code.  References
# (hit lists) and undef are never error codes.
sub _is_code {
    my ($value, $code) = @_;
    return 0 if !defined $value || ref $value;
    no warnings 'numeric';    # page content may be passed in as well
    return $value == $code ? 1 : 0;
}

# error($value)
# Returns 1 when $value is one of the package error values, 0 otherwise.
sub error {
    my ($value) = @_;
    foreach my $code ($E_RANGE, $E_NOANSWER, $E_NOTFOUND) {
        return 1 if _is_code($value, $code);
    }
    return 0;
}

# in($val, @list)
# Returns the index of the first element of @list equal to $val, ignoring
# case, or -1 when there is none.
sub in {
    my ($val, @list) = @_;
    for my $i (0 .. $#list) {
        return $i if uc($list[$i]) eq uc($val);
    }
    return -1;
}

# printdata()
# Prints every recorded result, grouped by engine and query, oldest first.
sub printdata {
    foreach my $name (sort keys %engines) {
        print "Engine: $name\n";
        my $queries = $engines{$name} || {};
        foreach my $query (sort keys %$queries) {
            print " Query: $query\n";
            my $results = $queries->{$query} || {};
            # Keys are epoch seconds: compare as numbers, a string sort puts
            # 10-digit stamps before 9-digit ones.
            foreach my $date (sort { $a <=> $b } keys %$results) {
                print " ", scalar(localtime($date)), " => ",
                    $results->{$date}, "\n";
            }
        }
    }
}

# save()
# Writes %engines to results.xml and then runs genreport.pl.  Warns and
# returns without running the report when the file cannot be opened.
#
# NOTE(review): the literals printed below contain no element markup, so the
# file written here is not something the XML parser at the top of the script
# can read back.  The ENGINE/SEARCH/RESULT handlers expect NAME, QUERY and
# DATE attributes; the markup looks as if it was lost from these strings.
# Confirm against an existing results.xml before relying on the round trip.
sub save {
    my $out;
    if (!open($out, '>', $RESULTS_FILE)) {
        warn "Cannot write $RESULTS_FILE: $!\n";
        return;
    }

    print $out "\n\n";
    foreach my $name (sort keys %engines) {
        print $out " \n";
        my $queries = $engines{$name} || {};
        foreach my $query (sort keys %$queries) {
            print $out " \n";
            my $results = $queries->{$query} || {};
            foreach my $date (sort { $a <=> $b } keys %$results) {
                print $out " $results->{$date}\n";
            }
            print $out " \n";
        }
        print $out " \n";
    }
    print $out "\n";

    # Buffered write errors only surface at close.
    close($out) or warn "Error closing $RESULTS_FILE: $!\n";

    system("genreport.pl") == 0
        or warn "genreport.pl did not run successfully (status $?)\n";
}

# --- XML::Parser 'Subs' handlers -------------------------------------------
# Start handlers receive (parser, element name, attribute => value, ...).

# Start of an ENGINE element: begin a fresh table for the engine in NAME.
sub ENGINE {
    my ($expat, $element, %attr) = @_;
    $enginename = $attr{NAME};
    $engines{$enginename} = {};
}

sub ENGINE_ { }

# Start of a SEARCH element: begin a fresh table for the query in QUERY.
sub SEARCH {
    my ($expat, $element, %attr) = @_;
    $querystr = $attr{QUERY};
    $engines{$enginename}->{$querystr} = {};
}

sub SEARCH_ { }

# Start of a RESULT element: remember its DATE; a non-zero $resultdate also
# switches on text collection in handle_char.
sub RESULT {
    my ($expat, $element, %attr) = @_;
    {
        no warnings qw(numeric uninitialized);
        $resultdate = $attr{DATE} + 0;
    }
    $engines{$enginename}->{$querystr}->{$resultdate} = 0;
}

# End of a RESULT element: store the collected text as a number and reset
# the collection state.
sub RESULT_ {
    {
        no warnings 'numeric';    # empty or padded text counts as 0
        $gatherstring = $gatherstring + 0;
    }
    $engines{$enginename}->{$querystr}->{$resultdate} = $gatherstring;
    $resultdate   = 0;
    $gatherstring = "";
}

# Character data handler: collects text only while inside a RESULT element.
sub handle_char {
    my ($expat, $text) = @_;
    $gatherstring .= $text if $resultdate;
}

# fetch($url)
# Issues a single GET for $url (simple_request does not follow redirects)
# and returns the response body, or $E_NOTFOUND when the request was not
# successful.  Updates $fetchcount / $goodfetchcount and leaves the response
# object in $res.
sub fetch {
    my ($url) = @_;
    $res = $ua->simple_request(GET($url));
    $fetchcount++;
    return $E_NOTFOUND unless $res->is_success;
    $goodfetchcount++;
    return ${ $res->content_ref };
}