#!/usr/bin/perl -s # # URL List Checker. # Traverses a text list of URLs, performing a HEAD request on each # item. Anything that doesn't check out gets put on a list to be removed. # require "http.pl"; require "flush.pl"; $filelength = &file_length("urls.txt"); open(urllist,"urls.txt"); open(removeque,">>remove.que"); open(newurls,">>newurls.txt"); open(LOG,">log.checker.txt"); select(LOG); $| = 1; $starttime=time; $lasthost=""; while () { s/\r//g; s/\n//g; s|:80/|/|; s|:80$|/|; s|([^/])/([^./]+)$|$1/$2/|; s|//([^/]+)$|//$1/|; $url = $_; if ($url =~ m|^http://(.*)|) { $host = $1; $port = 80; # default $request = "/"; # default ($host =~ s|^([^/]+)(.*)$|$1|) && ($request = ($2||"/")); ($host =~ s/:(\d+)$//) && ($port = $1); $host =~ y/A-Z/a-z/; } else { &printflush( newurls, "$url\n"); next; } printf LOG ("%6d/%6d: %s\n", $., $filelength, $url); # printf("%003.3f: %s\n",100*( $./$filelength) , $url); # print "host: $host port: $port request: $request\n"; if($host == $lasthost) { sleep 2; } $lasthost = $host; local(@htext) = &http'fetch($host,$port,$request, "HEAD"); # &print_htext(@htext); if (&http'invalid(@htext)) { print LOG "$url is invalid.\n"; &printflush( removeque, "$url\n"); foreach $line (@htext) { $line =~ s/\r//g; $line =~ s/\n//g; if ( $line =~ m/Location: (.*)/ ) { &printflush( newurls, "$1\n") }; }; } else { &printflush( newurls, "$url\n"); } sleep(1); } $endtime=time; close(urllist); close(removeque); close(newurls); $checktime=($endtime-$starttime)/86400; print LOG "start time: $starttime\n end time: $endtime\nTotal: $checktime days.\n"; sub print_htext { local(@htext) = @_; for ($i = 0; $i < $#htext; $i++) { print "$i: $htext[$i]\n"; } } sub file_length { local($filename) = @_; local($filelen) = 0; open(file,$filename); while () { $i++; } close(file); $i }