Google Scanner 源码:
#!/usr/bin/perl -w
#By xti9er
require LWP::UserAgent;
use LWP::Simple;
use Color::Output;
Color::Output::Init();
$| = 1;    # autoflush STDOUT so progress lines show up as they are printed

# NOTE: `use strict` is intentionally not switched on file-wide here: the
# subs further down communicate through package globals ($sleeptime, $stime).

#----------------------------
# Extract the search strings from the configuration file
#----------------------------
# Config format, one entry per line:
#   # ...          comment, skipped
#   country=XX     restrict results; sent to Google as cr=countryXX
#   country=       no country restriction
#   anything else  a search keyword
my $inrulfile  = shift || "inurl.ini";
my $countryset = "";    # file-scoped on purpose: spider() reads it too
my @inurl;

open(my $ini_fh, '<', $inrulfile) or die "Cannot open $inrulfile: $!";
while (my $line = <$ini_fh>) {
    chomp($line);
    $line =~ s/\r\z//;             # tolerate CRLF line endings
    next if $line =~ /^#/;
    next if $line !~ /\S/;         # a blank line is not a keyword
    if ($line =~ /^country=(\w+)/i) {
        $countryset = "country$1";
    }
    elsif ($line =~ /^country=$/i) {
        # explicit "no restriction": keep $countryset as it is
    }
    else {
        push(@inurl, $line);
    }
}
close($ini_fh);

# Package globals shared with spider():
our $sleeptime = 5;    # seconds to wait before each request
our $stime;            # epoch start time of the current keyword; names "$stime.log"

for my $inurl (@inurl) {
    $inurl =~ s/\s/+/g;    # whitespace becomes '+' for the query string
    sleep(2);

    my $p75 = "-" x 80;
    my $ua  = LWP::UserAgent->new;
    $ua->timeout(20);
    $ua->env_proxy;
    $ua->agent("Mozilla/5.0");

    cprin("\t\t\t --=G o o g l e \t S c a n n e r=-- \n", 5);
    cprin("\t\t\t By xti9er \n", 13);
    cprin("$p75\n", 7);
    $stime = time;
    my $lstime = localtime();
    cprin("\t\t\tStart at $lstime\n", 13);

    my $searchweb = "http://www.google.com.hk";
    my $first_url = "$searchweb/search?num=100&complete=1&hl=zh-CN&cr=$countryset&newwindow=1&q=$inurl&start=0&sa=N/";
    print "[Google] page:$first_url\n";

    # LWP::UserAgent::get always returns a response object, so failure is
    # detected through is_success rather than through a false return value.
    my $response = $ua->get($first_url);
    if (!$response->is_success) {
        cprin($response->status_line, 5);
        next;
    }

    # Find the result counter in the first page and turn it into a number of
    # 100-result pages. A lexical list is used so that spider() cannot
    # clobber it while it is being walked.
    # NOTE(review): the tail of this pattern was lost in the pasted source;
    # "up to the next tag" is a reconstruction - confirm against live markup.
    my $ttpageno;
    for my $segment (split(/href=/, $response->content)) {
        next unless $segment =~ /id=resultStats>(.*?)</;
        (my $digits = $1) =~ s/\D//g;    # drops thousands separators and text
        $ttpageno = length($digits) ? int($digits / 100) : 0;
        last;
    }
    if (!defined $ttpageno) {
        cprin("[Start] No result statistics found for [$inurl]", 5);
        next;
    }

    cprin("\t\t\t$ttpageno Google Pages To Read!\n$p75\n", 5);
    sleep(2);

    # NOTE(review): the loop header was truncated in the pasted source. The
    # bound below is a reconstruction. spider() puts its first argument into
    # the start= parameter and looks for a next link at start+100, so the
    # page index is converted to a result offset here. spider() itself ends
    # the scan (goto SPIDEREND) once a page has no next-page link.
    for my $startpage (0 .. $ttpageno) {
        sleep($sleeptime);
        cprin("Now Read The " . $startpage . "th Page!\n--------------------------------------\n", 13);
        spider($startpage * 100, $inurl);
    }
}
# Fetch one page of Google results for a keyword and log the result links.
#
# Arguments:
#   $pageno - value sent as the start= query parameter; the following page
#             is expected at $pageno + 100
#   $inurl  - search keyword, inserted into the query string as given
#
# Also reads the file-level $countryset and the package globals $sleeptime
# (delay before the request) and $stime (the log file is "$stime.log").
#
# Each distinct "http://" result link is printed and appended to the log.
# On a log-open or HTTP failure the error is reported and the sub returns.
#
# NOTE(review): when the page has no link to the following page the sub does
# not return - it jumps to the file-level SPIDEREND label, which ends the
# whole scan. Callers rely on this to stop paging, so it is kept as is.
sub spider
{
    my $pageno = shift;
    my $inurl  = shift;
    our ($sleeptime, $stime);

    sleep($sleeptime) if $sleeptime;

    # Lexical handle, append-only: nothing is ever read back from the log.
    my $log_fh;
    if (!open($log_fh, '>>', "$stime.log")) {
        cprin($!, 5);
        return;    # was `next` from inside the sub; same effect for the caller's loop
    }

    my $searchweb  = "http://www.google.com.hk";
    my $searchurl  = "$searchweb/search?num=100&complete=1&hl=zh-CN&cr=$countryset&newwindow=1&q=$inurl&start=$pageno&sa=N/";
    my $nextpageno = $pageno + 100;
    cprin("Now URL: $searchurl\n", 7);

    my $ua = LWP::UserAgent->new;
    $ua->timeout(20);
    $ua->env_proxy;
    $ua->agent("Mozilla/5.0");

    my $response = $ua->get($searchurl);
    if (!$response->is_success) {
        print "Get page count faild!\n";
        cprin($response->status_line, 5);
        close($log_fh);    # do not leave the log handle open on failure
        return;
    }

    # Every chunk after an "href=" starts with the quoted link target.
    my @sites;
    my $nextpage = 0;
    for my $urlre (split(/href=/, $response->content)) {
        if ($urlre =~ m{^"http://(.*?)"\starget=_blank\sclass=l}) {
            push(@sites, "http://$1");
        }
        # A link to start=<next offset> means another page exists; accept
        # both a raw '&' and an HTML-escaped '&amp;' before sa=N.
        if ($urlre =~ m{/search\?.*?q=.*?start=$nextpageno&(?:amp;)?sa=N}) {
            $nextpage++;
        }
    }

    # Keep the first occurrence of each link, preserving order.
    my %seen;
    @sites = grep { !$seen{$_}++ } @sites;

    for my $nowsite (@sites) {
        print "URL:$nowsite\n";
        print {$log_fh} $nowsite, "\n";
    }
    close($log_fh) or cprin("close $stime.log: $!", 5);

    if ($nextpage == 0) {
        cprin("\t\t\tThe End\n", 13);
        goto SPIDEREND;
    }
    return;
}
# End of the run: reached by falling through after the keyword loop, or
# directly via `goto SPIDEREND` from spider(). Prints the elapsed time.
SPIDEREND:
my $etime = time;
# $stime is only assigned once a keyword starts scanning; without this guard
# an empty keyword list would report the time elapsed since the epoch.
my $ttime = defined $stime ? $etime - $stime : 0;
print(stime($ttime), "\n");
# Print a message in colour through Color::Output's cprint().
#
# Arguments:
#   $str - text to print; a newline is appended
#   $i   - colour number placed after the \x03 control character
#
# The output ends with "\x03" followed by "0", i.e. a switch to colour 0.
# Returns whatever cprint() returns.
#
# NOTE(review): $i and $str are concatenated directly, so a message that
# starts with a digit follows the colour number without a separator -
# confirm how Color::Output parses that case.
sub cprin
{
    my ($str, $i) = @_;
    return cprint("\x03" . $i . "$str\n\x030");
}
# Render a duration given in seconds as "<H> hours <M> mins <S> secs".
# Hours are not capped at 24; the seconds part is whatever is left after
# whole hours and whole minutes have been taken out.
sub stime
{
    my ($total) = @_;

    my $per_minute = 60;
    my $per_hour   = 60 * 60;

    my $hours     = int($total / $per_hour);
    my $remainder = $total - $hours * $per_hour;
    my $mins      = int($remainder / $per_minute);
    my $secs      = $remainder - $mins * $per_minute;

    return join(' ', $hours, 'hours', $mins, 'mins', $secs, 'secs');
}
|
评论