# Code: Select all  (forum copy-button label; not part of the script)
=info
523066680/vicyang
2018-12
=cut
use Encode;
use Modern::Perl;
use File::Slurp;
use Mojo::UserAgent;
use File::Path qw/mkpath/;
use Try::Tiny;
# Unbuffered output so progress lines appear immediately.
STDOUT->autoflush(1);

# Shared state. The subs further down reach these package globals through
# their own `our` declarations.
our $ua = Mojo::UserAgent->new();
our $main = "http://www.52shici.com";
our $wdir = "D:/temp/52shici_mobile";
mkpath $wdir unless -e $wdir;

# Fetch the top-level categories from the site's front page and download
# every article of every category.
my $res = $ua->get( $main )->result;
for my $e ( $res->dom->find(".works-type-list a")->each )
{
    # An anchor without an href gives us nothing to follow.
    my $route = $e->attr("href");
    next unless defined $route;

    # The category name doubles as the output directory name.
    # NOTE(review): GBK presumably matches the local console/file-system
    # code page - confirm before running elsewhere.
    my $item = encode('gbk', $e->text);
    printf "%s %s\n", $route, $item;

    # get_max_pgcode() yields undef when the page counter is missing; skip
    # the category instead of iterating over an undefined range.
    my $max = get_max_pgcode( "${main}/${route}" );
    unless ( defined $max )
    {
        printf "Skip %s: page count unavailable\n", $item;
        next;
    }

    get_article( "${main}/${route}", $max, $item );
}
# Download every article listed under one category into "$wdir/$item".
#
# Parameters:
#   $link - category URL; "&page=N" is appended for each listing page
#   $max  - number of listing pages to walk (pages 1 .. $max)
#   $item - category name, used as the sub-directory name below $wdir
#
# Side effects: creates the category directory, chdir()s into it and writes
# one "<query>.html" file per article link. Files that already exist are
# not downloaded again. Dies if the directory cannot be entered.
sub get_article
{
    our ($main, $wdir);
    my ( $link, $max, $item ) = @_;

    # Without a page count there is nothing to iterate over.
    return unless defined $max;

    my $path = "${wdir}/${item}";
    mkpath $path unless -e $path;

    # Files are written relative to the current directory, so a failed
    # chdir would scatter them somewhere unexpected.
    chdir $path or die "Cannot chdir to $path: $!";

    # Walk every listing page
    for my $code ( 1 .. $max )
    {
        printf "%s, Page code: %d/%d\n", $item, $code, $max;
        my $res = try_to_get( "${link}&page=${code}" );

        # Every article link on the page
        for my $e ( $res->dom->find(".l a")->each )
        {
            my $href = $e->attr('href');
            next unless defined $href;

            # The query part of the URL (between "?" and the last "&")
            # names the output file. Links that do not match are skipped;
            # otherwise $1 would still hold the capture of an earlier link.
            next unless $href =~ /\?(.*)&/;
            my $file = $1 . ".html";
            next if -e $file;

            my $target = $main . "/" . $href;
            $target =~ s/&.*$//;        # drop everything from the first "&"
            $target =~ s/www\./m\./;    # fetch from the "m." host instead of "www."

            my $detail = try_to_get( $target );
            write_file( $file, $detail->body );
        }
    }

    return;
}
# Read the total number of listing pages from a category page.
#
# Parameters:
#   $link - URL of the category page
#
# Returns the number captured from a "1/<N>" counter in the text of the
# first ".mt" element. When the element is absent or its text holds no such
# counter, a message is printed and nothing (undef in scalar context) is
# returned.
sub get_max_pgcode
{
    our ($ua);
    my ( $link ) = @_;
    my $res = $ua->get( $link )->result;

    # at() yields undef when no element matches, so check before using it.
    my $node = $res->dom->at(".mt");
    if ( defined $node and $node->text =~ m{1/(\d+)} )
    {
        return $1;
    }

    printf "Failed to get max page code\n";
    return;
}
# GET $link with the shared user agent, retrying on failure.
#
# Parameters:
#   $link - URL to request
#
# Returns the response object of the first attempt that is defined and
# reports is_success. After six failed attempts a message is written to
# STDERR and the whole script exits with status 1.
sub try_to_get
{
    our ($ua);
    my ($link) = @_;
    my $res;
    my $times = 0;
    while (1)
    {
        my $error;
        $res = undef;    # never judge this attempt by an earlier response
        try { $res = $ua->get( $link )->result; }
        catch { $error = $_; };
        $times++;
        last if ( defined $res and $res->is_success );

        # Report why this attempt failed: an exception from the request,
        # or a response that is not a success.
        if ( defined $error )
        {
            printf "Error %s, retry: %d\n", $error, $times;
        }
        else
        {
            printf "HTTP status %s for %s, retry: %d\n", $res->code, $link, $times;
        }

        if ( $times > 5 )
        {
            warn "Giving up on $link after $times attempts\n";
            exit 1;    # non-zero so callers of the script can see the failure
        }

        # Brief pause so a struggling server is not hit in a tight loop.
        sleep 1;
    }
    return $res;
}