Code: [show] | [select all]
=info
523066680/vicyang
2018-10
=cut
use utf8;
use Encode;
use LWP::UserAgent;
use File::Path;
use File::Slurp;
use File::Basename qw/basename/;
use Mojo::DOM;
STDOUT->autoflush(1);
our $wdir = encode('gbk', "D:/temp/力成文学");
mkpath $wdir unless -e $wdir;
our $main = "http://www.ceasm.com";
our $ua = LWP::UserAgent->new( keep_alive => 1, timemout => 8 );
my $res = $ua->get($main);
my $html = $res->content();
my $dom = Mojo::DOM->new($html);
my ($url, $dir, $buff, $item );
my (@sUrl, @sItem, @list, $article);
# 一级栏目
for my $e ( $dom->at(".menu")->find("[target]")->each )
{
$url = $e->attr("href");
$item = $e->text;
printf "%s\n", $e->text;
# 二级栏目
get_subitem( $url, \@sUrl, \@sItem );
for my $id ( 0 .. $#sUrl )
{
printf " %s\n", $sItem[$id];
$dir = "${wdir}/${item}/${sItem[$id]}/";
mkpath $dir unless -e $dir;
# 所有文章链接
@list = list( $main . $sUrl[$id] );
for my $link ( @list )
{
printf " %s\n", $link;
$buff = article($main . $link);
write_file( $dir . basename($link), $buff );
}
}
}
# 二级栏目
sub get_subitem
{
my ( $url, $links, $names ) = @_;
my $res = $ua->get( $url );
$dom = Mojo::DOM->new( $res->content() );
@$links = map { $_->attr("href") } ( $dom->at(".keywords")->find("[target]")->each );
@$names = map { $_->text } ( $dom->at(".keywords")->find("[target]")->each );
}
# 文段列表
sub list
{
my $link = shift;
my $res = $ua->get( $link );
$dom = Mojo::DOM->new( $res->content() );
# 获取最大页面值
$dom->at(".pagelist")->find("a")->last =~ /(\d+)/;
my $max = $1;
my @list;
for my $id ( 1 .. $max )
{
$res = $ua->get( $link ."list${id}.html" );
$dom = Mojo::DOM->new( $res->content() );
push @list, map { $_->attr("href") } $dom->at(".dedelist")->find("h4 [target]")->each;
}
return @list;
}
sub article
{
my $link = shift;
my $res;
do { $res = $ua->get( $link ); } until ( length($res->content) > 2000 );
return $res->content;
}
Code: [show] | [select all]
=info
523066680/vicyang
2018-10
=cut
use utf8;
use Encode;
use File::Slurp;
use LWP::UserAgent;
use Mojo::DOM;
STDOUT->autoflush(1);
our $main = "http://www.ceasm.com";
our $ua = LWP::UserAgent->new( keep_alive => 1, timemout => 8 );
our $wdir = encode('gbk', "D:/Temp/力成文学");
chdir $wdir or warn "$!";
my $buff;
my @files;
my @dirs = `dir "$wdir" /ad /s /b`;
grep { s/\r?\n//; } @dirs;
for my $dir ( @dirs )
{
printf "%s\n", $dir;
chdir $dir or die "$!";
@files = glob "*.html";
next unless $#files >= 0;
$buff = "";
grep { $buff .= article( $_ ) } sort { substr($b, 0, -5) <=> substr($a, 0, -5) } @files;
write_file( "${dir}.txt", $buff );
}
sub article
{
my $file = shift;
my $html = decode('gbk', scalar(read_file( $file )) );
$html =~s/ /#CRLF/g;
$html =~s/\n/#CRLF/g; # ------> 1
$dom = Mojo::DOM->new( $html );
my $title = $dom->at("h2")->all_text;
my $text = $dom->at(".text")->all_text;
$text =~s/\s//g; # ------> 2 去掉所有空白符号包括 space tab \r \n 全角空白符
$text =~s/(\d+、)/\n$1/g;
$text =~s/\Q$title\E//;
$text =~s/#CRLF/\n/g;
$text =~s/[\r\n]+/\n/g;
$text =~s/^\n//;
my $str;
#标题
$str = sprintf "%s\n", encode('gbk', $title );
$str .= sprintf "%s\n", $file;
$str .= sprintf "%s\n\n", encode('gbk', $text);
return $str;
}
sub xcode
{
$_[1]='x' if (not defined $_[1]);
for my $v ( split(//,$_[0]) )
{
print sprintf ("%02$_[1] ",ord($v));
}
print "\n\n";
}