每次执行会叠加更新数据,保存为 perldb 文件,使用 Storable 模块做转储
Code: [show] | [select all]
=info
爬取订单记录、积累数据
Author: 523066680/vicyang
Date: 2019-03
方案:获取所有旧数据,获取所有在线数据,将数据合并。
=cut
use Modern::Perl;
use Encode;
use File::Slurp;
use Storable;
use FindBin;
use lib "$FindBin::Bin/../Function";
use GetOrders;
STDOUT->autoflush(1);
my $item = "2050788225";
my $dbfile = "Brief_${item}.perldb";
my $old = get_original( $dbfile );
my $online = GetOrders::get_online_orders( $item );
printf "orders in db: %d\n", 1+$#$old;
printf "orders online: %d\n", 1+$#$online;
say "Merging";
GetOrders::merge( $old, $online );
printf "Dumping, items: %d\n", $#$old+1;
store $old, $dbfile;
say "Done";
exit;
sub get_original
{
my $dbfile = shift;
my $data = -e $dbfile ? retrieve( $dbfile ) : [];
return $data;
}
Code: [show] | [select all]
package GetOrders;
use Modern::Perl;
use Mojo::UserAgent;
use Mojo::DOM;
use Date::Parse;
use Date::Format;
our $main = "https://feedback.aliexpress.com";
our $url = "https://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm";
our $ua = Mojo::UserAgent->new();
our @headers = (
"Host" => "www.aliexpress.com",
"User-Agent" => "Firefox/63.0",
);
sub get_online_orders
{
my ($item) = @_;
my $orders = [];
my %args = (
"productId" => $item,
"type" => "default",
"page" => "1"
);
my $pgcode = 1;
my $total = 1;
my $parts;
my @all;
# 按页码顺序获取,数据存储到 @all
while ( $pgcode <= $total and $pgcode <= 10 )
{
printf "Current Page %d\n", $pgcode;
$args{'page'} = $pgcode;
my $res = $ua->get( $url, form => \%args )->result;
redo unless $res->is_success;
($parts, $total) = read_json( $res, $pgcode );
# $parts 返回升序序列,但不同页码的时间段是降序,unshift 将其反转。
unshift @all, $parts;
$pgcode++;
}
# 按日期(升序)合并
grep { merge( $orders, $_ ) } @all;
return $orders;
}
# merge 是可复用函数,可以在外部使用。
sub merge
{
my ( $orders, $parts ) = @_;
if ( $#$orders < 0 ) {
@$orders = @$parts;
} else {
my $sect = 0;
for my $id ( 0 .. $#$parts-1 ) {
if ( $parts->[$id][0] == $orders->[-2][0] and
$parts->[$id+1][0] == $orders->[-1][0] )
{
$sect = $id+2;
last;
}
}
#printf "Sect: %d\n", $sect;
push @$orders, @{$parts}[$sect .. $#$parts];
}
}
sub read_json
{
my ($res, $page) = @_;
my $node = $res->json;
my $data = [];
# 如果页码不对,返回空数据
return [] if $node->{page}->{current} != $page;
for my $e ( @{$node->{records}} )
{
# 降序反转为升序
unshift @$data, [
str2time($e->{date}),
$e->{countryCode},
$e->{quantity},
$e->{name},
$e->{buyerAccountPointLeval},
];
}
return ( $data, $node->{page}->{total} );
}
1;