[Perl]全盘查找重复的目录

帖子由 **523066680** » 2017年01月19日 15:10

占用

帖子由 **523066680** » 2017年01月19日 15:12

思路，将文件夹目录结构转为哈希结构，借用 Perl 的 dump 函数直接输出某一节点下的所有"子节点"信息。
然后将 Dump 出来的信息转为 MD5 校验值，这样无需做大量的存储也能进行对比。

d69c12204cb2ec0313843e44c3990420 C:\MinGW
c538d379fc40d593e988477c1cc325d5 C:\NVIDIA
b1705da1f745931c29e722d5418bb104 C:\Perl
3b17d11edd2df48e306076c13c04ba91 C:\Perl\html\lib\YAML
f1f6a1e9ec6ea63004fd66233b15b778 C:\Perl\html\lib\ActivePerl\DocTools

Code: [show] | [select all]

=info

    Code by 523066680@163.com

    2017-01

=cut



# 备注，可能遇到 Unicode 文件名路径，

# 暂时使用系统 cmd /U /c dir 生成路径列表



use Encode;

use IO::Handle;

use Data::Dump qw/dump dd/; 



our $hash = {};

my $gstr;



{

    local $/ = "\x0d\x00\x0a\x00";



    print "getting file lists ...\n";

    @files = `cmd /U /c dir /s /b C:\\MinGW`;



    print "Deal ...\n";

    for my $n ( 0 .. $#files )

    {

        $files[$n] =~s/\x0d\x00\x0a\x00//;

        $gstr = encode('gbk', decode('utf16-le', $files[$n]));

        toStruct( $gstr );

    }

}



#举个栗子

dd( $hash->{'C:'}{'MinGW'}{'include'}{'boost'}{'accumulators'} );



sub toStruct

{

    my $path = shift;

    my @parts = split(/[\/\\]/, $path);

    my $ref;

    $ref = $hash;



    grep

    {

        $ref->{$_} = {} unless ( exists $ref->{$_} );

        $ref = $ref->{$_};

    }

    @parts;

}

输出示例（部分）：

getting file lists ...
Deal ...
{
  "accumulators.hpp"     => {},
  "accumulators_fwd.hpp" => {},
  "framework"            => {
                              "accumulator_base.hpp"    => {},
                              "accumulator_concept.hpp" => {},
                              "accumulator_set.hpp"     => {},
                              "accumulators"            => {
                                                             "droppable_accumulator.hpp" => {},
                                                             "external_accumulator.hpp"  => {},

缺点是用了大量空格作为缩进，处理庞大的目录树时内存暴涨。

帖子由 **523066680** » 2017年01月19日 15:50

备注：

使用过程将在 D 盘生成 dirs.txt 和 md5map.txt，dirs.txt 为dir输出结果（Unicode），
md5map为所有目录的校验值
使用cmd /U /c 执行 dir 命令以确保Unicode字符的完整输出
使用 YAML::Tiny Dump 替代 Data::Dump，减少结构体输出的体积
内存占用情况：如果 dir 导出的文件大小为 100MB，则内存占用约为 250MB
冗余信息的处理：
假设某个目录存在副本，且子目录下也存在副本
代码：全选
```
├─Fold
│  ├─a
│  └─a - 副本
└─Fold - 副本
    ├─a
    └─a - 副本
```
程序将只显示 fold 这一层的结果。但是，如果在副本的中间某一层存在差异，则会详尽地列出相同的子目录
效率：个人PC（双核4G旧主机蓝标硬盘），dir /a /s /b D: ，254506 行数据，从 dir 到 Perl 输出判断结果
耗时 58 秒

Code: [show] | [select all]

=info

    Code by 523066680@163.com

    2017-01

=cut

use Encode;

use IO::Handle;

use Digest::MD5 qw/md5_hex/;

use YAML::Tiny;

STDOUT->autoflush(1);



print "Dir ...\n";

#system("cmd /U /c dir /a /s /b C:\\ >D:\\dirs.txt");



print "Data dealing ...\n";

our $hash = {};

our @lines;

our %fold_md5;

our %md5map;

my $gstr;



open READ,"<:raw",'D:\\dirs.txt';

{

    local $/ = "\x0d\x00\x0a\x00";

    while ($line = <READ>)

    {

        $line =~s/\x0d\x00\x0a\x00//;

        $gstr = encode('gbk', decode('utf16-le', $line));

        push @lines, $gstr;

        toStruct( $gstr );

    }

}

close READ;



print "Compare ...\n";

compare();



sub compare

{

    my @parts;

    my $ref;

    my $md5;



    print "Getting md5 information ...\n";

    open WRT, ">:raw", "D:\\md5map.txt";

    for my $i ( 0 .. $#lines )

    {

        @parts = split(/[\/\\]/, $lines[$i]);

        $ref = $hash;

        grep { $ref = $ref->{$_} } ( @parts );   #将引用迭代到路径的最后一层

        next if (! keys %{$ref} );               #如果没有下一层文件内容则略过

        

        $md5 = md5_hex( Dump($ref) );

        $fold_md5{ $lines[$i] } = $md5;



        print WRT $md5 ." ". encode('utf8', decode('gbk', $lines[$i])) ."\n";

    }

    close WRT;





    for my $k ( keys %fold_md5 )

    {

        $md5 = $fold_md5{$k};

        push @{ $md5map{$md5} }, $k;

    }



    #去重，如果一组MD5内容相同的目录，且它们上一级目录的MD5也相同

    #则无需列出。

    print "Cut repeat case ...\n";

    my $tp;

    my $prev;

    for my $m ( keys %md5map )

    {

        if ( $#{$md5map{$m}} > 0  )

        {

            undef $md5;

ST:         for my $p ( @{ $md5map{$m} } )

            {

                #取得上一层路径

                $tp = $p;

                $tp =~ s/(\\|\/)[^\\\/]+$//;



                if (    ( defined $md5 )

                    and ( $md5 eq $fold_md5{$tp} ) #MD5相同

                    and ( $prev ne $tp )           #并且不是相邻目录

                    #例如 "Fold\a" "Fold\a副本" 的上一级都是 Fold，md5一致

                )

                {

                    delete $md5map{$m};

                    last ST;

                }

                else

                {

                    $md5 = $fold_md5{$tp};

                }

                $prev = $tp;

            }

        }

        else

        {

            #删除没必要列出的情况

            delete $md5map{$m};

        }

    }



    print "Find same folder\n";

    for my $k ( keys %md5map )

    {

        print join("\n", @{$md5map{$k}} );

        print "\n";

        #dump_byPath( $md5map{$k}->[0] );

        print "\n";

    }

}



sub dump_byPath

{

    my $path = shift;

    my $ref = $hash;

    for my $e ( split(/[\/\\]/, $path) )

    {

        $ref = $ref->{$e};

    }

    print Dump( $ref );

}



sub toStruct

{

    my $path = shift;

    my @parts = split(/[\/\\]/, $path);

    my $ref;

    $ref = $hash;



    for my $e ( @parts )

    {

        if ( not exists $ref->{$e} ) #如果不加判断，会不断地替换，最后只有一个路径的结构

        {

            $ref->{$e} = {};

        }

        $ref = $ref->{$e};

    }

}

FuniCode 编程论坛

[Perl]全盘查找重复的目录

[Perl]全盘查找重复的目录

将目录树结构转为 Perl 哈希（键值对）结构

Perl 遍历磁盘文件并筛选出内容相同的目录

在线用户