Deduplication, continued.

OK, so what started out as a bash script grew into a rather finicky Perl script. I used a bunch of parallel hashes, judging things by combinations of duplicate names and identical file sizes, and by actually scoring the path name and taking the highest score. I ended up not using the file hashes, because I decided that the path names and file sizes were enough information.

It prints out a lot of debugging output, but mostly it is hierarchical: file name/file size/path name.

20021120 foggy morning, church - as.jpg
1476
184    /tank/pictures/9999-Source/2002/2002-12-00-bellingham/thumb

21850
185    /tank/pictures/9999-Source/2002/2002-12-00-bellingham
184    /tank/pictures/9999-Source/2002/2002-12-00-bellingham/thumb

4029
185    /tank/pictures/9999-Source/2002/2002-12-00-bellingham
184    /tank/pictures/9999-Source/2002/2002-12-00-bellingham/thumb
184    /tank/pictures/9999-Source/2002/2002-12-00-bellingham/.xvpics

And here is the script itself:

#!/usr/bin/perl -w
# copyright 2013 Jed Reynolds
use strict;
use warnings;
use Carp;
use Data::Dumper;
#use Data::Dumper::Perltidy;
$Data::Dumper::Indent = 1;
$Data::Dumper::Sortkeys = 1;
$Data::Dumper::Useqq = 1;

package main;

# Slot numbers inside one record.  A record is an array ref: the first
# four slots come from a "hash;name;size;path" input line, the last two
# are appended while the input is loaded.
use constant FHASH  => 0;
use constant FNAME  => 1;
use constant FSIZE  => 2;
use constant FPATH  => 3;
use constant SCORE  => 4;
use constant LCNAME => 5;

# Verdicts stored in the SCORE slot of a record.
use constant KEEP  => 100;
use constant DUMP  => -1;
use constant DUNNO => 999;

# Lookup tables filled in by the passes further down the script.
our %file_names  = ();   # lc(name)             => { size => [ paths ] }
our %path_counts = ();   # "lc(name);size"      => number of paths
our %file_sizes  = ();   # "lc(name);size"      => [ paths ]
our @records     = ();   # every loaded record
our %lcfname_map = ();   # "lc(name);size;path" => record

# File names that are compared for an exact match.
our @loser_files = (
   '.project',
   '.recent',
   ( map { '.recent-' . $_ } 1 .. 5 ),
   qw( thumbs.db thumbs.db~ .picasa.ini zpool.txt ),
);

# Fragments that are searched for inside the file name.
our @loser_str = qw(
   .xml
   .html
   thumbs.db
   Thumbs.db
   thumbs.db~
   Thumbs.db~
   .picasa.ini
   z.pl
   zpool.txt
   .php
   .ini
   .pdf
   .odt
);

# Fragments that are searched for inside the path.
our @loser_paths = (
   'pictures/Documents',
   'AptanaStudio',
   # camera folders: 100NIKON 101NIKON 102NIKON 100PENTX ... 102FUJI
   ( map { my $maker = $_; map { $_ . $maker } 100 .. 102 } qw( NIKON PENTX FUGI FUJI ) ),
   'DCIM',
   'pictures/Pictures/lap',
   'pictures/lap',
   'pictures/pictures',
   'Pictures/Pictures',
   'Pictures/tank',
   'tank/tank',
   #'0000-incoming',
   'pictures/mnt',
   #'home/Pictures',
   'pictures/bin',
   '999-copied',
   'Documents/Candela',
);

# Build the per-file lookup key "lc(name);size;path" for one record.
#   $ra_rec - array ref indexed by the FNAME / FSIZE / FPATH constants
# Returns the key string.  Dies when no record is passed.
sub lcMapKeyOf {
   my $ra_rec = shift;
   # The message now names this sub; it used to say "asRecordKey".
   die("lcMapKeyOf: wants record ref,bye") if(!defined $ra_rec);
   # Element access ($ref->[N]) rather than one-element slices (@$ref[N]).
   return join(";", lc($ra_rec->[FNAME]), $ra_rec->[FSIZE], $ra_rec->[FPATH]);
}

# Build the duplicate-group key "lc(name);size" for one record.
#   $ra_rec - array ref indexed by the FNAME / FSIZE constants
# Returns the key string.  Dies when the record is missing or false.
sub recordKeyOf {
   my $ra_rec = shift;
   # The message now names this sub; it used to say "asRecordKey".
   die("recordKeyOf: wants record ref,bye") if(!$ra_rec);
   # Element access ($ref->[N]) rather than one-element slices (@$ref[N]).
   return join(";", lc($ra_rec->[FNAME]), $ra_rec->[FSIZE]);
}

# Tell whether a record that has several copies sits in a "loser" path.
#   $ra_rec - record array ref (FNAME / FSIZE / FPATH slots are read)
# Looks up how many paths share the record's name+size key in
# %path_counts.  When there is more than one, counts how many entries of
# @loser_paths occur inside this record's path, logging each hit to
# STDERR.  Returns 1 when at least one entry matched and the number of
# matches is smaller than the number of copies, otherwise 0.
# Dies when no record is passed.
# NOTE(review): the strike count is "loser fragments found in this one
# path", not "copies found in loser paths" as the comment near the end
# suggests -- confirm which was intended.
sub areMultipleCopiesInLoserPath {
   my $ra_rec     = shift;
   die("areMultipleCopies: norecord,bye") unless( $ra_rec );
   my $key        = ::recordKeyOf($ra_rec);
   my $path       = $ra_rec->[FPATH];
   # A key that was never counted is treated as zero copies instead of
   # comparing undef numerically.
   my $num        = $::path_counts{$key} // 0;
   my $strikes    = 0;
   if ( $num > 1 ) {
      for my $loser (@::loser_paths) {
         if( index($path, $loser) > -1 ) {
            $strikes += 1;
            print STDERR "multi:$path - $num\n";
         }
      }
   }
   # if everything is a strike then we want to
   # judge it using the score system
   return (($strikes > 0) && ($strikes < $num)) ? 1 : 0;
} #~sub

# Tell whether a record should be rejected because of its name or path.
#   $ra_rec - record array ref (FNAME / FPATH slots are read)
# Returns 1 when the file name equals an entry of @loser_files, when it
# contains an entry of @loser_str, or when the path contains an entry of
# @loser_paths; otherwise 0.
sub hasLoserName {
   my $ra_rec  = shift;
   my $fname   = $ra_rec->[FNAME];
   # Exact-name test; plain string equality replaces the experimental
   # smartmatch operator.
   if( grep { $_ eq $fname } @::loser_files ) {
      return 1;
   }
   for my $loser (@::loser_str) {
      if( index($fname, $loser) > -1) {
         return 1;
      }
   }
   for my $loser (@::loser_paths) {
      if( index($ra_rec->[FPATH], $loser) > -1 ) {
         return 1;
      }
   }
   return 0;
}

# Tell whether a backup-style name ("something~") has a plain sibling.
#   $ra_rec - record array ref (the FSIZE slot is read)
#   $fname  - file name to examine
# Strips the trailing "~" from $fname and returns 1 when
#   - %path_counts has a positive count for "shortname;size", or
#   - %file_names lists the short name under more than one size.
# Returns 0 otherwise, including when $fname has no trailing "~".
sub checkForShorterName {
   my $ra_rec     = shift;
   my $fname      = shift;
   my ($shorter)  = ($fname =~ m/(.*)[~]+$/);
   # Not a backup name: nothing to look up (avoids using undef below).
   return 0 if (!defined $shorter);
   # Both tables are keyed by the lower-cased name.
   $shorter       = lc($shorter);
   my $nkey       = $shorter.";".$ra_rec->[FSIZE];
   if (defined $::path_counts{$nkey} && $::path_counts{$nkey} > 0) {
      return 1;
   }
   my $rh_sizes = $::file_names{$shorter};
   if (defined $rh_sizes ) {
      # number of distinct sizes recorded for the short name
      my $num = keys %$rh_sizes;
      return 1 if ($num > 1);
   }
   return 0;
} #~sub

# Rate a directory path; a higher number means a better home for a file.
#   $fpath - directory path to rate
# The score is the sum of
#   - an age term, (2013 - year) * 10, for the first "20xx" in the path,
#   - the points of the FIRST matching rule in the table below,
#   - minus 1 for each of ".album", ".xvpics" and "/thumb" found in the
#     path, applied one after the other while the score is still positive.
# Dies on a blank path and on a path that matches no rule.
# NOTE(review): the 160 rule can never fire, because any path it matches
# is already caught by the 200 rule above it -- confirm the intended
# order.  The reference year 2013 is hard-coded.
sub pathScore {
   my $fpath = shift;
   die("pathScore: blank path, bye.") if ($fpath eq "");

   my $root  = '/tank/pictures/';
   my $score = 0;

   # A failed match yields an empty list, so the condition is false.
   if (my ($year) = ($fpath =~ m/(20\d\d)/)) {
      $score += ((2013 - $year) * 10);
   }

   # Ordered [ pattern, points ] pairs; order matters, first hit wins.
   my @rules = (
      [ qr{${root}0100-Projects/\d{4}/.+$},                   200 ],
      [ qr{${root}Chris-Nelsen.+$},                           170 ],
      [ qr{${root}0100-Projects/\d{4}/\d{4}-\d{2}-\d{2}$},    160 ],
      [ qr{${root}0100-Projects/\d{4}-\d{2}-\d{2}.+$},        155 ],
      [ qr{${root}0100-Projects/\d{4}-\d{2}-\d{2}$},          150 ],
      [ qr{${root}0100-Projects/\d{4}$},                      140 ],
      [ qr{${root}0100-Projects/333-dng$},                    130 ],
      [ qr{${root}0100-Projects.+$},                          125 ],
      [ qr{${root}0100-Projects$},                            120 ],
      [ qr{${root}9999-Source/\d{4}/\d{4}-\d{2}-\d{2}$},      100 ],
      [ qr{${root}9999-Source/\d{4}/\d{4}.+$},                 75 ],
      [ qr{${root}9999-Source/\d{4}/.+$},                      71 ],
      [ qr{${root}9999-Source/\d{4}$},                         60 ],
      [ qr{${root}9999-Source/.+$},                            40 ],
      [ qr{${root}9999-Source$},                               50 ],
      [ qr{${root}(3p|gmaps)$},                                60 ],
      [ qr{${root}Roz.+$},                                     70 ],
      [ qr{${root}[Ww]eb.+$},                                   3 ],
      [ qr{${root}\d+-[Mm]ovies/?.*$},                         10 ],
      [ qr{${root}\d+-[Mm]ovie$},                              10 ],
      [ qr{${root}8888-[Ss]creensaver/?.*$},                    2 ],
      [ qr{${root}8888-[Ll]iam.*$},                            60 ],
      [ qr{${root}0200-[Dd]esk.*$},                            50 ],
      [ qr{${root}\d{4}-\d{2}-\d{2}/.+$},                      41 ],
      [ qr{${root}\d{4}-\d{2}-\d{2}$},                         31 ],
      [ qr{${root}\d{4}/\d{2}/\d{2}$},                         30 ],
      [ qr{${root}\d{4}/.+$},                                  21 ],
      [ qr{${root}1000-print.*$},                              20 ],
      [ qr{${root}Pictures/9999-[Ss]ource.*$},                 11 ],
      [ qr{${root}Pictures/lap/00-pix.*$},                     10 ],
      [ qr{${root}(home|0000-incoming).+$},                     2 ],
      [ qr{${root}tank.+$},                                     2 ],
      [ qr{${root}?$},                                          3 ],
   );

   my $points;
   for my $rule (@rules) {
      my ($pattern, $value) = @$rule;
      next if ($fpath !~ $pattern);
      $points = $value;
      last;
   }

   if (!defined $points) {
      die( "########################################\n"
          ."##\n"
          ."##\tUNRATED\t$fpath\n"
          ."##\n"
          ."########################################\n" );
   }
   $score += $points;

   # Thumbnail / album-cache folders rank just below their parent.
   for my $marker (".album", ".xvpics", "/thumb") {
      $score -= 1 if ($score > 0 && index($fpath, $marker) >= 0);
   }
   return $score;
} #~sub

#---------------------------------------------------------#

# Load the input: one "hash;name;size;path" line per file on STDIN.
# Every usable line becomes a record (array ref) that is appended to
# @records and registered in %lcfname_map under its name;size;path key.
my $l=0;
# Read STDIN line by line; the bare "while()" never consumed any input.
while (my $line = <STDIN>) {
   chomp $line;
   next if($line =~ m/^\s*$/);
   print STDERR "HUH?" if( $line !~ m/;/);
   # A limit of 4 keeps any ';' inside the path and keeps empty trailing
   # fields; padding guarantees that the four slots always exist.
   my @file_rec = split(/;/, $line, 4);
   push(@file_rec, "") while (@file_rec < 4);
   print STDERR "bad FHASH" if ($file_rec[FHASH] eq "");
   print STDERR "bad FNAME" if ($file_rec[FNAME] eq "");
   print STDERR "bad FSIZE" if ($file_rec[FSIZE] eq "");
   print STDERR "bad FPATH" if ($file_rec[FPATH] eq "");
   # Without a name, size and path the record can be neither keyed nor
   # scored, so it is left out (the hash is only reported).
   next if (grep { $file_rec[$_] eq "" } (FNAME, FSIZE, FPATH));

   # Assign by slot so SCORE and LCNAME always land where the constants
   # say, whatever the split produced.
   $file_rec[SCORE]  = DUNNO;                  # score
   $file_rec[LCNAME] = lc($file_rec[FNAME]);   # lc name
   push(@records, \@file_rec);

   $lcfname_map{ lcMapKeyOf( \@file_rec ) } = \@file_rec;
}
close(STDIN);
$l = @records;
print "loaded $l records\n";
$l = 0;
print "building file path mapping\n";

# Group the paths of every record two ways:
#   %file_sizes : "lc(name);size" => [ paths ]
#   %file_names : lc(name) => { size => [ paths ] }
for my $ra_rec (@records) {
   ##                         ##
   ## filename+size => path   ##
   ##                         ##
   my $key = recordKeyOf($ra_rec);
   # push() needs a real array: pushing onto a bare reference was an
   # experimental feature that newer perls reject.  Dereferencing
   # creates the missing array (and hash level) on first use.
   push(@{ $file_sizes{$key} }, $ra_rec->[FPATH]);
   ##                            ##
   ## filename => size => path   ##
   ##                            ##
   my $lcname = lc($ra_rec->[FNAME]);
   push(@{ $file_names{$lcname}{ $ra_rec->[FSIZE] } }, $ra_rec->[FPATH]);
}

print "pondering file path duplicates\n";
# Store, for each record's name+size key, how many paths %file_sizes
# holds for it.  A key with no paths at all aborts the run.
for my $ra_rec (@records) {
   my $key    = recordKeyOf($ra_rec);
   my $copies = scalar @{ $file_sizes{$key} };
   if ($copies == 0) {
      die("no paths for $key ???\n");
   }
   $path_counts{$key} = $copies;
}

# Two listings used to be computed here: the name+size keys ordered by
# descending path count, and the name / size / path hierarchy of
# %file_names.  Neither result was printed, stored or read afterwards,
# so only the progress message is kept.
print "sorted path count keys\n";

##----------------------------------------------------##
##                                                    ##
##       Now get down to JUDGING the files            ##
##                                                    ##
##----------------------------------------------------##

# First judging pass, one file name at a time.
# For every size recorded under the name, each path is either rejected
# outright (SCORE = DUMP) or rated with pathScore().  The paths are then
# ranked best-first; the first record that is still undecided becomes
# the keeper (SCORE = KEEP) and the remaining undecided ones are dumped.
# A name / size / score+path listing is printed per file name.
for my $lcfname (sort keys %::file_names) {
   # keys() needs a real hash; calling it on a bare reference was an
   # experimental feature that newer perls reject.
   my @fsize_keys    = keys %{ $::file_names{$lcfname} };
   my $fname_ct      = 0;
   my $msg           = "\n$lcfname";
   # Scores are kept per file name, not per size, so the listing under a
   # later size also shows the paths rated under earlier sizes.  Those
   # normally have no record for the current size and are skipped below.
   my %scores        = ();
   # Sizes are visited in string order ("21850" before "4029").
   for my $fsize (sort @fsize_keys) {
      $fname_ct++;
      $msg .= "\n\t$fsize\n";

      for my $path (@{$::file_names{$lcfname}{$fsize}}) {
         my $key        = "$lcfname;$fsize;$path";
         my $ra_rec     = $lcfname_map{$key};

         die( "NO RA_REC $key????" ) if (!defined $ra_rec);
         if (!defined $ra_rec->[SCORE]) {
            print STDERR "NO RA_REC score\n";
            $ra_rec->[SCORE] = DUNNO;
         }

         # Rejected paths stay in the listing with a score of 0.
         $scores{$path} = 0;
         if (   $ra_rec->[FSIZE] < 4
             || hasLoserName( $ra_rec )
             || areMultipleCopiesInLoserPath( $ra_rec )
             || ( $lcfname =~ /.*[~]+$/ && checkForShorterName( $ra_rec, $lcfname ) ) ) {
            $ra_rec->[SCORE] = DUMP;
            next;
         }

         $scores{$path} = pathScore( $path );
      } #~for path

      # now sort out the scores, keep the best
      next if (keys %scores < 1 );

      my $top = "";

      # Best score first; equal scores are ordered by path so that the
      # winner does not depend on hash ordering.
      for my $path ( sort { ($scores{$b} <=> $scores{$a}) || ($a cmp $b) } keys %scores ) {
         my $score   = $scores{$path};
         $msg .= "\t\t$score\t$path\n";

         my $ra_rec  = $lcfname_map{"$lcfname;$fsize;$path"};
         next if (!defined $ra_rec);

         if ($top eq "" && $ra_rec->[SCORE] == DUNNO) {
            $top = $path;
            $ra_rec->[SCORE] = KEEP;
         }
         elsif ($ra_rec->[SCORE] == DUNNO) {
            $ra_rec->[SCORE] = DUMP;
         }
      }

   } #~for size

   print "\n$fname_ct\t$msg";

} # for lcname

# Second judging pass: settle every record that is neither DUMP nor KEEP
# yet.  The same rejection tests are applied; a record that passes them
# is kept when pathScore() rates its path above zero and dumped
# otherwise.
for my $ra_rec (@records) {

   next if ( $ra_rec->[SCORE] == DUMP || $ra_rec->[SCORE] == KEEP );

   # The backup test looks at the file NAME (lower-cased, as the lookup
   # tables are keyed); it used to match the directory path while
   # passing the name on.
   my $lcname = lc($ra_rec->[FNAME]);
   if (   $ra_rec->[FSIZE] < 4
       || hasLoserName( $ra_rec )
       || areMultipleCopiesInLoserPath( $ra_rec )
       || ( $lcname =~ /.*[~]+$/ && checkForShorterName( $ra_rec, $lcname ) ) ) {
      $ra_rec->[SCORE] = DUMP;
      next;
   }

   my $score = pathScore( $ra_rec->[FPATH] );
   $ra_rec->[SCORE] = ( $score > 0 ) ? KEEP : DUMP ;
}

#now go through and tally the number of items killed
# Sort every record into one of three lists by its SCORE slot.
# The lists start out empty: "= []" would have stored one array
# reference in each of them and inflated every count by one.
our @winners   = ();
our @losers    = ();
our @undecided = ();
for my $ra_rec (@records) {
   if ( $ra_rec->[SCORE] == DUMP ) {
      push @losers, $ra_rec;
   }
   elsif ( $ra_rec->[SCORE] == KEEP ) {
      push @winners, $ra_rec;
   }
   else {
      my $key = ::recordKeyOf($ra_rec);
      print STDERR "UNDECIDED: $key\n";
      push @undecided, $ra_rec;
   }
}

# Report the totals, then write the two result files:
#   zz-keep-paths.txt    - one kept file (path/name) per line
#   zz-discard-paths.txt - one "rm" command per discarded file
my $msg = "There were "
        . scalar(@undecided) . " undecided, "
        . scalar(@winners)   . " winners and "
        . scalar(@losers)    . " losers.\n";
print $msg;

# Any record that is still undecided aborts the run before a file is
# written.
my $fh;
for my $ra_rec (@undecided) {
   next if( !defined $ra_rec || !defined $ra_rec->[FNAME]);
   my $key = lcMapKeyOf( $ra_rec );
   die( "RE-JUDGE>>>> $key\n");
}

# Wrap a string in single quotes for the shell; an embedded single quote
# is written as '\'' so that spaces, commas and the like stay literal.
my $shell_quote = sub {
   my $text = shift;
   $text =~ s/'/'\\''/g;
   return "'".$text."'";
};

open($fh, ">", "zz-keep-paths.txt")
   or die("cannot open zz-keep-paths.txt: $!\n");
for my $ra_rec (@winners) {
   next if (!defined $ra_rec->[FPATH]);
   next if (!defined $ra_rec->[FNAME]);
   my $fq_name = $ra_rec->[FPATH]."/".$ra_rec->[FNAME];
   print {$fh} "$fq_name\n";
}
# Buffered write errors only show up when the handle is closed.
close($fh) or die("cannot close zz-keep-paths.txt: $!\n");

open($fh, ">", "zz-discard-paths.txt")
   or die("cannot open zz-discard-paths.txt: $!\n");
for my $ra_rec (@losers) {
   next if (!defined $ra_rec->[FPATH]);
   next if (!defined $ra_rec->[FNAME]);
   my $fq_name = $ra_rec->[FPATH]."/".$ra_rec->[FNAME];
   # Quoted, and preceded by "--", so that a name containing spaces or
   # starting with "-" removes exactly that one file.
   print {$fh} "rm -- ".$shell_quote->($fq_name)."\n";
}
close($fh) or die("cannot close zz-discard-paths.txt: $!\n");
#eof
%d bloggers like this: