OK, so what started out as a bash script grew into a rather finicky Perl script. I used a bunch of parallel hashes, judging things by combinations duplicate names, identical file sizes and actually scoring the path name and taking the highest score. I ended up no using the file hashes, because I decided that the path names and file sizes were enough information.
It prints out a lot of debugging output, but mostly it is heirarchical: file name/file size/path name.
20021120 foggy morning, church - as.jpg 1476 184 /tank/pictures/9999-Source/2002/2002-12-00-bellingham/thumb 21850 185 /tank/pictures/9999-Source/2002/2002-12-00-bellingham 184 /tank/pictures/9999-Source/2002/2002-12-00-bellingham/thumb 4029 185 /tank/pictures/9999-Source/2002/2002-12-00-bellingham 184 /tank/pictures/9999-Source/2002/2002-12-00-bellingham/thumb 184 /tank/pictures/9999-Source/2002/2002-12-00-bellingham/.xvpics
And here is the script itself:
# copyright 2013 Jed Reynolds
#!/usr/bin/perl -w
use strict;
use warnings;
use Carp;
use Data::Dumper;
#use Data::Dumper::Perltidy;
$Data::Dumper::Indent = 1;
$Data::Dumper::Sortkeys = 1;
$Data::Dumper::Useqq = 1;
package main;
use constant {
FHASH => 0,
FNAME => 1,
FSIZE => 2,
FPATH => 3,
SCORE => 4,
LCNAME => 5,
KEEP => 100,
DUMP => -1,
DUNNO => 999,
};
our %file_names = ();
our %path_counts = ();
our %file_sizes = ();
our @records = ();
our %lcfname_map = ();
our @loser_files = (
'.project',
'.recent',
'.recent-1',
'.recent-2',
'.recent-3',
'.recent-4',
'.recent-5',
'thumbs.db',
'thumbs.db~',
'.picasa.ini',
'zpool.txt',
);
our @loser_str = (
'.xml',
'.html',
'thumbs.db',
'Thumbs.db',
'thumbs.db~',
'Thumbs.db~',
'.picasa.ini',
'z.pl',
'zpool.txt',
'.php',
'.ini',
'.pdf',
'.odt',
);
our @loser_paths = (
'pictures/Documents',
'AptanaStudio',
'100NIKON',
'101NIKON',
'102NIKON',
'100PENTX',
'101PENTX',
'102PENTX',
'100FUGI',
'101FUGI',
'102FUGI',
'100FUJI',
'101FUJI',
'102FUJI',
'DCIM',
'pictures/Pictures/lap',
'pictures/lap',
'pictures/pictures',
'Pictures/Pictures',
'Pictures/tank',
'tank/tank',
#'0000-incoming',
'pictures/mnt',
#'home/Pictures',
'pictures/bin',
'999-copied',
'Documents/Candela',
);
sub lcMapKeyOf {
my $ra_rec = shift;
die("asRecordKey: wants record ref,bye") if(!defined $ra_rec);
my $key = lc(@$ra_rec[FNAME]).";".@$ra_rec[FSIZE].";".@$ra_rec[FPATH];
return $key;
}
sub recordKeyOf {
my $ra_rec = shift;
die("asRecordKey: wants record ref,bye") if(!$ra_rec);
my $key = lc(@$ra_rec[FNAME]).";".@$ra_rec[FSIZE];
return $key;
};
sub areMultipleCopiesInLoserPath {
my $ra_rec = shift;
die("areMultipleCopies: norecord,bye") unless( $ra_rec );
my $rv = 0;
my $key = ::recordKeyOf($ra_rec);
my $path = @$ra_rec[FPATH];
my $num = $::path_counts{$key};
my $strikes = 0;
if ( $num > 1 ) {
for my $loser (@::loser_paths) {
if( index($path, $loser) > -1 ) {
$strikes += 1;
print STDERR "multi:$path - $num\n";
}
}
}
# if everthing is a trike then we want to
# judge it using score system
$rv = (($strikes > 0) && ($strikes < $num)) ? 1 : 0; return $rv; } #~sub sub hasLoserName { my $ra_rec = shift; my $fname = @$ra_rec[FNAME]; if( $fname ~~ @::loser_files ) { #print "\n$fname member of loser_files\n"; return 1; } for my $loser (@::loser_str) { if( index($fname, $loser) > -1) {
#print "\n$fname substr of loser_str $loser\n";
return 1;
}
}
for my $loser (@::loser_paths) {
if( index(@$ra_rec[FPATH], $loser) > -1 ) {
#print "\n".@$ra_rec[FPATH]." substr of loser_path $loser\n";
return 1;
}
}
return 0;
}
sub checkForShorterName {
my $rv = 0;
my $ra_rec = shift;
my $fname = shift;
my ($shorter) = ($fname =~ m/(.*)[~]+$/);
#print STDERR "shorter[$shorter] fname[$fname]\n";
my $nkey = lc($shorter).";".@$ra_rec[FSIZE];
if (defined $::path_counts{$nkey} && $::path_counts{$nkey} > 0) {
#print "LOSER: backup has existing copies $nkey\n";
$rv = 1;
return $rv;
}
#print "zero path_counts[$nkey] checking file_names[$shorter]\n";
my $rh_sizes = $::file_names{$shorter};
if (defined $rh_sizes ) {
my $num = keys %$rh_sizes;
if ($num > 1) {
#print "LOSER: existing copies of $nkey\n";
$rv = 1;
}
}
return $rv;
} #~sub
sub pathScore {
my $fpath = shift;
die("pathScore: blank path, bye.") if ($fpath eq "");
my $s ='/';
my $pre = '/tank/pictures/';
my $score = 0;
if ($fpath =~ /20\d\d/) {
my ($year) = ($fpath =~ m/(20\d\d)/);
$score += ((2013 - $year) * 10);
}
if ($fpath =~ /${pre}0100-Projects${s}\d{4}${s}.+$/ ) {
$score += 200;
}
elsif ($fpath =~ /${pre}Chris-Nelsen.+$/ ) {
$score += 170;
}
elsif ($fpath =~ /${pre}0100-Projects${s}\d{4}${s}\d{4}-\d{2}-\d{2}$/ ) {
$score += 160;
}
elsif ($fpath =~ /${pre}0100-Projects${s}\d{4}-\d{2}-\d{2}.+$/ ) {
$score += 155;
}
elsif ($fpath =~ /${pre}0100-Projects${s}\d{4}-\d{2}-\d{2}$/ ) {
$score += 150;
}
elsif ($fpath =~ /${pre}0100-Projects${s}\d{4}$/ ) {
$score += 140;
}
elsif ($fpath =~ /${pre}0100-Projects${s}333-dng$/ ) {
$score += 130;
}
elsif ($fpath =~ /${pre}0100-Projects.+$/ ) {
$score += 125;
}
elsif ($fpath =~ /${pre}0100-Projects$/ ) {
$score += 120;
}
elsif ($fpath =~ /${pre}9999-Source${s}\d{4}${s}\d{4}-\d{2}-\d{2}$/ ) {
$score += 100;
}
elsif ($fpath =~ /${pre}9999-Source${s}\d{4}${s}\d{4}.+$/ ) {
$score += 75;
}
elsif ($fpath =~ /${pre}9999-Source${s}\d{4}${s}.+$/ ) {
$score += 71;
}
elsif ($fpath =~ /${pre}9999-Source${s}\d{4}$/ ) {
$score += 60;
}
elsif ($fpath =~ /${pre}9999-Source${s}.+$/ ) {
$score += 40;
}
elsif ($fpath =~ /${pre}9999-Source$/ ) {
$score += 50;
}
elsif ($fpath =~ /${pre}(3p|gmaps)$/ ) {
$score += 60;
}
elsif ($fpath =~ /${pre}Roz.+$/ ) {
$score += 70;
}
elsif ($fpath =~ /${pre}[Ww]eb.+$/ ) {
$score += 3;
}
elsif ($fpath =~ /${pre}\d+-[Mm]ovies${s}?.*$/ ) {
$score += 10;
}
elsif ($fpath =~ /${pre}\d+-[Mm]ovie$/ ) {
$score += 10;
}
elsif ($fpath =~ /${pre}8888-[Ss]creensaver${s}?.*$/ ) {
$score += 2;
}
elsif ($fpath =~ /${pre}8888-[Ll]iam.*$/ ) {
$score += 60;
}
elsif ($fpath =~ /${pre}0200-[Dd]esk.*$/ ) {
$score += 50;
}
elsif ($fpath =~ /${pre}\d{4}-\d{2}-\d{2}${s}.+$/ ) {
$score += 41;
}
elsif ($fpath =~ /${pre}\d{4}-\d{2}-\d{2}$/ ) {
$score += 31;
}
elsif ($fpath =~ /${pre}\d{4}${s}\d{2}${s}\d{2}$/ ) {
$score += 30;
}
elsif ($fpath =~ /${pre}\d{4}${s}.+$/ ) {
$score += 21;
}
elsif ($fpath =~ /${pre}1000-print.*$/ ) {
$score += 20;
}
elsif ($fpath =~ /${pre}Pictures${s}9999-[Ss]ource.*$/ ) {
$score += 11;
}
elsif ($fpath =~ /${pre}Pictures${s}lap${s}00-pix.*$/ ) {
$score += 10;
}
elsif ($fpath =~ /${pre}(home|0000-incoming).+$/ ) {
$score += 2;
}
elsif ($fpath =~ /${pre}tank.+$/ ) {
$score += 2;
}
elsif ($fpath =~ /${pre}?$/ ) {
$score += 3;
}
else {
my $v = "########################################\n"
."##\n"
."##\tUNRATED\t$fpath\n"
."##\n"
."########################################\n";
die( $v );
}
if( $score > 0 && index($fpath, ".album") >= 0) {
$score -= 1;
}
if( $score > 0 && index($fpath, ".xvpics") >= 0) {
$score -= 1;
}
if( $score > 0 && index($fpath, "/thumb") >= 0) {
$score -= 1;
}
return $score;
} #~sub
#---------------------------------------------------------#
my $l=0;
while() {
chomp;
next if(m/^\s*$/);
print STDERR "HUH?" if( ! m/;/);
my @file_rec = split(';');
print STDERR "bad FHASH" if ($file_rec[FHASH] eq "");
print STDERR "bad FNAME" if ($file_rec[FNAME] eq "");
print STDERR "bad FSIZE" if ($file_rec[FSIZE] eq "");
print STDERR "bad FPATH" if ($file_rec[FPATH] eq "");
push(@file_rec, DUNNO); # score
push(@file_rec, lc($file_rec[FNAME])); # lc name
push(@records, \@file_rec);
my $lckey = lcMapKeyOf( \@file_rec );
$lcfname_map{ $lckey } = \@file_rec;
$l++;
}
close(STDIN);
$l = @records;
print "loaded $l records\n";
$l = 0;
print "building file path mapping\n";
for my $ra_rec (@records) {
## ##
## filename+size => path ##
## ##
my $key = recordKeyOf($ra_rec);
$file_sizes{$key} = [] if(! $file_sizes{$key});
push($file_sizes{$key}, @$ra_rec[FPATH]);
## ##
## filename => size => path ##
## ##
$key = lc(@$ra_rec[FNAME]);
$file_names{$key} = () if(! $file_names{$key});
$file_names{$key}{ @$ra_rec[FSIZE] } = [] if(! $file_names{$key}{ @$ra_rec[FSIZE] });
push( $file_names{$key}{ @$ra_rec[FSIZE] }, @$ra_rec[FPATH]);
}
print "pondering file path duplicates\n";
for my $ra_rec (@records) {
my $key = recordKeyOf($ra_rec);
my $ra_paths = $file_sizes{$key};
my $n = @$ra_paths;
die("no paths for $key ???\n") if (!$n);
$path_counts{$key} = $n;
}
#(sort { ($userids{$a} cmp $userids{$b}) || ($a cmp $b) } keys %userids)
my $putty = "\n\t\t";
my @sorted_keys = sort { ($path_counts{$b} <=> $path_counts{$a}) || ($a cmp $b) } keys %path_counts;
print "sorted path count keys\n";
for my $key (@sorted_keys) {
my $cnt = 0 + $path_counts{$key};
next if ($cnt == 1);
my @sorted_paths = sort @{ $file_sizes{$key} };
}
@sorted_keys = sort {$a cmp $b} keys %file_names;
# this is a heirarchy of various duplicate paths
my @sorted_sizes;
for my $key (@sorted_keys) {
my $rh_sizes = $file_names{$key};
@sorted_sizes = sort keys %$rh_sizes;
for my $fsize (@sorted_sizes) {
my $ra_paths = $rh_sizes->{$fsize};
}
}
##----------------------------------------------------##
## ##
## Now get down to JUDGING the files ##
## ##
##----------------------------------------------------##
for my $lcfname (sort keys %::file_names) {
my @fsize_keys = keys $::file_names{$lcfname};
my $fname_ct = 0;
my $msg = "\n$lcfname";
my %scores = ();
for my $fsize (sort @fsize_keys) {
$fname_ct++;
my $fpath_ct = @{$::file_names{$lcfname}{$fsize}};
$msg .= "\n\t$fsize\n";
for my $path (@{$::file_names{$lcfname}{$fsize}}) {
my $key = "$lcfname;$fsize;$path";
my $ra_rec = $lcfname_map{$key};
if (!defined $ra_rec) {
die( "NO RA_REC $key????" );
next;
}
if (!defined @$ra_rec[SCORE]) {
print STDERR "NO RA_REC score\n";
@$ra_rec[SCORE] = DUNNO;
}
my $score = 0;
$scores{$path} = $score;
if (@$ra_rec[FSIZE] < 4 ) {
@$ra_rec[SCORE] = DUMP;
next;
}
if ( hasLoserName( $ra_rec ) ) {
@$ra_rec[SCORE] = DUMP;
next;
}
if( areMultipleCopiesInLoserPath($ra_rec) ) {
@$ra_rec[SCORE] = DUMP;
next;
}
if ( $lcfname =~ /.*[~]+$/ && checkForShorterName( $ra_rec, $lcfname )) {
@$ra_rec[SCORE] = DUMP;
next;
}
$scores{$path} = pathScore( $path );
} #~for path
# now sort out the scores, keep the best
next if (keys %scores < 1 );
my $top = "";
for my $path ( sort{$scores{$b} <=> $scores{$a} } keys %scores ) {
#print STDERR "path: $path\n";
my $score = $scores{$path};
$msg .= "\t\t$score\t$path\n";
my $key = "$lcfname;$fsize;$path";
next if (!defined $lcfname_map{$key});
my $ra_rec = $lcfname_map{$key};
if (!defined $ra_rec) {
die( "NO RA_REC $key????" );
next;
}
#print STDERR "score:".@$ra_rec[SCORE]."\n";
if ($top eq "" && @$ra_rec[SCORE] == DUNNO) {
$top = $path;
@$ra_rec[SCORE] = KEEP;
}
elsif (@$ra_rec[SCORE] == DUNNO) {
@$ra_rec[SCORE] = DUMP;
}
}
} #~for size
print "\n$fname_ct\t$msg";
} # for lcname
for my $ra_rec (@records) {
if ( @$ra_rec[SCORE] == DUMP || @$ra_rec[SCORE] == KEEP) {
next;
}
if (@$ra_rec[FSIZE] < 4 ) { @$ra_rec[SCORE] = DUMP; next; } if ( hasLoserName( $ra_rec ) ) { @$ra_rec[SCORE] = DUMP; next; } if( areMultipleCopiesInLoserPath($ra_rec) ) { @$ra_rec[SCORE] = DUMP; next; } if ( @$ra_rec[FPATH] =~ /.*[~]+$/ && checkForShorterName( $ra_rec, @$ra_rec[FNAME] )) { @$ra_rec[SCORE] = DUMP; next; } my $score = 0; $score = pathScore( @$ra_rec[FPATH] ); @$ra_rec[SCORE] = ( $score > 0 ) ? KEEP : DUMP ;
}
#now go through and tally the number of items killed
our @winners = [];
our @losers = [];
our @undecided = [];
for my $ra_rec (@records) {
my $key = ::recordKeyOf($ra_rec);
if ( @$ra_rec[SCORE] == DUMP ) {
push @losers, $ra_rec;
#my $safety = 0+$path_counts{$key};
}
elsif ( @$ra_rec[SCORE] == KEEP ) {
#my $ct = 0+$path_counts{$key} ;
#my $ix = index($key, '~');
#if ( $ct > 1 ) {
# my $lckey = lcMapKeyOf( $ra_rec );
# my $ra_paths = $file_sizes{$key};
# my $m = "\n\t".join( "\n\t", @$ra_paths );
# print "WIN? $key: $m\n";
#}
#if ($ix > -1) {
# my $lckey = lcMapKeyOf( $ra_rec );
# print STDERR "WIN? $key [$lckey]: ct[$ct] backup-symbol[$ix]\n";
#}
push @winners, $ra_rec;
}
else {
print STDERR "UNDECIDED: $key\n";
push @undecided, $ra_rec;
}
}
my $msg = "There were ";
$msg .= @undecided;
$msg .= " undecided, ";
$msg .= @winners;
$msg .= " winners and ";
$msg .= @losers;
$msg .= " losers.\n";
print $msg;
my $fh;
for my $ra_rec (@undecided) {
next if( !defined $ra_rec || !defined @$ra_rec[FNAME]);
my $key = lcMapKeyOf( $ra_rec );
die( "RE-JUDGE>>>> $key\n");
}
open($fh, ">", "zz-keep-paths.txt");
for my $ra_rec (@winners) {
next if (!defined @$ra_rec[FPATH]);
next if (!defined @$ra_rec[FNAME]);
my $fq_name = @$ra_rec[FPATH]."/".@$ra_rec[FNAME];
print $fh "$fq_name\n";
}
close $fh;
open($fh, ">", "zz-discard-paths.txt");
for my $ra_rec (@losers) {
next if (!defined @$ra_rec[FPATH]);
next if (!defined @$ra_rec[FNAME]);
my $fq_name = @$ra_rec[FPATH]."/".@$ra_rec[FNAME];
print $fh "rm $fq_name\n";
}
close $fh;
#eof
