OK, so what started out as a bash script grew into a rather finicky Perl script. I used a bunch of parallel hashes, judging things by combinations duplicate names, identical file sizes and actually scoring the path name and taking the highest score. I ended up no using the file hashes, because I decided that the path names and file sizes were enough information.
It prints out a lot of debugging output, but mostly it is heirarchical: file name/file size/path name.
20021120 foggy morning, church - as.jpg 1476 184 /tank/pictures/9999-Source/2002/2002-12-00-bellingham/thumb 21850 185 /tank/pictures/9999-Source/2002/2002-12-00-bellingham 184 /tank/pictures/9999-Source/2002/2002-12-00-bellingham/thumb 4029 185 /tank/pictures/9999-Source/2002/2002-12-00-bellingham 184 /tank/pictures/9999-Source/2002/2002-12-00-bellingham/thumb 184 /tank/pictures/9999-Source/2002/2002-12-00-bellingham/.xvpics
And here is the script itself:
# copyright 2013 Jed Reynolds #!/usr/bin/perl -w use strict; use warnings; use Carp; use Data::Dumper; #use Data::Dumper::Perltidy; $Data::Dumper::Indent = 1; $Data::Dumper::Sortkeys = 1; $Data::Dumper::Useqq = 1; package main; use constant { FHASH => 0, FNAME => 1, FSIZE => 2, FPATH => 3, SCORE => 4, LCNAME => 5, KEEP => 100, DUMP => -1, DUNNO => 999, }; our %file_names = (); our %path_counts = (); our %file_sizes = (); our @records = (); our %lcfname_map = (); our @loser_files = ( '.project', '.recent', '.recent-1', '.recent-2', '.recent-3', '.recent-4', '.recent-5', 'thumbs.db', 'thumbs.db~', '.picasa.ini', 'zpool.txt', ); our @loser_str = ( '.xml', '.html', 'thumbs.db', 'Thumbs.db', 'thumbs.db~', 'Thumbs.db~', '.picasa.ini', 'z.pl', 'zpool.txt', '.php', '.ini', '.pdf', '.odt', ); our @loser_paths = ( 'pictures/Documents', 'AptanaStudio', '100NIKON', '101NIKON', '102NIKON', '100PENTX', '101PENTX', '102PENTX', '100FUGI', '101FUGI', '102FUGI', '100FUJI', '101FUJI', '102FUJI', 'DCIM', 'pictures/Pictures/lap', 'pictures/lap', 'pictures/pictures', 'Pictures/Pictures', 'Pictures/tank', 'tank/tank', #'0000-incoming', 'pictures/mnt', #'home/Pictures', 'pictures/bin', '999-copied', 'Documents/Candela', ); sub lcMapKeyOf { my $ra_rec = shift; die("asRecordKey: wants record ref,bye") if(!defined $ra_rec); my $key = lc(@$ra_rec[FNAME]).";".@$ra_rec[FSIZE].";".@$ra_rec[FPATH]; return $key; } sub recordKeyOf { my $ra_rec = shift; die("asRecordKey: wants record ref,bye") if(!$ra_rec); my $key = lc(@$ra_rec[FNAME]).";".@$ra_rec[FSIZE]; return $key; }; sub areMultipleCopiesInLoserPath { my $ra_rec = shift; die("areMultipleCopies: norecord,bye") unless( $ra_rec ); my $rv = 0; my $key = ::recordKeyOf($ra_rec); my $path = @$ra_rec[FPATH]; my $num = $::path_counts{$key}; my $strikes = 0; if ( $num > 1 ) { for my $loser (@::loser_paths) { if( index($path, $loser) > -1 ) { $strikes += 1; print STDERR "multi:$path - $num\n"; } } } # if everthing is a trike then we want to # judge it using score system $rv = (($strikes > 0) && ($strikes < $num)) ? 1 : 0; return $rv; } #~sub sub hasLoserName { my $ra_rec = shift; my $fname = @$ra_rec[FNAME]; if( $fname ~~ @::loser_files ) { #print "\n$fname member of loser_files\n"; return 1; } for my $loser (@::loser_str) { if( index($fname, $loser) > -1) { #print "\n$fname substr of loser_str $loser\n"; return 1; } } for my $loser (@::loser_paths) { if( index(@$ra_rec[FPATH], $loser) > -1 ) { #print "\n".@$ra_rec[FPATH]." substr of loser_path $loser\n"; return 1; } } return 0; } sub checkForShorterName { my $rv = 0; my $ra_rec = shift; my $fname = shift; my ($shorter) = ($fname =~ m/(.*)[~]+$/); #print STDERR "shorter[$shorter] fname[$fname]\n"; my $nkey = lc($shorter).";".@$ra_rec[FSIZE]; if (defined $::path_counts{$nkey} && $::path_counts{$nkey} > 0) { #print "LOSER: backup has existing copies $nkey\n"; $rv = 1; return $rv; } #print "zero path_counts[$nkey] checking file_names[$shorter]\n"; my $rh_sizes = $::file_names{$shorter}; if (defined $rh_sizes ) { my $num = keys %$rh_sizes; if ($num > 1) { #print "LOSER: existing copies of $nkey\n"; $rv = 1; } } return $rv; } #~sub sub pathScore { my $fpath = shift; die("pathScore: blank path, bye.") if ($fpath eq ""); my $s ='/'; my $pre = '/tank/pictures/'; my $score = 0; if ($fpath =~ /20\d\d/) { my ($year) = ($fpath =~ m/(20\d\d)/); $score += ((2013 - $year) * 10); } if ($fpath =~ /${pre}0100-Projects${s}\d{4}${s}.+$/ ) { $score += 200; } elsif ($fpath =~ /${pre}Chris-Nelsen.+$/ ) { $score += 170; } elsif ($fpath =~ /${pre}0100-Projects${s}\d{4}${s}\d{4}-\d{2}-\d{2}$/ ) { $score += 160; } elsif ($fpath =~ /${pre}0100-Projects${s}\d{4}-\d{2}-\d{2}.+$/ ) { $score += 155; } elsif ($fpath =~ /${pre}0100-Projects${s}\d{4}-\d{2}-\d{2}$/ ) { $score += 150; } elsif ($fpath =~ /${pre}0100-Projects${s}\d{4}$/ ) { $score += 140; } elsif ($fpath =~ /${pre}0100-Projects${s}333-dng$/ ) { $score += 130; } elsif ($fpath =~ /${pre}0100-Projects.+$/ ) { $score += 125; } elsif ($fpath =~ /${pre}0100-Projects$/ ) { $score += 120; } elsif ($fpath =~ /${pre}9999-Source${s}\d{4}${s}\d{4}-\d{2}-\d{2}$/ ) { $score += 100; } elsif ($fpath =~ /${pre}9999-Source${s}\d{4}${s}\d{4}.+$/ ) { $score += 75; } elsif ($fpath =~ /${pre}9999-Source${s}\d{4}${s}.+$/ ) { $score += 71; } elsif ($fpath =~ /${pre}9999-Source${s}\d{4}$/ ) { $score += 60; } elsif ($fpath =~ /${pre}9999-Source${s}.+$/ ) { $score += 40; } elsif ($fpath =~ /${pre}9999-Source$/ ) { $score += 50; } elsif ($fpath =~ /${pre}(3p|gmaps)$/ ) { $score += 60; } elsif ($fpath =~ /${pre}Roz.+$/ ) { $score += 70; } elsif ($fpath =~ /${pre}[Ww]eb.+$/ ) { $score += 3; } elsif ($fpath =~ /${pre}\d+-[Mm]ovies${s}?.*$/ ) { $score += 10; } elsif ($fpath =~ /${pre}\d+-[Mm]ovie$/ ) { $score += 10; } elsif ($fpath =~ /${pre}8888-[Ss]creensaver${s}?.*$/ ) { $score += 2; } elsif ($fpath =~ /${pre}8888-[Ll]iam.*$/ ) { $score += 60; } elsif ($fpath =~ /${pre}0200-[Dd]esk.*$/ ) { $score += 50; } elsif ($fpath =~ /${pre}\d{4}-\d{2}-\d{2}${s}.+$/ ) { $score += 41; } elsif ($fpath =~ /${pre}\d{4}-\d{2}-\d{2}$/ ) { $score += 31; } elsif ($fpath =~ /${pre}\d{4}${s}\d{2}${s}\d{2}$/ ) { $score += 30; } elsif ($fpath =~ /${pre}\d{4}${s}.+$/ ) { $score += 21; } elsif ($fpath =~ /${pre}1000-print.*$/ ) { $score += 20; } elsif ($fpath =~ /${pre}Pictures${s}9999-[Ss]ource.*$/ ) { $score += 11; } elsif ($fpath =~ /${pre}Pictures${s}lap${s}00-pix.*$/ ) { $score += 10; } elsif ($fpath =~ /${pre}(home|0000-incoming).+$/ ) { $score += 2; } elsif ($fpath =~ /${pre}tank.+$/ ) { $score += 2; } elsif ($fpath =~ /${pre}?$/ ) { $score += 3; } else { my $v = "########################################\n" ."##\n" ."##\tUNRATED\t$fpath\n" ."##\n" ."########################################\n"; die( $v ); } if( $score > 0 && index($fpath, ".album") >= 0) { $score -= 1; } if( $score > 0 && index($fpath, ".xvpics") >= 0) { $score -= 1; } if( $score > 0 && index($fpath, "/thumb") >= 0) { $score -= 1; } return $score; } #~sub #---------------------------------------------------------# my $l=0; while() { chomp; next if(m/^\s*$/); print STDERR "HUH?" if( ! m/;/); my @file_rec = split(';'); print STDERR "bad FHASH" if ($file_rec[FHASH] eq ""); print STDERR "bad FNAME" if ($file_rec[FNAME] eq ""); print STDERR "bad FSIZE" if ($file_rec[FSIZE] eq ""); print STDERR "bad FPATH" if ($file_rec[FPATH] eq ""); push(@file_rec, DUNNO); # score push(@file_rec, lc($file_rec[FNAME])); # lc name push(@records, \@file_rec); my $lckey = lcMapKeyOf( \@file_rec ); $lcfname_map{ $lckey } = \@file_rec; $l++; } close(STDIN); $l = @records; print "loaded $l records\n"; $l = 0; print "building file path mapping\n"; for my $ra_rec (@records) { ## ## ## filename+size => path ## ## ## my $key = recordKeyOf($ra_rec); $file_sizes{$key} = [] if(! $file_sizes{$key}); push($file_sizes{$key}, @$ra_rec[FPATH]); ## ## ## filename => size => path ## ## ## $key = lc(@$ra_rec[FNAME]); $file_names{$key} = () if(! $file_names{$key}); $file_names{$key}{ @$ra_rec[FSIZE] } = [] if(! $file_names{$key}{ @$ra_rec[FSIZE] }); push( $file_names{$key}{ @$ra_rec[FSIZE] }, @$ra_rec[FPATH]); } print "pondering file path duplicates\n"; for my $ra_rec (@records) { my $key = recordKeyOf($ra_rec); my $ra_paths = $file_sizes{$key}; my $n = @$ra_paths; die("no paths for $key ???\n") if (!$n); $path_counts{$key} = $n; } #(sort { ($userids{$a} cmp $userids{$b}) || ($a cmp $b) } keys %userids) my $putty = "\n\t\t"; my @sorted_keys = sort { ($path_counts{$b} <=> $path_counts{$a}) || ($a cmp $b) } keys %path_counts; print "sorted path count keys\n"; for my $key (@sorted_keys) { my $cnt = 0 + $path_counts{$key}; next if ($cnt == 1); my @sorted_paths = sort @{ $file_sizes{$key} }; } @sorted_keys = sort {$a cmp $b} keys %file_names; # this is a heirarchy of various duplicate paths my @sorted_sizes; for my $key (@sorted_keys) { my $rh_sizes = $file_names{$key}; @sorted_sizes = sort keys %$rh_sizes; for my $fsize (@sorted_sizes) { my $ra_paths = $rh_sizes->{$fsize}; } } ##----------------------------------------------------## ## ## ## Now get down to JUDGING the files ## ## ## ##----------------------------------------------------## for my $lcfname (sort keys %::file_names) { my @fsize_keys = keys $::file_names{$lcfname}; my $fname_ct = 0; my $msg = "\n$lcfname"; my %scores = (); for my $fsize (sort @fsize_keys) { $fname_ct++; my $fpath_ct = @{$::file_names{$lcfname}{$fsize}}; $msg .= "\n\t$fsize\n"; for my $path (@{$::file_names{$lcfname}{$fsize}}) { my $key = "$lcfname;$fsize;$path"; my $ra_rec = $lcfname_map{$key}; if (!defined $ra_rec) { die( "NO RA_REC $key????" ); next; } if (!defined @$ra_rec[SCORE]) { print STDERR "NO RA_REC score\n"; @$ra_rec[SCORE] = DUNNO; } my $score = 0; $scores{$path} = $score; if (@$ra_rec[FSIZE] < 4 ) { @$ra_rec[SCORE] = DUMP; next; } if ( hasLoserName( $ra_rec ) ) { @$ra_rec[SCORE] = DUMP; next; } if( areMultipleCopiesInLoserPath($ra_rec) ) { @$ra_rec[SCORE] = DUMP; next; } if ( $lcfname =~ /.*[~]+$/ && checkForShorterName( $ra_rec, $lcfname )) { @$ra_rec[SCORE] = DUMP; next; } $scores{$path} = pathScore( $path ); } #~for path # now sort out the scores, keep the best next if (keys %scores < 1 ); my $top = ""; for my $path ( sort{$scores{$b} <=> $scores{$a} } keys %scores ) { #print STDERR "path: $path\n"; my $score = $scores{$path}; $msg .= "\t\t$score\t$path\n"; my $key = "$lcfname;$fsize;$path"; next if (!defined $lcfname_map{$key}); my $ra_rec = $lcfname_map{$key}; if (!defined $ra_rec) { die( "NO RA_REC $key????" ); next; } #print STDERR "score:".@$ra_rec[SCORE]."\n"; if ($top eq "" && @$ra_rec[SCORE] == DUNNO) { $top = $path; @$ra_rec[SCORE] = KEEP; } elsif (@$ra_rec[SCORE] == DUNNO) { @$ra_rec[SCORE] = DUMP; } } } #~for size print "\n$fname_ct\t$msg"; } # for lcname for my $ra_rec (@records) { if ( @$ra_rec[SCORE] == DUMP || @$ra_rec[SCORE] == KEEP) { next; } if (@$ra_rec[FSIZE] < 4 ) { @$ra_rec[SCORE] = DUMP; next; } if ( hasLoserName( $ra_rec ) ) { @$ra_rec[SCORE] = DUMP; next; } if( areMultipleCopiesInLoserPath($ra_rec) ) { @$ra_rec[SCORE] = DUMP; next; } if ( @$ra_rec[FPATH] =~ /.*[~]+$/ && checkForShorterName( $ra_rec, @$ra_rec[FNAME] )) { @$ra_rec[SCORE] = DUMP; next; } my $score = 0; $score = pathScore( @$ra_rec[FPATH] ); @$ra_rec[SCORE] = ( $score > 0 ) ? KEEP : DUMP ; } #now go through and tally the number of items killed our @winners = []; our @losers = []; our @undecided = []; for my $ra_rec (@records) { my $key = ::recordKeyOf($ra_rec); if ( @$ra_rec[SCORE] == DUMP ) { push @losers, $ra_rec; #my $safety = 0+$path_counts{$key}; } elsif ( @$ra_rec[SCORE] == KEEP ) { #my $ct = 0+$path_counts{$key} ; #my $ix = index($key, '~'); #if ( $ct > 1 ) { # my $lckey = lcMapKeyOf( $ra_rec ); # my $ra_paths = $file_sizes{$key}; # my $m = "\n\t".join( "\n\t", @$ra_paths ); # print "WIN? $key: $m\n"; #} #if ($ix > -1) { # my $lckey = lcMapKeyOf( $ra_rec ); # print STDERR "WIN? $key [$lckey]: ct[$ct] backup-symbol[$ix]\n"; #} push @winners, $ra_rec; } else { print STDERR "UNDECIDED: $key\n"; push @undecided, $ra_rec; } } my $msg = "There were "; $msg .= @undecided; $msg .= " undecided, "; $msg .= @winners; $msg .= " winners and "; $msg .= @losers; $msg .= " losers.\n"; print $msg; my $fh; for my $ra_rec (@undecided) { next if( !defined $ra_rec || !defined @$ra_rec[FNAME]); my $key = lcMapKeyOf( $ra_rec ); die( "RE-JUDGE>>>> $key\n"); } open($fh, ">", "zz-keep-paths.txt"); for my $ra_rec (@winners) { next if (!defined @$ra_rec[FPATH]); next if (!defined @$ra_rec[FNAME]); my $fq_name = @$ra_rec[FPATH]."/".@$ra_rec[FNAME]; print $fh "$fq_name\n"; } close $fh; open($fh, ">", "zz-discard-paths.txt"); for my $ra_rec (@losers) { next if (!defined @$ra_rec[FPATH]); next if (!defined @$ra_rec[FNAME]); my $fq_name = @$ra_rec[FPATH]."/".@$ra_rec[FNAME]; print $fh "rm $fq_name\n"; } close $fh; #eof