Deduplication, continued.

OK, so what started out as a bash script grew into a rather finicky Perl script. I used a bunch of parallel hashes, judging things by combinations of duplicate names and identical file sizes, and scoring each path name so I could keep the highest-scoring copy. I ended up not using the file hashes, because I decided that the path names and file sizes were enough information.

It prints out a lot of debugging output, but mostly it is hierarchical: file name / file size / path name.

20021120 foggy morning, church - as.jpg
1476
184    /tank/pictures/9999-Source/2002/2002-12-00-bellingham/thumb

21850
185    /tank/pictures/9999-Source/2002/2002-12-00-bellingham
184    /tank/pictures/9999-Source/2002/2002-12-00-bellingham/thumb

4029
185    /tank/pictures/9999-Source/2002/2002-12-00-bellingham
184    /tank/pictures/9999-Source/2002/2002-12-00-bellingham/thumb
184    /tank/pictures/9999-Source/2002/2002-12-00-bellingham/.xvpics

And here is the script itself:

#!/usr/bin/perl -w
# copyright 2013 Jed Reynolds
use strict;
use warnings;
use Carp;
use Data::Dumper;
#use Data::Dumper::Perltidy;
$Data::Dumper::Indent = 1;
$Data::Dumper::Sortkeys = 1;
$Data::Dumper::Useqq = 1;

package main;

use constant {
   FHASH    => 0,
   FNAME    => 1,
   FSIZE    => 2,
   FPATH    => 3,
   SCORE    => 4,
   LCNAME   => 5,
   KEEP     => 100,
   DUMP     => -1,
   DUNNO    => 999,
};
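
# parallel lookup hashes:
#   %file_names  - lowercased name => size => [ paths ]
#   %path_counts - "name;size" key => number of paths holding that file
#   %file_sizes  - "name;size" key => [ paths ]
#   @records     - one [hash, name, size, path, score, lcname] per input line
#   %lcfname_map - "name;size;path" key => its record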

our   %file_names    = ();
our   %path_counts   = ();
our   %file_sizes    = ();
our   @records       = ();
our   %lcfname_map   = ();
our @loser_files = (
   '.project',
   '.recent',
   '.recent-1',
   '.recent-2',
   '.recent-3',
   '.recent-4',
   '.recent-5',
   'thumbs.db',
   'thumbs.db~',
   '.picasa.ini',
   'zpool.txt',
);
our @loser_str = (
   '.xml',
   '.html',
   'thumbs.db',
   'Thumbs.db',
   'thumbs.db~',
   'Thumbs.db~',
   '.picasa.ini',
   'z.pl',
   'zpool.txt',
   '.php',
   '.ini',
   '.pdf',
   '.odt',
);
our @loser_paths = (
   'pictures/Documents',
   'AptanaStudio',
   '100NIKON',
   '101NIKON',
   '102NIKON',
   '100PENTX',
   '101PENTX',
   '102PENTX',
   '100FUGI',
   '101FUGI',
   '102FUGI',
   '100FUJI',
   '101FUJI',
   '102FUJI',
   'DCIM',
   'pictures/Pictures/lap',
   'pictures/lap',
   'pictures/pictures',
   'Pictures/Pictures',
   'Pictures/tank',
   'tank/tank',
   #'0000-incoming',
   'pictures/mnt',
   #'home/Pictures',
   'pictures/bin',
   '999-copied',
   'Documents/Candela',
);

sub lcMapKeyOf {
   my $ra_rec = shift;
   die("asRecordKey: wants record ref,bye") if(!defined $ra_rec);
   my $key = lc(@$ra_rec[FNAME]).";".@$ra_rec[FSIZE].";".@$ra_rec[FPATH];
   return $key;
}

sub recordKeyOf {
   my $ra_rec = shift;
   die("asRecordKey: wants record ref,bye") if(!$ra_rec);
   my $key = lc(@$ra_rec[FNAME]).";".@$ra_rec[FSIZE];
   return $key;
};

sub areMultipleCopiesInLoserPath {
   my $ra_rec        = shift;
   die("areMultipleCopies: norecord,bye") unless( $ra_rec );
   my $rv         = 0;
   my $key        = ::recordKeyOf($ra_rec);
   my $path       = @$ra_rec[FPATH];
   my $num        = $::path_counts{$key};
   my $strikes    = 0;
   if ( $num > 1 ) {
      for my $loser (@::loser_paths) {
         if( index($path, $loser) > -1 ) {
            $strikes += 1;
            print STDERR "multi:$path - $num\n";
         }
      }
   }
   # if everything is a strike then we want to
   # judge it using the score system
   $rv = (($strikes > 0) && ($strikes < $num)) ? 1 : 0;
   return $rv;
} #~sub

sub hasLoserName {
   my $ra_rec  = shift;
   my $fname   = @$ra_rec[FNAME];
   if( $fname ~~ @::loser_files ) {
      #print "\n$fname member of loser_files\n";
      return 1;
   }
   for my $loser (@::loser_str) {
      if( index($fname, $loser) > -1) {
         #print "\n$fname substr of loser_str $loser\n";
         return 1;
      }
   }
   for my $loser (@::loser_paths) {
      if( index(@$ra_rec[FPATH], $loser) > -1 ) {
         #print "\n".@$ra_rec[FPATH]." substr of loser_path $loser\n";
         return 1;
      }
   }
   return 0;
}

sub checkForShorterName {
   my $rv         = 0;
   my $ra_rec     = shift;
   my $fname      = shift;
   my ($shorter)  = ($fname =~ m/(.*)[~]+$/);
   #print STDERR "shorter[$shorter] fname[$fname]\n";
   my $nkey       = lc($shorter).";".@$ra_rec[FSIZE];
   if (defined $::path_counts{$nkey} && $::path_counts{$nkey} > 0) {
      #print "LOSER: backup has existing copies $nkey\n";
      $rv = 1;
      return $rv;
   }
   #print "zero path_counts[$nkey] checking file_names[$shorter]\n";
   my $rh_sizes = $::file_names{$shorter};
   if (defined $rh_sizes ) {
      my $num = keys %$rh_sizes;
      if ($num > 1) {
         #print "LOSER: existing copies of $nkey\n";
         $rv = 1;
      }
   }
   return $rv;
} #~sub
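
# pathScore: rate how "canonical" a directory looks; among duplicate
# copies, the path with the highest score is the one that gets kept.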

sub pathScore {
   my $fpath      = shift;
   die("pathScore: blank path, bye.") if ($fpath eq "");
   my $s          ='/';
   my $pre        = '/tank/pictures/';
   my $score      = 0;

   if ($fpath     =~ /20\d\d/) {
      my ($year)  = ($fpath =~ m/(20\d\d)/);
      $score      += ((2013 - $year) * 10);
   }

   if ($fpath     =~ /${pre}0100-Projects${s}\d{4}${s}.+$/ ) {
      $score += 200;
   }
   elsif ($fpath  =~ /${pre}Chris-Nelsen.+$/ ) {
      $score += 170;
   }
   elsif ($fpath  =~ /${pre}0100-Projects${s}\d{4}${s}\d{4}-\d{2}-\d{2}$/ ) {
      $score += 160;
   }
   elsif ($fpath  =~ /${pre}0100-Projects${s}\d{4}-\d{2}-\d{2}.+$/ ) {
      $score += 155;
   }
   elsif ($fpath  =~ /${pre}0100-Projects${s}\d{4}-\d{2}-\d{2}$/ ) {
      $score += 150;
   }
   elsif ($fpath =~ /${pre}0100-Projects${s}\d{4}$/ ) {
      $score += 140;
   }
   elsif ($fpath =~ /${pre}0100-Projects${s}333-dng$/ ) {
      $score += 130;
   }
   elsif ($fpath =~ /${pre}0100-Projects.+$/ ) {
      $score += 125;
   }
   elsif ($fpath =~ /${pre}0100-Projects$/ ) {
      $score += 120;
   }
   elsif ($fpath  =~ /${pre}9999-Source${s}\d{4}${s}\d{4}-\d{2}-\d{2}$/ ) {
      $score += 100;
   }
   elsif ($fpath  =~ /${pre}9999-Source${s}\d{4}${s}\d{4}.+$/ ) {
      $score += 75;
   }
   elsif ($fpath =~ /${pre}9999-Source${s}\d{4}${s}.+$/ ) {
      $score += 71;
   }
   elsif ($fpath =~ /${pre}9999-Source${s}\d{4}$/ ) {
      $score += 60;
   }
   elsif ($fpath =~ /${pre}9999-Source${s}.+$/ ) {
      $score += 40;
   }
   elsif ($fpath =~ /${pre}9999-Source$/ ) {
      $score += 50;
   }
   elsif ($fpath =~ /${pre}(3p|gmaps)$/ ) {
      $score += 60;
   }   
   elsif ($fpath =~ /${pre}Roz.+$/ ) {
      $score += 70;
   }
   elsif ($fpath =~ /${pre}[Ww]eb.+$/ ) {
      $score += 3;
   }
   elsif ($fpath =~ /${pre}\d+-[Mm]ovies${s}?.*$/ ) {
      $score += 10;
   }
   elsif ($fpath =~ /${pre}\d+-[Mm]ovie$/ ) {
      $score += 10;
   }
   elsif ($fpath =~ /${pre}8888-[Ss]creensaver${s}?.*$/ ) {
      $score += 2;
   }
   elsif ($fpath =~ /${pre}8888-[Ll]iam.*$/ ) {
      $score += 60;
   }
   elsif ($fpath =~ /${pre}0200-[Dd]esk.*$/ ) {
      $score += 50;
   }
   elsif ($fpath =~ /${pre}\d{4}-\d{2}-\d{2}${s}.+$/ ) {
      $score += 41;
   }
   elsif ($fpath =~ /${pre}\d{4}-\d{2}-\d{2}$/ ) {
      $score += 31;
   }
   elsif ($fpath =~ /${pre}\d{4}${s}\d{2}${s}\d{2}$/ ) {
      $score += 30;
   }
   elsif ($fpath =~ /${pre}\d{4}${s}.+$/ ) {
      $score += 21;
   }
   elsif ($fpath =~ /${pre}1000-print.*$/ ) {
      $score += 20;
   }
   elsif ($fpath =~ /${pre}Pictures${s}9999-[Ss]ource.*$/ ) {
      $score += 11;
   }
   elsif ($fpath =~ /${pre}Pictures${s}lap${s}00-pix.*$/ ) {
      $score += 10;
   }
   elsif ($fpath =~ /${pre}(home|0000-incoming).+$/ ) {
      $score += 2;
   }
   elsif ($fpath =~ /${pre}tank.+$/ ) {
      $score += 2;
   }
   elsif ($fpath =~ /${pre}?$/ ) {
      $score += 3;
   }
   else {
      my $v = "########################################\n"
            ."##\n"
            ."##\tUNRATED\t$fpath\n"
            ."##\n"
            ."########################################\n";
      die( $v );
   }
   if( $score > 0 && index($fpath, ".album") >= 0) {
      $score -= 1;
   }
   if( $score > 0 && index($fpath, ".xvpics") >= 0) {
      $score -= 1;
   }
   if( $score > 0 && index($fpath, "/thumb") >= 0) {
      $score -= 1;
   }
   return $score;
} #~sub

#---------------------------------------------------------#

my $l=0;
while(<STDIN>) {
   chomp;
   next if(m/^\s*$/);
   print STDERR "HUH?" if( ! m/;/);
   my @file_rec = split(';');
   print STDERR "bad FHASH" if ($file_rec[FHASH] eq "");
   print STDERR "bad FNAME" if ($file_rec[FNAME] eq "");
   print STDERR "bad FSIZE" if ($file_rec[FSIZE] eq "");
   print STDERR "bad FPATH" if ($file_rec[FPATH] eq "");

   push(@file_rec, DUNNO);                   # score
   push(@file_rec, lc($file_rec[FNAME]));    # lc name
   push(@records, \@file_rec);
   my $lckey = lcMapKeyOf( \@file_rec );

   $lcfname_map{ $lckey } = \@file_rec;
   $l++;
}
close(STDIN);
$l = @records;
print "loaded $l records\n";
$l = 0;
print "building file path mapping\n";

for my $ra_rec (@records) {
   ##                         ##
   ## filename+size => path   ##
   ##                         ##
   my $key    = recordKeyOf($ra_rec);
   $file_sizes{$key} = [] if(! $file_sizes{$key});
   push(@{ $file_sizes{$key} }, @$ra_rec[FPATH]);
   ##                            ##
   ## filename => size => path   ##
   ##                            ##
   $key                                 = lc(@$ra_rec[FNAME]);
   $file_names{$key}                    = {} if(! $file_names{$key});
   $file_names{$key}{ @$ra_rec[FSIZE] } = [] if(! $file_names{$key}{ @$ra_rec[FSIZE] });
   push( @{ $file_names{$key}{ @$ra_rec[FSIZE] } }, @$ra_rec[FPATH]);
}

print "pondering file path duplicates\n";
for my $ra_rec (@records) {
   my $key              = recordKeyOf($ra_rec);
   my $ra_paths         = $file_sizes{$key};
   my $n                = @$ra_paths;
   die("no paths for $key ???\n") if (!$n);
   $path_counts{$key}   = $n;
}

#(sort { ($userids{$a} cmp $userids{$b}) || ($a cmp $b) } keys %userids)
my $putty = "\n\t\t";
my @sorted_keys = sort { ($path_counts{$b} <=> $path_counts{$a}) || ($a cmp $b) } keys %path_counts;
print "sorted path count keys\n";
for my $key (@sorted_keys) {
   my $cnt = 0 + $path_counts{$key};
   next if ($cnt == 1);
   my @sorted_paths = sort @{ $file_sizes{$key} };
} 
@sorted_keys = sort {$a cmp $b} keys %file_names;

# this is a heirarchy of various duplicate paths
my @sorted_sizes;
for my $key (@sorted_keys) {
   my $rh_sizes = $file_names{$key};
   @sorted_sizes = sort keys %$rh_sizes;
   for my $fsize (@sorted_sizes) {
      my $ra_paths = $rh_sizes->{$fsize};
   }
} 

##----------------------------------------------------##
##                                                    ##
##       Now get down to JUDGING the files            ##
##                                                    ##
##----------------------------------------------------##

for my $lcfname (sort keys %::file_names) {
   my @fsize_keys    = keys %{ $::file_names{$lcfname} };
   my $fname_ct      = 0;
   my $msg           = "\n$lcfname";
   my %scores        = ();
   for my $fsize (sort @fsize_keys) {
      $fname_ct++;
      my $fpath_ct   = @{$::file_names{$lcfname}{$fsize}};
      $msg .= "\n\t$fsize\n"; 

      for my $path (@{$::file_names{$lcfname}{$fsize}}) {
         my $key        = "$lcfname;$fsize;$path";
         my $ra_rec     = $lcfname_map{$key};

         if (!defined $ra_rec) {
            die( "NO RA_REC $key????" );
            next;
         }
         if (!defined @$ra_rec[SCORE]) {
            print STDERR "NO RA_REC score\n";
            @$ra_rec[SCORE] = DUNNO;
         }

         my $score      = 0;
         $scores{$path} = $score;
         if (@$ra_rec[FSIZE] < 4 ) {
            @$ra_rec[SCORE] = DUMP;
            next;
         }
         if ( hasLoserName( $ra_rec ) ) {
            @$ra_rec[SCORE] = DUMP;
            next;
         }
         if( areMultipleCopiesInLoserPath($ra_rec) ) {
            @$ra_rec[SCORE] = DUMP;
            next;
         }
         if ( $lcfname =~ /.*[~]+$/ && checkForShorterName( $ra_rec, $lcfname )) {
            @$ra_rec[SCORE] = DUMP;
            next;
         }

         $scores{$path} = pathScore( $path );         
      } #~for path

      # now sort out the scores, keep the best
      next if (keys %scores < 1 );

      my $top = "";

      for my $path ( sort{$scores{$b} <=> $scores{$a} } keys %scores ) {
         #print STDERR "path: $path\n";
         my $score   = $scores{$path};
         $msg .= "\t\t$score\t$path\n";

         my $key     = "$lcfname;$fsize;$path";
         next if (!defined $lcfname_map{$key});

         my $ra_rec  = $lcfname_map{$key};

         if (!defined $ra_rec) {
            die( "NO RA_REC $key????" );
            next;
         }
         #print STDERR "score:".@$ra_rec[SCORE]."\n";
         if ($top eq "" && @$ra_rec[SCORE] == DUNNO) {
            $top = $path;
            @$ra_rec[SCORE] = KEEP;
         }
         elsif (@$ra_rec[SCORE] == DUNNO) {
            @$ra_rec[SCORE] = DUMP;
         }
      }

   } #~for size

   print "\n$fname_ct\t$msg";

} # for lcname

for my $ra_rec (@records) {

   if (  @$ra_rec[SCORE] == DUMP || @$ra_rec[SCORE] == KEEP) {
      next;
   }

   if (@$ra_rec[FSIZE] < 4 ) {
      @$ra_rec[SCORE] = DUMP;
      next;
   }
   if ( hasLoserName( $ra_rec ) ) {
      @$ra_rec[SCORE] = DUMP;
      next;
   }
   if( areMultipleCopiesInLoserPath($ra_rec) ) {
      @$ra_rec[SCORE] = DUMP;
      next;
   }
   if ( @$ra_rec[FPATH] =~ /.*[~]+$/ && checkForShorterName( $ra_rec, @$ra_rec[FNAME] )) {
      @$ra_rec[SCORE] = DUMP;
      next;
   }
   my $score   = 0;
   $score      = pathScore( @$ra_rec[FPATH] );
   @$ra_rec[SCORE] = ( $score > 0 ) ? KEEP : DUMP ;
} 

#now go through and tally the number of items killed
our @winners   = ();
our @losers    = ();
our @undecided = ();
for my $ra_rec (@records) {
   my $key  = ::recordKeyOf($ra_rec);
   if ( @$ra_rec[SCORE] == DUMP ) {
      push @losers, $ra_rec;
      #my $safety = 0+$path_counts{$key};
   }
   elsif ( @$ra_rec[SCORE] == KEEP ) {
      #my $ct   =  0+$path_counts{$key} ;
      #my $ix   =  index($key, '~');

      #if ( $ct > 1 ) {
      #   my $lckey = lcMapKeyOf( $ra_rec );
      #   my $ra_paths = $file_sizes{$key};
      #   my $m = "\n\t".join( "\n\t", @$ra_paths );
      #   print "WIN? $key: $m\n";
      #}
      #if ($ix > -1) {
      #   my $lckey = lcMapKeyOf( $ra_rec );
      #   print STDERR "WIN? $key [$lckey]: ct[$ct] backup-symbol[$ix]\n";
      #}
      push @winners, $ra_rec;
   }
   else {
      print STDERR "UNDECIDED: $key\n";
      push @undecided, $ra_rec;
   }
}

my $msg = "There were ";
$msg .= @undecided;
$msg .= " undecided, ";
$msg .= @winners;
$msg .= " winners and ";
$msg .= @losers;
$msg .= " losers.\n";
print $msg;

my $fh;
for my $ra_rec (@undecided) {
   next if( !defined $ra_rec || !defined @$ra_rec[FNAME]);
   my $key = lcMapKeyOf( $ra_rec );
   die( "RE-JUDGE>>>> $key\n"); 
}
open($fh, ">", "zz-keep-paths.txt");
for my $ra_rec (@winners) {
   next if (!defined @$ra_rec[FPATH]);
   next if (!defined @$ra_rec[FNAME]);
   my $fq_name = @$ra_rec[FPATH]."/".@$ra_rec[FNAME];
   print $fh "$fq_name\n";
}
close $fh;
open($fh, ">", "zz-discard-paths.txt");
for my $ra_rec (@losers) {
   next if (!defined @$ra_rec[FPATH]);
   next if (!defined @$ra_rec[FNAME]);
   my $fq_name = @$ra_rec[FPATH]."/".@$ra_rec[FNAME];
   print $fh "rm $fq_name\n";
}
close $fh;
#eof

Sorting through thousands of photos

Backups are great. Having terabytes of space for them is now completely necessary. Filling up those terabytes is …frustrating.

My photo collection is probably much like many photo enthusiasts': well into the hundreds of thousands of pictures. But why is it so? I make thumbnails (that's 2x pictures), I keep a low-res and a full-res picture besides the thumbnails (3x). I've started keeping both my RAW (DNG) and my full-res JPG files (4x pictures).

And that's before the backups add their own madness. What happens when you have copies of the same SD card on two computers? And you discover them months later, and you don't have time to de-dup those few hundred files? Having multiple photo-editing tools doesn't necessarily make it easier. I use both Gimp and Darktable. I batch-create my thumbnails using a shell script that drives ImageMagick, of course. And then, what piece of backup software would ever warn you if you started backing up your photos in overlapping locations? I might have (6x) the pictures I actually took!

Look at this listing of photos, for example:

val_memorial168.JPG;2484792;/tank/pictures/9999-Source/2006/2006-05-13-memorial-tey-roberts/big
val_memorial168.JPG;2484792;/tank/pictures/Pictures/9999-Source/2006/2006-05-13-memorial-tey-roberts/big
val_memorial168.JPG;2484792;/tank/pictures/tank/pictures/9999-Source/2006/2006-05-13-memorial-tey-roberts/big
val_memorial168.JPG;2484792;/tank/pictures/tank/pictures/Pictures/9999-Source/2006/2006-05-13-memorial-tey-roberts/big
val_memorial168-small.jpg;29530;/tank/pictures/9999-Source/2006/2006-05-13-memorial-tey-roberts/thumb
val_memorial168-small.jpg;29530;/tank/pictures/Pictures/9999-Source/2006/2006-05-13-memorial-tey-roberts/thumb
val_memorial168-small.jpg;29530;/tank/pictures/tank/pictures/9999-Source/2006/2006-05-13-memorial-tey-roberts/thumb
val_memorial168-small.jpg;29530;/tank/pictures/tank/pictures/Pictures/9999-Source/2006/2006-05-13-memorial-tey-roberts/thumb
val_memorial168-small.jpg;355505;/tank/pictures/9999-Source/2006/2006-05-13-memorial-tey-roberts
val_memorial168-small.jpg;355505;/tank/pictures/Pictures/9999-Source/2006/2006-05-13-memorial-tey-roberts
val_memorial168-small.jpg;355505;/tank/pictures/tank/pictures/9999-Source/2006/2006-05-13-memorial-tey-roberts
val_memorial168-small.jpg;355505;/tank/pictures/tank/pictures/Pictures/9999-Source/2006/2006-05-13-memorial-tey-roberts

Shell scripting to the rescue, right? My tactics are roughly this:

  • sort the photos by name (not by path)
  • and by size (similarly sized photos are often dups)
  • generate md5 sums of the start of the photo (the start is likely to be as indicative as the whole photo)
  • sort by all of the above
  • try not to use perl :-)

Does it take a fancy data structure to sort this information? No. The trick is to re-arrange it in lines of text so that the things you most want to sort by are on the left side of the line.
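
For example, with name;size;path records like the listing above, a plain sort already groups same-named files together. If you also want same-sized copies adjacent within each name, an illustrative variant of the sort step (file names borrowed from the script below) would be:

# group by name first, then numerically by size within each name
sort -t ';' -k1,1 -k2,2n unsorted.txt > sorted.txt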

And to wit, I doth bust out said magic thusly:

#!/bin/bash
##  ----- ----- ----- ----- ----- ----- ----- ----- ----- -----
##
##    Script intended to identify duplicately named
##    files and also files with likely identical contents.
##
##  ----- ----- ----- ----- ----- ----- ----- ----- ----- -----

PIC_DIR=/tank/pictures
VAR_DIR=/home/jreynolds/var
rm -f $VAR_DIR/*.txt
rm -f $VAR_DIR/*.a*
 F01_UNSORTED_NAMES="$VAR_DIR/unsorted.txt"
   F02_SORTED_NAMES="$VAR_DIR/sorted.txt"
    F03_WITH_HASHES="$VAR_DIR/hashes.txt"
  F04_SORTED_HASHES="$VAR_DIR/sorted-hashes.txt"
# | head -100  \
find $PIC_DIR -type f -printf "%f;%s;%h\n" \
> $F01_UNSORTED_NAMES
wc -l $F01_UNSORTED_NAMES

cat $F01_UNSORTED_NAMES | sort > $F02_SORTED_NAMES
line_ct=$(cat $F02_SORTED_NAMES|wc -l)
lines_per=$[ (line_ct / 8 ) + 1 ];
echo "This is the line count per file: $lines_per"

i=0
rm $F03_WITH_HASHES.*
split -l $lines_per $F02_SORTED_NAMES ${F02_SORTED_NAMES/.txt/}.
ls -l $VAR_DIR/*

for J in $VAR_DIR/sorted.a* ; do
   echo "== $J == started"
   bash -c "$HOME/bin/hash_list.sh $J" &
done
n=8
while [ $n -gt 0 ]; do
   sleep 1
   n=$(pgrep -lf hash_list.sh | wc -l )
   echo -n "$n processing "
done

And the careful reader will wonder, wth is hash_list.sh? You'd better wonder. It reveals the only way one can incorporate exactly one efficient disk read per file into a program. Behold:

#!/bin/bash
[ -z "$1" ] && echo "please specify file list, bye." && exit 1

cat $1 | while read F ; do
   # split the name;size;path record into fields
   IFS=\; read -ra hunks <<< "$F"
   # hash only the leading chunk of each file: one quick read per file
   # is enough to flag likely duplicates (the byte count is illustrative)
   sum=$(head -c 100000 "${hunks[2]}/${hunks[0]}" | md5sum)
   echo "$sum,$F" >> $1.hmac
   i=$[ i + 1 ]
   [ $(( $i % 50 )) == 0 ] && echo -n "."
done
wc -l $1.hmac

cat $1.hmac | sort > $1.sorted
wc -l $1.sorted

Can you even do this on windows? I’d be surprised if you could do it without at least Cygwin. However, this is all that is required: the bash programming environment. Could you do this on a Mac? Yeah. Could you do it on an iPad? Pppppt…you’re joking. Could you do it on a Samsung android tablet? Yeah, if you wanted to watch it…melt. You’d have to install the ssh client and terminal apps of course. Enough!

Now see what I discover? Broad swaths of duplication are waiting to be uncovered (a quick way to tally them is sketched right after this listing):

000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/0000-incoming/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/0000-incoming/in-2012-12-09/big
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/9999-Source/2011/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/9999-Source/2011/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/9999-Source/2011/2011-08-31b
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG~;2301225;/tank/pictures/9999-Source/2012/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/9999-Source/2012/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/9999-Source/2012/in-2012-12-09b/big
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/9999-Source/2013/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/9999-Source/2013/2011/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/home/Pictures/9999-Source/2011/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/home/Pictures/9999-Source/2013/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/home/Pictures/9999-Source/2013/2011/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/home/Pictures/9999-Source/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/Pictures/9999-Source/2011/2011-08-31b
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/Pictures/9999-Source/2011/ff/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/Pictures/9999-Source/2012/ff/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/Pictures/9999-Source/2012/ff/in-2012-12-09/big
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/Pictures/9999-Source/2012/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/Pictures/9999-Source/2012/in-2012-12-09b/big
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/Pictures/9999-Source/2012/zz/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/tank/pictures/9999-Source/2011/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/tank/pictures/9999-Source/2011/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/tank/pictures/9999-Source/2011/2011-08-31b
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/tank/pictures/9999-Source/2011/2011-08-31-c
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/tank/pictures/9999-Source/2011/2011-08-31-d
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/tank/pictures/9999-Source/2011/2011-08-31-f
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG~;2301225;/tank/pictures/tank/pictures/9999-Source/2012/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/tank/pictures/9999-Source/2012/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/tank/pictures/9999-Source/2012/in-2012-12-09b/big
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/tank/pictures/9999-Source/2013/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/tank/pictures/9999-Source/2013/2011/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/tank/pictures/9999-Source/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/tank/pictures/home/Pictures/9999-Source/2013/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/tank/pictures/home/Pictures/9999-Source/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/tank/pictures/Pictures/9999-Source/2011/2011-08-31b
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/tank/pictures/Pictures/9999-Source/2011/ff/2011-08-31
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/tank/pictures/Pictures/9999-Source/2012/ff/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/tank/pictures/Pictures/9999-Source/2012/ff/in-2012-12-09/big
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/tank/pictures/Pictures/9999-Source/2012/in-2012-12-09
000173549decf06ae5c858ea1eccfcca  -,imgp2355.jpg;2301225;/tank/pictures/tank/pictures/Pictures/9999-Source/2012/in-2012-12-09b/big
000173549decf06ae5c858ea1eccfcca  -,IMGP2355.JPG;2301225;/tank/pictures/tank/pictures/Pictures/9999-Source/2012/zz/in-2012-12-09
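
To tally those swaths, assuming the split chunks above ended up as /home/jreynolds/var/sorted.a*.sorted, you can count how many records share each leading checksum:

# merge the already-sorted chunks, then count records per checksum
sort -m /home/jreynolds/var/sorted.a*.sorted | cut -d' ' -f1 | uniq -c | sort -rn | head -20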

Modern Picture Management?

I like to run a thumbnailer across most of my photos so I have little copies to send around, and I mod them so that they look distinctly different from the full resolution source files. I like to use gwenview to flip and rotate them. Gwenview can export to FB and email so that makes things easier.
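
The thumbnailer itself is nothing fancier than an ImageMagick loop; a minimal sketch, with an illustrative directory and size, looks something like this:

#!/bin/bash
# make small, visibly different copies next to the full-res originals
# (the directory and the 400px size are illustrative)
SRC=/tank/pictures/9999-Source/2012/2012-06-01
mkdir -p "$SRC/thumb"
for F in "$SRC"/*.JPG "$SRC"/*.jpg ; do
   [ -e "$F" ] || continue
   B=$(basename "$F")
   convert "$F" -resize 400x400 "$SRC/thumb/small-${B,,}"
done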

I copied a bunch of these thumbs to a project folder. However, I still need scripting to pull the corresponding sets of pictures out of different directories, matching them by name. Buckle up:

ls \
| grep small-imgp \
| perl -pe 's/small-(imgp[0-9]+\.jpg)$/$1/' \
| while read F 
do 
   find /home/jreynolds/9/2012/ -iname "$F" -exec cp {} ~/For-Mary \; 
done

Wow, now I can really get to work.