#!/usr/bin/perl
# Copyright (c) 2015, Rubén Llorente
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of Rubén Llorente nor the names of his contributors
#       may be used to endorse or promote products derived from this
#       software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL RUBÉN LLORENTE BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use strict;
use warnings;
use autodie;                 # open/close/etc. throw on failure
use utf8;                    # source contains UTF-8 literals ("Rubén")

use Digest::SHA;
use File::Find qw(find);

## Subroutine declarations.
{
    # Accumulates data for every file seen across ALL calls to
    # get_file_list_with_data().  Keyed by full path; each value is an
    # array ref of [ size_in_bytes, mtime, sha256_hex_digest ].
    my %file_information;

    # Callback for File::Find (invoked once per directory entry with
    # no_chdir, so $File::Find::name is the full path).  Records size,
    # mtime and the SHA-256 digest of the file CONTENTS for every
    # regular, readable file; everything else is skipped.
    #
    # BUG FIX: the original called sha256() with no data at all — the
    # file was opened but never read, so every file got the digest of
    # the empty string and all files compared as duplicates.  We now
    # stream the actual contents through Digest::SHA::addfile(), which
    # also avoids slurping large files into memory.
    my $get_file_data_sub = sub {
        my $path = $File::Find::name;
        return unless -f -r $path;

        # stat once; field 7 is size in bytes, field 9 is mtime.
        my ( $size, $mtime ) = ( stat $path )[ 7, 9 ];

        open my $fh, '<', $path;    # autodie throws on failure
        binmode $fh;
        my $digest = Digest::SHA->new(256)->addfile($fh)->hexdigest;
        close $fh;

        $file_information{$path} = [ $size, $mtime, $digest ];
        return;
    };

    # Walk $path recursively and return the accumulated file hash:
    # keys are file paths, values are [ size, mtime, sha256 ] array
    # refs.  Note that state deliberately accumulates across calls —
    # the main loop below relies on this when several directories are
    # given on the command line.
    sub get_file_list_with_data {
        my $path = shift;
        find( { wanted => $get_file_data_sub, no_chdir => 1 }, $path );
        return %file_information;
    }
}

# Take a reference to a %file_information-formatted hash and return a
# hash whose keys are SHA-256 checksums that occur more than once and
# whose values are array refs of the offending file paths.
#
# Rewritten as a single-pass group-by (O(n)); the original compared
# every file against every other file while deleting hash entries
# mid-iteration (O(n^2)).  The resulting groups are identical.
sub group_duplicates {
    my %file_information = %{ shift() };

    # Bucket every path under its checksum.
    my %files_by_checksum;
    for my $file ( keys %file_information ) {
        my $checksum = $file_information{$file}[2];
        push @{ $files_by_checksum{$checksum} }, $file;
    }

    # Keep only checksums shared by two or more files.
    my %duplicated = map  { $_ => $files_by_checksum{$_} }
                     grep { @{ $files_by_checksum{$_} } > 1 }
                     keys %files_by_checksum;
    return %duplicated;
}

# Print a report for each group of duplicated files.  Takes a
# reference to the hash produced by group_duplicates().
sub print_information {
    my %duplicated_files = %{ shift() };
    foreach my $checkhash ( keys %duplicated_files ) {
        print "##########\n";
        print "More than one file has the same SHA256 checksum.\n";
        foreach my $repeated_file ( @{ $duplicated_files{$checkhash} } ) {
            print "$repeated_file\n";
        }
        print "##########\n";
    }
}

# MAIN CODE EXECUTION BEGINS.
{
    my $version = '0.0.1';

    # The banner contains non-ASCII ("Rubén"); make sure it is
    # emitted as proper UTF-8 rather than mangled bytes.
    binmode STDOUT, ':encoding(UTF-8)';

    # Typo fix: "3-cause" -> "3-clause" BSD license.
    print "File Deduplicator.\nVersion $version.\n"
        . "Copyright (c) 2015, Rubén Llorente.\nAll rights reserved.\n"
        . "Distributed under the 3-clause BSD license\n\n";
    print "This software is unfinished and it shows. Be careful.\n";
    print "Starting identification of duplicated files. This might take a while.\n";

    # get_file_list_with_data() accumulates state across calls, so
    # after the loop %file_stats holds the union of all directories
    # named on the command line.
    my %file_stats;
    %file_stats = get_file_list_with_data($_) for @ARGV;

    my %duplicated_stats = group_duplicates( \%file_stats );
    print_information( \%duplicated_stats );
    exit;
}