#!/usr/bin/perl
# Copyright (c) 2015, Rubén Llorente
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of Rubén Llorente nor the names of his contributors
#       may be used to endorse or promote products derived from this
#       software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL RUBÉN LLORENTE BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use strict;
use warnings;
use autodie;                 # open/close/etc. throw on failure
use utf8;                    # source contains UTF-8 literals ("Rubén")

use Digest::SHA;
use File::Find qw(find);

## Subroutine declarations.
{
    # Accumulates data for every file seen across ALL calls to
    # get_file_list_with_data().  Keyed by full path; each value is an
    # array ref of [ size_in_bytes, mtime, sha256_hex_digest ].
    my %file_information;

    # Callback for File::Find (invoked once per directory entry with
    # no_chdir, so $File::Find::name is the full path).  Records size,
    # mtime and the SHA-256 digest of the file CONTENTS for every
    # regular, readable file; everything else is skipped.
    #
    # BUG FIX: the original called sha256() with no data at all — the
    # file was opened but never read, so every file got the digest of
    # the empty string and all files compared as duplicates.  We now
    # stream the actual contents through Digest::SHA::addfile(), which
    # also avoids slurping large files into memory.
    my $get_file_data_sub = sub {
        my $path = $File::Find::name;
        return unless -f -r $path;

        # stat once; field 7 is size in bytes, field 9 is mtime.
        my ( $size, $mtime ) = ( stat $path )[ 7, 9 ];

        open my $fh, '<', $path;    # autodie throws on failure
        binmode $fh;
        my $digest = Digest::SHA->new(256)->addfile($fh)->hexdigest;
        close $fh;

        $file_information{$path} = [ $size, $mtime, $digest ];
        return;
    };

    # Walk $path recursively and return the accumulated file hash:
    # keys are file paths, values are [ size, mtime, sha256 ] array
    # refs.  Note that state deliberately accumulates across calls —
    # the main loop below relies on this when several directories are
    # given on the command line.
    sub get_file_list_with_data {
        my $path = shift;
        find( { wanted => $get_file_data_sub, no_chdir => 1 }, $path );
        return %file_information;
    }
}

# Take a reference to a %file_information-formatted hash and return a
# hash whose keys are SHA-256 checksums that occur more than once and
# whose values are array refs of the offending file paths.
#
# Rewritten as a single-pass group-by (O(n)); the original compared
# every file against every other file while deleting hash entries
# mid-iteration (O(n^2)).  The resulting groups are identical.
sub group_duplicates {
    my %file_information = %{ shift() };

    # Bucket every path under its checksum.
    my %files_by_checksum;
    for my $file ( keys %file_information ) {
        my $checksum = $file_information{$file}[2];
        push @{ $files_by_checksum{$checksum} }, $file;
    }

    # Keep only checksums shared by two or more files.
    my %duplicated = map  { $_ => $files_by_checksum{$_} }
                     grep { @{ $files_by_checksum{$_} } > 1 }
                     keys %files_by_checksum;
    return %duplicated;
}

# Print a report for each group of duplicated files.  Takes a
# reference to the hash produced by group_duplicates().
sub print_information {
    my %duplicated_files = %{ shift() };
    foreach my $checkhash ( keys %duplicated_files ) {
        print "##########\n";
        print "More than one file has the same SHA256 checksum.\n";
        foreach my $repeated_file ( @{ $duplicated_files{$checkhash} } ) {
            print "$repeated_file\n";
        }
        print "##########\n";
    }
}

# MAIN CODE EXECUTION BEGINS.
{
    my $version = '0.0.1';

    # The banner contains non-ASCII ("Rubén"); make sure it is
    # emitted as proper UTF-8 rather than mangled bytes.
    binmode STDOUT, ':encoding(UTF-8)';

    # Typo fix: "3-cause" -> "3-clause" BSD license.
    print "File Deduplicator.\nVersion $version.\n"
        . "Copyright (c) 2015, Rubén Llorente.\nAll rights reserved.\n"
        . "Distributed under the 3-clause BSD license\n\n";
    print "This software is unfinished and it shows. Be careful.\n";
    print "Starting identification of duplicated files. This might take a while.\n";

    # get_file_list_with_data() accumulates state across calls, so
    # after the loop %file_stats holds the union of all directories
    # named on the command line.
    my %file_stats;
    %file_stats = get_file_list_with_data($_) for @ARGV;

    my %duplicated_stats = group_duplicates( \%file_stats );
    print_information( \%duplicated_stats );
    exit;
}