#!/usr/bin/perl
#Copyright (c) 2015, Rubén Llorente
#All rights reserved.
#
#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of Rubén Llorente nor the names of his contributors
#      may be used to endorse or promote products derived from this
#      software without specific prior written permission.
#
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#DISCLAIMED. IN NO EVENT SHALL RUBÉN LLORENTE BE LIABLE FOR ANY
#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use strict;
use warnings;
use autodie;

use Digest::SHA qw/sha256_hex/;
use File::Find qw/find/;
use Getopt::Std qw/getopts/;

## SUBROUTINES AND VARIABLES RELATED TO OPTION PARSING.

$Getopt::Std::STANDARD_HELP_VERSION = "1";
$main::VERSION = "0.0.3";

sub main::HELP_MESSAGE {
    print "
Usage: ./deduper-$main::VERSION.pl [options] folder (folder2 folder3 ...)

If one or more folders are passed to the program, Deduper locates
duplicated files in the given folders based on their SHA256 checksum,
and prints their information to the standard output.

The behaviour of the program can be altered using the following option
switches:

-f: Follow symlinks (default is not to follow symlinks).
-r: Delete older duplicated files and leave only the newest. Use with
    caution.
--help: Show this message and exit.
--version: Print version and copyright information and exit.

If no folder is passed to the program, only the version and copyright
information are printed to standard output.
";
}

sub main::VERSION_MESSAGE {
    print "Deduper File Deduplicator. Version $main::VERSION.\n",
        "Copyright (c) 2015, Rubén Llorente.\nAll rights reserved.\n",
        "Distributed under the 3-clause BSD license.\n\n";
}

## SUBROUTINES RELATED TO PROGRAM OPERATION

# This block defines the process used to retrieve information
# from files.
{
    # This hash has a filename for key and an array of the form
    # size/ctime/SHA256 as a value. (stat() index 7 is the size and
    # index 10 is the ctime.)
    my %file_information;

    # Subroutine used by the find function to process files in the
    # given subdirectories. It retrieves the size, ctime and SHA256
    # hash of each file for comparison. The input of the function is a
    # filename with full path. The return value is discarded.
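    # For illustration only (hypothetical paths and values), a scan of
    # a folder containing one duplicated pair might leave the hash as:
    #   '/home/user/a.txt' => [ 1024, 1439000000, 'e3b0c44298fc1c...' ],
    #   '/home/user/b.txt' => [ 1024, 1439000100, 'e3b0c44298fc1c...' ],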
    my $get_file_data_sub = sub {
        if ( -f $File::Find::name ) {
            if ( -r $File::Find::name ) {
                open my $file_fh, '<', $File::Find::name;
                binmode($file_fh);
                # sha256_hex() concatenates its arguments, so reading
                # the whole file in list context yields the digest of
                # its full contents.
                my @file_data = ((stat($File::Find::name))[7,10],
                    sha256_hex(<$file_fh>));
                close $file_fh;
                @{$file_information{$File::Find::name}} = @file_data;
            } else {
                warn "Cannot read $File::Find::name, skipping.\n";
            }
        }
    };

    # Subroutine used for returning a hash that has file names as the
    # keys, and a list with the size, ctime and SHA256 checksum for
    # each value. The inputs of the routine are a reference to a list
    # of directory names and a reference to the option hash; the
    # output is the hash.
    sub get_file_list_with_data {
        my @paths = @{shift()};
        my %options = %{shift()};
        foreach ( @paths ) {
            warn "$_ does not exist, skipping.\n" unless -e $_;
        }
        my %find_options = (
            wanted      => $get_file_data_sub,
            no_chdir    => 1,
            follow      => 0,
            follow_skip => 2,
        );
        $find_options{follow} = 1 if $options{f};
        find(\%find_options, @paths);
        return %file_information;
    }
}

# This routine takes the %file_information formatted hash and creates
# a hash whose keys are the SHA256 sums shared by more than one file,
# and whose values are arrays containing the paths of the offending
# files, ordered by the age of each file.
sub group_duplicates {
    my %file_information = %{shift()};
    my %information_buffer;
    my %duplicated;
    my ($file, $checksum, $second_checksum, $second_file);

    foreach $file (keys %file_information) {
        # Skip entries already consumed by an earlier iteration.
        next unless $file_information{$file};
        $checksum = ${$file_information{$file}}[2];
        $information_buffer{$file} = $file_information{$file};
        delete $file_information{$file};
        foreach $second_file (keys %file_information) {
            $second_checksum = ${$file_information{$second_file}}[2];
            if ( $checksum eq $second_checksum ) {
                push @{$duplicated{$checksum}}, "$second_file";
                $information_buffer{$second_file} = $file_information{$second_file};
                delete $file_information{$second_file};
            }
        }
        # Forget this file unless at least one duplicate was found.
        delete $information_buffer{$file} unless $duplicated{$checksum};
        push @{$duplicated{$checksum}}, "$file" if $duplicated{$checksum};
    }

    %duplicated = sort_duplicates_by_age(\%duplicated, \%information_buffer);
    return %duplicated;
}

# This routine just prints information about the duplicated elements.
# It takes two hash references as inputs.
sub print_information {
    my %duplicated_files = %{shift()};
    my %file_information = %{shift()};

    if ( %duplicated_files ) {
        foreach my $checkhash (keys %duplicated_files) {
            print "\n\n########## DUPLICATION FOUND ##########\n";
            print "More than one file has the following SHA256 checksum: $checkhash\n";
            foreach my $repeated_file (@{$duplicated_files{$checkhash}}) {
                print "File: $repeated_file size: ${$file_information{$repeated_file}}[0] ctime: ${$file_information{$repeated_file}}[1]\n";
            }
            print "\n#######################################\n";
        }
    } else {
        print "\nNo duplicated files found.\n";
    }
}

# This routine sorts the duplicated files within each group by age,
# newest first. It takes two hash references as inputs.
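# For illustration only (hypothetical values), the incoming hash might
# look like:
#   'e3b0c44298fc1c...' => [ '/home/user/b.txt', '/home/user/a.txt' ],
# and the returned hash carries the same groups, each reordered so
# that the newest file sits at index 0.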
sub sort_duplicates_by_age {
    my %duplicated_files = %{shift()};
    my %file_information = %{shift()};
    my %timestamps;
    my @time_list;
    my $time;

    foreach my $checkhash (keys %duplicated_files) {
        foreach my $repeated_file ( @{$duplicated_files{$checkhash}} ) {
            $timestamps{$repeated_file} = ${$file_information{$repeated_file}}[1];
        }
        # Sort this group oldest first, then rebuild it newest first
        # by popping from the end of the sorted list.
        @time_list = sort { $timestamps{$a} <=> $timestamps{$b} } keys %timestamps;
        @{$duplicated_files{$checkhash}} = ();
        while (@time_list) {
            $time = pop @time_list;
            push @{$duplicated_files{$checkhash}}, "$time";
        }
        %timestamps = ();
    }

    return %duplicated_files;
}

# This routine deletes the older duplicated files, leaving only the
# newest file of each group. Use with caution. It takes a hash
# reference for input. No useful return value is provided.
sub delete_duplicated_files {
    my %duplicated_files = %{shift()};
    my $file_to_delete;

    if ( %duplicated_files ) {
        print "\nOlder files are being removed.\n";
        foreach my $checkhash (keys %duplicated_files) {
            # Each group is ordered newest first, so popping from the
            # end removes the oldest files until only one remains.
            while (@{$duplicated_files{$checkhash}} > 1) {
                $file_to_delete = pop @{$duplicated_files{$checkhash}};
                unless ( ${$duplicated_files{$checkhash}}[0] eq $file_to_delete ) {
                    print "$file_to_delete is being removed.\n";
                    unlink $file_to_delete;
                    warn "Failed to remove $file_to_delete; it still exists. Check that you are not using a versioning filesystem.\n" if -e $file_to_delete;
                }
            }
        }
    } else {
        print "\nNothing to remove.\n";
    }
}

# MAIN CODE EXECUTION BEGINS.
{
    my %options;
    # -v is accepted but currently has no effect.
    getopts('frv', \%options);
    my %file_stats;
    my %duplicated_stats;

    main::VERSION_MESSAGE();
    # A bare block is a loop that runs once, so "last" skips the rest
    # of the program when no folder was given.
    last unless @ARGV;
    print "\nStarting identification of duplicated files. This could take a while.\n";
    %file_stats = get_file_list_with_data(\@ARGV, \%options);
    %duplicated_stats = group_duplicates(\%file_stats);
    print_information(\%duplicated_stats, \%file_stats);
    delete_duplicated_files(\%duplicated_stats) if $options{r};
}
exit 0;
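__END__

=head1 EXAMPLE

A sketch of a typical run over two folders, with hypothetical paths,
sizes, timestamps and an abridged checksum; actual output will differ:

    $ ./deduper-0.0.3.pl -f /home/user/photos /mnt/backup/photos

    ########## DUPLICATION FOUND ##########
    More than one file has the following SHA256 checksum: e3b0c44298fc1c...
    File: /home/user/photos/a.jpg size: 1024 ctime: 1439000100
    File: /mnt/backup/photos/a.jpg size: 1024 ctime: 1439000000

    #######################################

Adding -r to the same invocation would then delete
/mnt/backup/photos/a.jpg, the older of the two copies.

=cut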