#!/usr/bin/perl

#Copyright (c) 2015, Rubén Llorente
#All rights reserved.
#
#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of Rubén Llorente nor the names of his contributors
#      may be used to endorse or promote products derived from this
#      software without specific prior written permission.
#
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#DISCLAIMED. IN NO EVENT SHALL RUBÉN LLORENTE BE LIABLE FOR ANY
#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use strict;
use warnings;
use autodie;
use Digest::SHA qw /sha256_hex/;
use File::Find qw /find/;
use Getopt::Std qw /getopts/;

## SUBROUTINES AND VARIABLES RELATED TO OPTION PARSING.

# Tell Getopt::Std to honour --help and --version: with this flag set,
# getopts() will dispatch to main::HELP_MESSAGE / main::VERSION_MESSAGE
# (defined below) and exit for those long options.
$Getopt::Std::STANDARD_HELP_VERSION = "1";

# Program version, interpolated into the --version output.
$main::VERSION = "0.1.0";

# Copyright year, interpolated into the --version output.
$main::YEAR = "2015";

# Prints the usage text for --help (called automatically by Getopt::Std
# when $Getopt::Std::STANDARD_HELP_VERSION is set, and explicitly from
# the main block when no arguments are given). Takes no arguments and
# its return value is only used as a truthy flag by the caller.
sub main::HELP_MESSAGE {
	# Fixed: the help previously claimed "-n and -r" were mutually
	# exclusive, but the program actually checks -o against -n.
	print "
Usage: deduper [options] folder (folder2 folder3 ...)

If one or more folders are passed to the program, Deduper locates duplicated files in the given folders based on their SHA256 checksum, and prints their information to the standard output.

The behaviour of the program can be altered using the following option switches:

-f: Follow directory symlinks (default is not to follow symlinks).
-o: Delete older duplicated files and leave the newest only. Use with caution.
-n: Delete newer duplicated files and leave the oldest only. Use with caution.
--help: Show this message and exit.
--version: Print version and copyright information and exit.
-o and -n are mutually exclusive.

The Deduper File Deduplicator ignores file symlinks. It is also unable to interpret hardlinks as what they are, and it will consider each hardlink to be a different file.

Symlink functionality is not available under Operating Systems and filesystems that don't support symlinks.
";
}

# Prints the version and copyright banner for --version (dispatched by
# Getopt::Std) and unconditionally at program start-up. Reads the
# package globals $main::VERSION and $main::YEAR.
sub main::VERSION_MESSAGE {
	print "Deduper File Deduplicator. Version $main::VERSION.\n",
		"Copyright (c) $main::YEAR, Rubén Llorente.\n",
		"All rights reserved.\n",
		"Distributed under the 3-clause BSD license.\n";
}

## SUBROUTINES RELATED TO PROGRAM OPERATION

# This subroutine takes a reference to a list of paths to scan for
# files and a reference to a hash that contains some options. It
# returns a hash of arrays. The keys of the hash are the paths of the
# scanned files, and the arrays contain the size, ctime and sha256
# checksum of each file.

sub get_file_list_with_data {
	my $paths = shift(); # This is an array reference: the folders/files given on the command line.
	my $options = shift(); # This is a hash reference: parsed switches; only 'f' (follow symlinks) is read here.

# This hash has a filename for key and an array of the form
# [ Size, Ctime, SHA256 ] as a value. (The lstat slice below takes
# fields 7 and 10, i.e. size and ctime.)

	my %file_information;
	
	@$paths = grep { # This block clears invalid paths away.
		# warn() returns a true value (1), so "(warn ...) & 0" yields 0:
		# the warning is emitted and the missing path is filtered out.
		( -e $_ ) ? 1 : (warn "Read error: $_ couldn't be found, or file is invalid, warning") & 0; 
	} @$paths; 

	return () if @$paths == 0; # Exit this block if no path is left after clearing.

# Subroutine used by the find function to process files in the 
# given subdirectories. It retrieves the size, ctime and sha256
# hash of each file for comparison. The file is identified via the
# $File::Find::name global, which holds the full path because the
# no_chdir option is set below. The return value is discarded. Refer
# to the documentation of the File::Find module.

	my $get_file_data_sub = sub {
		# lstat() also fills the "_" stat buffer, so the -f/-l/-r tests
		# below reuse it without extra system calls. The [7,10] slice is
		# (size, ctime); lstat on a broken path yields an empty slice,
		# making @stat empty/false and skipping the entry.
		# NOTE(review): surrounding comments speak of mtime, which would
		# be stat field 9, not 10 — confirm ctime is really intended.
		if ( my @stat = (lstat($File::Find::name))[7,10] and -f _ and not -l _ ) { # Check we are dealing with a regular file and obtain size and ctime in one step.
			if ( -r _ ) {
				open my $file_fh, '<', $File::Find::name;
				binmode($file_fh); # Hash the raw bytes, free of any I/O layer translation.
				 @{$file_information{$File::Find::name}} = ( @stat, sha256_hex(<$file_fh>) );
				close $file_fh;
			} else {
				warn "Read error: $File::Find::name couldn't be read, warning";
			}
		}
	};		

	my %find_options = ( # This block is needed for telling File::Find what to do.
		wanted => \&$get_file_data_sub,
		no_chdir => 1, # Keep $File::Find::name as a full path; do not chdir into subfolders.
		follow => 0, # Do not follow symlinks unless the -f switch overrides this below.
		follow_skip => 2 # When following, skip already-seen files/dirs instead of dying.
	);

	if ( $$options{f} ) { # Check if following symlinks has been enabled by a commandline option.
		$find_options{follow}=1;
	}
		
	find(\%find_options, @$paths);
	return %file_information;
}

# This routine takes the %file_information formatted hash and creates
# a hash whose keys are offending sha256 sums, and whose values are
# arrays containing the paths of the offending files. The array
# contains the filenames ordered by the age of each file.

sub group_duplicates {
	# Takes a reference to a %file_information-style hash
	# (filename => [ size, ctime, sha256 ]) and returns a hash whose
	# keys are the sha256 sums shared by two or more files, and whose
	# values are arrays with the offending paths, ordered by age via
	# sort_duplicates_by_age().
	#
	# Improvement: the previous implementation compared every file
	# against every other (O(n^2), with delete-while-iterating); this
	# version buckets paths by checksum in a single O(n) pass.
	my %file_information = %{shift()};

	# Bucket every file path under its checksum.
	my %paths_for_checksum;
	foreach my $file (keys %file_information) {
		push @{$paths_for_checksum{ $file_information{$file}[2] }}, $file;
	}

	# Keep only the checksums owned by more than one file, and collect
	# the stat data of those files for the age sort below.
	my %duplicated;
	my %information_buffer;
	foreach my $checksum (keys %paths_for_checksum) {
		next unless @{$paths_for_checksum{$checksum}} > 1;
		$duplicated{$checksum} = $paths_for_checksum{$checksum};
		$information_buffer{$_} = $file_information{$_}
			foreach @{$paths_for_checksum{$checksum}};
	}

	sort_duplicates_by_age(\%duplicated, \%information_buffer);
	return %duplicated;
}

# This routine sorts each group of duplicated files by age, leaving the
# newest file first. It takes two hash references as inputs.

sub sort_duplicates_by_age {
	# Reorders every group of duplicated paths in the first hash from
	# newest to oldest, using the timestamp stored as the second element
	# of each file's information array in the second hash. The first
	# hash is modified in place; no useful value is returned.
	my ($duplicated_files, $file_information) = @_;

	foreach my $checkhash (keys %$duplicated_files) {
		# Map every path in this group to its timestamp, then rebuild
		# the group sorted newest-first in one step.
		my %timestamp_of = map { $_ => $file_information->{$_}[1] }
			@{$duplicated_files->{$checkhash}};
		@{$duplicated_files->{$checkhash}} =
			sort { $timestamp_of{$b} <=> $timestamp_of{$a} } keys %timestamp_of;
	}
}

# This routine just prints information about the duplicated elements.
# It takes two hash references as inputs.

sub print_information {
	# Reports every group of duplicated files to standard output, or a
	# "nothing found" notice when the first hash is empty. Both
	# arguments are hash references; nothing useful is returned.
	my ($duplicated_files, $file_information) = @_;

	unless ( %$duplicated_files ) {
		print "\nNo duplicated files found.\n";
		return;
	}

	foreach my $checksum (keys %$duplicated_files) {
		print "\n########## DUPLICATION FOUND ##########\n";
		print "More than one file has the following SHA256 checksum: $checksum\n";
		foreach my $path ( @{$duplicated_files->{$checksum}} ) {
			# Information arrays hold [ size, ctime, sha256 ].
			my ($size, $ctime) = @{$file_information->{$path}}[0, 1];
			print "File: $path size: $size ctime: $ctime\n";
		}
		print "#######################################\n";
	}
}

# This routine deletes the oldest or newest duplicated files, leaving
# only one copy of each, as directed by the command line switches. Use
# with caution. No useful return is provided in this case.

sub delete_duplicated_files {
	# Removes all but one file from every duplicate group. The groups
	# arrive ordered newest-first, so by default (-o) the newest file
	# survives; with -n the group is reversed first so the oldest
	# survives. Arguments are two hash references (groups, options).
	my ($duplicated_files, $options) = @_;

	unless ( %$duplicated_files ) {
		print "\nNothing to remove.\n";
		return;
	}

	# Announce which end of each group is about to be culled.
	if ( $options->{o} ) {
		print "\nOlder files are being removed.\n";
	} else {
		print "\nNewer files are being removed.\n";
	}

	foreach my $checkhash (keys %$duplicated_files) {
		my $group = $duplicated_files->{$checkhash};
		@$group = reverse @$group if $options->{n};
		while ( @$group > 1 ) {
			my $victim = pop @$group;
			# Never delete the survivor, even if the same path was
			# somehow listed twice in the group.
			next if $group->[0] eq $victim;
			print "$victim is being removed.\n";
			unlink $victim;
			# Some untested filesystems or operating systems take more than an unlink for having the file deleted, in which case unlink will not die, but the file will still exist. Issue a warning if this happens.
			warn "There was a failure while trying to remove $victim, it still exists. Check you are not using some versioning filesystem, warning" if -e $victim;
		}
	}
}

# MAIN CODE EXECUTION BEGINS.
		
		
# The bare block below acts as a single-iteration loop, which lets
# "last" bail out of the program flow early (after printing the help
# message); the trailing "exit 0" then terminates the script.
{
	my %options; # Parsed switches: f (follow symlinks), o (delete older), n (delete newer).
	my %file_stats; # filename => [ size, ctime, sha256 ] for every scanned file.
	my %duplicated_stats; # sha256 => [ paths of duplicated files ].
	getopts('fon', \%options);

	main::VERSION_MESSAGE(); 

	main::HELP_MESSAGE() and last unless @ARGV; # Show help and exit the program if no command line arguments exist.

	# Fixed spelling in the error message ("exclussive" -> "exclusive").
	die "-o and -n are mutually exclusive, stopped" if $options{o} and $options{n};

	print "\nStarting identification of duplicated files. This could take a while.\n";

	%file_stats = get_file_list_with_data(\@ARGV, \%options);

	%duplicated_stats = group_duplicates(\%file_stats);

	print_information(\%duplicated_stats, \%file_stats);
	
	# Only destructive modes trigger deletion; the default run is read-only.
	delete_duplicated_files(\%duplicated_stats, \%options) if $options{o} or $options{n};
}

exit 0;
