#!/usr/bin/perl
$^W=1;
# let's cover most/all module requirements early
use strict;
use warnings;
no warnings qw(recursion);
use warnings;
use File::Basename;
use File::Find;
use File::Temp ();
use FileHandle;
use Getopt::Long;

# vi(1): set tabstop=4

# configure Getopt::Long before parsing @ARGV
Getopt::Long::Configure (
	"posix_default", # start with POSIX (compatible) defaults
	"bundling" # then tweak as seems fitting
);

# blocksize to use for read
my $blocksize=4096;

# overall exit status; set to 1 on any non-fatal error, reported at exit
my $exit_val=0;

# one-line usage synopsis (trailing newline added after the heredocs,
# which interpolate $usage without one)
my $usage="usage: $0 [--help|-h|-?] [--nochange|-n] [--noxdev|--nomount] [--recursive|-r|-R] [--verbose|-v] pathname [...]";
# short help text; <<"" heredocs terminate at the first empty line
my $helptext=<<""
$usage
	--help|-h|-?
		Help - provide some basic usage information, overrides other
		options, if preceded by --verbose|-v option, provides more
		detailed information.
	--nochange|-n
		Change nothing, but say what would otherwise be done.
	--noxdev|--nomount
		Descend directories on other file systems.
	--recursive|-r|-R
		Recursively descend directories.
	--verbose|-v
		Be verbose
	pathname - pathname(s) to examine

;
$usage="$usage\n";
# verbose help: the short help text plus a description of behavior
my $verbosehelp=<<""
$helptext
	$0 examines pathname(s) looking for distinct occurrences of
	non-zero length regular files on the same file system with identical
	data content in which case $0 attempts to replace such occurrences
	with hard links to the occurrence having the oldest modification
	time, or if the modification times are identical, the occurrence
	having the largest number of links, or if the link count is also
	identical to an arbitrarily selected occurrence.

;

# detabify - expand each tab to four spaces for display
$usage=~s/\t/    /og;
$helptext=~s/\t/    /og;
$verbosehelp=~s/\t/    /og;

# option flags; all default to off (undef)
my $help=undef;
my $nochange=undef;
my $noxdev=undef;
my $recursive=undef;
my $verbose=undef;
# parse the command line; die with usage on any unrecognized option
GetOptions (
	"help|h|?" => \$help,
	"nochange|n" => \$nochange,
	"noxdev|nomount" => \$noxdev,
	"recursive|r|R" => \$recursive,
	"verbose|v" => \$verbose
) or die "$0: bad option(s), aborting\n${usage}aborting";

# --help: print (verbose, if -v preceded it) help on stdout;
# exit 1 only if the print itself fails, otherwise exit 0
if($help){
	if($verbose){
		print "$verbosehelp" or
		exit 1;
	}else{
		print "$helptext" or
		exit 1;
	};
	exit 0;
};

# with no pathname arguments there is nothing to do; show usage, fail
unless(@ARGV){
	print "$usage";
	exit 1;
};

# initialize our hash keyed by dev
my %d=();
# %d layout:
# hash keyed by dev, each value
#	hash keyed by size, each value
#		hash keyed by ino, each value
#			array of mtime, nlink and
#				hash by pathname(s)
#				for the pathnames hash we use the pathnames as keys,
#				we don't much care about the values in that hash (just go
#				for something small, like undef, '', or 0)
#d=	{
#		dev,	{
#					size,	{	ino,[mtime,nlink,{pathname=>,...}],
#								ino,[mtime,nlink,{pathname=>,...}],
#								...
#							},
#					size,	{	ino,[mtime,nlink,{pathname=>,...}],
#								ino,[mtime,nlink,{pathname=>,...}],
#								...
#							},
#					...
#				}
#		...
#	}
# the lstat(2) fields we use, by list position:
#(lstat(_))[0] dev
#(lstat(_))[1] ino
#(lstat(_))[3] nlink
#(lstat(_))[7] size
#(lstat(_))[9] mtime

# symbolic names for the lstat list elements above
my $lstat_dev=0;
my $lstat_ino=1;
my $lstat_nlink=3;
my $lstat_size=7;
my $lstat_mtime=9;

# prefix for the temporary link names created next to each target
my $tmpprefix='.' . basename($0);

# getfhrp(LIST)
# takes a list of pathnames (which should all name the same file) and
# tries to open each for reading until one succeeds.
# Returns a list: (reference to the filehandle of the first pathname
# successfully opened, that pathname, then the other pathname(s)).
# Returns an empty list if it can't open any of the pathnames,
# complains (warn) about open failure(s) and records them in $exit_val.
sub getfhrp{
	my @pathnames=@_;
	for my $i (0..$#pathnames){
		my $filehandleref;
		# three-argument open so a pathname beginning with '>', '|',
		# '<' or whitespace cannot be misinterpreted as an open mode
		if(open($filehandleref,'<',$pathnames[$i])){
			return	(
						$filehandleref,
						$pathnames[$i],	# pathname we successfully opened first
						# then the rest
						@pathnames[0..$i-1],
						@pathnames[$i+1..$#pathnames]
					)
			;
		}
		warn "$0: failed to open $pathnames[$i]: $!\n";
		$exit_val=1;
	};
	# bare return: an empty list in list context; a "return undef" here
	# would hand the caller a one-element (true!) list on total failure
	return;
};

# dolink(ARRAYREFS)
# takes a list of array references; the first array holds the source
# pathname(s) used for linking, subsequent arrays hold link targets.
# Within each array, one or more pathnames to what should be the same
# file (same device and inode) are provided; the additional source
# pathnames may be tried should the first one fail. All targets are
# tried - notwithstanding $nochange (which reports instead of acting).
# Links are made via a temporary name in the target's directory and
# then rename(2)d into place, so the target is never left missing.
sub dolink{
	my @source=@{$_[0]};
	for my $i (1..$#_){
		for my $j (0..$#{$_[$i]}){
			# temporary filename in directory of target $_[$i][$j];
			# we use eval to be able to trap failures (e.g. if
			# dirname($_[$i][$j]) ceases to exist)
			my $tempfile=eval {
				File::Temp::tempnam(dirname($_[$i][$j]),$tmpprefix);
			};
			unless(defined($tempfile)){
				# couldn't construct $tempfile in same directory as target
				warn(
					"$0: ",
					"File::Temp::tempnam(dirname($_[$i][$j]),$tmpprefix)",
					" failed: $@\n"
				) if $verbose;
				warn "$0: Cannot link to $_[$i][$j]\n";
				$exit_val=1;
				# track that we got an error and proceed to next target
				next;
			};

			for my $k (0..$#source){
				if($nochange||link($source[$k],$tempfile)){
					if($nochange||rename($tempfile,$_[$i][$j])){
						if($nochange){
							print "$0: link $source[$k] to $_[$i][$j]\n";
						}elsif($verbose){
							print "$0: linked $source[$k] to $_[$i][$j]\n";
						};
					}else{
						warn "$0: rename($tempfile,$_[$i][$j]) failed: $!\n";
						warn "$0: Cannot link to $_[$i][$j]\n";
						unlink($tempfile);
						$exit_val=1;
					};
					last; # that source worked, no need to try others
				}else{
					warn "$0: link($source[$k],$tempfile) failed: $!\n";
					if($k>=$#source){
						# our final possible source failed for this target
						warn(
							"$0: Cannot link any of ('",
							join("','",@source),
							"') to $_[$i][$j]\n"
						);
					};
					$exit_val=1;
				};
			};
		};
	};
};

# cmpln(LIST)
# takes a list of array references, each [filehandle ref, pathname(s)...]
# describing one inode; all are on the same device, of the same non-zero
# size, and have had identical content read so far.
# Reads one block from each handle, buckets the handles by block
# content, hard-links together (via dolink) the sets that reach EOF
# simultaneously (i.e. proved identical), and recurses on each bucket
# of still-possible matches.
sub cmpln{
	local $/=\$blocksize; # try to read efficiently - by $blocksize
	my %h=(); # we'll build hash of (still) possible matches by key
	my @matched=(); # sets of matched files
	for (@_){
		my $fhr=${$_}[0];
		# defined(), not plain truth: a short final block that reads as
		# the string "0" must not be mistaken for EOF
		if(defined(my $buf=<$fhr>)){
			if(exists($h{$buf})){
				push @{$h{$buf}},[@{$_}];
			}else{
				$h{$buf}=[[@{$_}]];
			};
		}else{
			# EOF: everything read so far matched, this file belongs
			# to the matched set for this recursion depth
			if(close($fhr)){
				push @matched,[@{$_}[1..$#{$_}]];
			}else{
				warn "$0: error closing ${$_}[1]\n";
				$exit_val=1;
			};
		};
	};

	# handle matches
	if($#matched==0){
		# nothing to match to, empty it
		@matched=();
	}elsif($#matched>=1){
		dolink(@matched);
		@matched=();
	};

	# the keys on %h can be large, particularly in cumulative effect
	# when we go recursive, so let's simplify, since we no longer need
	# those keys themselves
	my @h=();
	for (keys %h){
		if($#{$h{$_}}>=1){
			push @h,$h{$_};
		}else{
			# only a single file, nothing to compare, just close it
			unless(close (${$h{$_}}[0][0])){
				warn "$0: error closing ${$h{$_}}[0][1]\n";
				$exit_val=1;
			};
		};
		delete $h{$_};
	};
	for (@h){
		# our recursion may be rather/quite extensive/deep, depending on
		# file(s) examined, so let's suppress the
		# Deep recursion on subroutine "%s"
		# warnings
		no warnings qw(recursion);

		# "In order to understand recursion,
		#  you must first understand recursion." - Edsger Dijkstra
		cmpln(@{$_});
	};
};

# we call process via find for each name visited; it records candidate
# files (regular, readable, non-zero size, not a symlink) into %d,
# keyed by device, size and inode
sub process{
	unless($recursive){
		$File::Find::prune=1;
	};

	# lstat once and keep the whole result; the $lstat_* indices name
	# the elements we care about (the original called lstat(_) per field)
	my @st=lstat($_);
	# NOTE(review): this treats a device number of 0 the same as an
	# lstat failure, matching the original truth test on dev
	unless($st[$lstat_dev]){
		warn "$0: lstat($File::Find::name) failed\n";
		$exit_val=1;
		return;
	};
	# unless told otherwise, don't cross file system boundaries: prune
	# anything on a different device than the top-level starting point
	if(!$noxdev&&$st[$lstat_dev]!=$File::Find::topdev){
		$File::Find::prune=1;
		return;
	}

	# make sure we're not a symbolic link, otherwise skip
	# we can use _ as we already used lstat and
	# since we're not using follow or follow_fast
	return if -l _;

	# only interested in ordinary files that are readable and non-zero size
	return unless -f _ && -r _ && -s _;

	# autovivify the per-inode record, creating the dev and size levels
	# as needed; this collapses the original four-way exists() cascade
	# (||= is safe: an existing value is always a true array reference)
	my $inode_record=
		$d{$st[$lstat_dev]}{$st[$lstat_size]}{$st[$lstat_ino]} ||=
			[$st[$lstat_mtime],$st[$lstat_nlink],{}];
	# record this pathname for the inode; values in the pathname hash
	# are unused, only the keys matter
	$inode_record->[2]{$File::Find::name}=undef;
}

# walk every command-line pathname, handing each visited name to
# process(), which stuffs our data into %d
find(\&process,@ARGV);

# we have all the data we need except
# we still need to read and compare data in the files for each inode
# where the device and size data matches and we have
# multiple inodes

# we no longer need the device and size data itself so long as we don't
# confuse data that is associated with a unique device and size
# combination with that of another
for my $d (keys %d) {
	for my $s (keys %{$d{$d}}) {

		# do we have only one ino for this dev and size?
		# (a hash in list context yields key,value pairs, so <=2
		# elements means at most one inode)
		if(scalar(()=%{$d{$d}{$s}})<=2){
			# if so, delete that data and go to next iteration of this loop
			delete $d{$d}{$s};
			next;
		}

		# sort inodes (of same dev and size) by mtime and number of links,
		# once we've done that, we don't care about inode numbers
		# themselves (or nlinks), as long as we keep their mtime/pathname(s)
		# associated with them separate, so we'll simplify (and sort) our
		# data structure at this time, from:
		# {	ino,[mtime,nlink,{pathname=>,...}],
		#	ino,[mtime,nlink,{pathname=>,...}],
		#	...
		# }
		# and add opened file handles, so we then have:
		# [
		# [fhr,pathname,...]
		# ...
		# ]

		my @f=(); # sets of file data to be checked for sameness
		# we'll build @f up as array of anonymous arrays each with first
		# element as reference to anonymous filehandle and remaining
		# element(s) as pathname(s)

		for my $i
			(sort
				{
					# oldest mtime first
					$d{$d}{$s}{$a}[0] <=> $d{$d}{$s}{$b}[0] or
						# if mtimes match, highest nlink first
						$d{$d}{$s}{$b}[1] <=> $d{$d}{$s}{$a}[1]
				} keys %{$d{$d}{$s}}
			)
		{
			# on total open failure getfhrp may yield an empty list or a
			# lone undef; guard against both so we never queue a record
			# with an unopened (undef) filehandle
			my @fhrp=getfhrp(keys %{$d{$d}{$s}{$i}[2]});
			if(@fhrp&&defined($fhrp[0])){
				push @f,[@fhrp];
				delete ${${$d{$d}}{$s}}{$i}; #don't need that data there anymore
			};
			# on failure, the warnings from the failed open(s) provide
			# sufficient diagnostics, nothing else to do here
		}
		cmpln(@f); #do actual comparison and linking as applicable
		delete ${$d{$d}}{$s}; #finished processing for that dev and size
	}
	delete $d{$d}; #finished processing for that dev
}

# report accumulated status: 0 on full success, 1 if anything failed
exit $exit_val;

#initialize
#	lstat file(s)
#	open file handles
#	read data
#	hash keyed by data
#	recursively handle possible matches
