:
##########################################################################
# Title      :	finddup - find duplicate files
# Author     :	Heiner Steven <heiner.steven@odn.de>
# Date       :	2002-09-16
# Requires   :	awk, find, sort, and one of md5sum, cksum or sum
# Category   :	File Utilities
# SCCS-Id.   :	@(#) finddup	1.6 18/05/03
##########################################################################
# Description
#	Prints a list of duplicate files. Files are assumed to
#	be identical if they have the same checksum as calculated
#	by the $SUM program below (preferred: "md5sum").
#
# Note
#    o	Cannot handle files with embedded LF (ASCII 10) characters
#    o	If a large number of files has the same checksum, they
#	probably have a file size of 0 bytes.
#    o	For improved speed the program sorts files by file
#	size, and only compares files with equal size.
#	This additional work only pays off for large files.
##########################################################################

# Program name (last path component of $0) and version; both appear in
# the usage text, the program name also prefixes all messages.
PN=$(basename "$0")
VER=1.6

# The search for the fastest available program can be skipped by
# presetting any of the following variables:
#SUM=
#NAWK=
#FIND=

# Arguments appended to every find(1) invocation after the command line
# arguments.
findargs='-type f'

###############################################################################
# searchprog - search program using search PATH
# usage: searchprog program
#
# Prints the path of the first executable regular file named "program"
# that is found in one of the directories listed in $PATH, and returns 0.
# Prints nothing and returns 1 if there is no such file.
# Empty PATH elements (leading, trailing or doubled ":") denote the
# current directory and are searched as ".".
###############################################################################

searchprog () (
    # The function body is a subshell, so the IFS and "set -f" changes
    # made below cannot leak into the caller.
    _search=$1
    _path=$PATH

    # Field splitting does not create an empty field for a trailing
    # delimiter. Append another ":" so that a trailing empty element
    # (= current directory) is not lost.
    case "$_path" in
        *:) _path="$_path:" ;;
    esac

    # Split on ":" only (directories may contain whitespace), and do not
    # expand glob characters that are part of a directory name.
    IFS=:
    set -f
    for _dir in $_path
    do
        [ -n "$_dir" ] || _dir=.

        # "-x" alone is also true for searchable directories.
        [ -f "$_dir/$_search" ] && [ -x "$_dir/$_search" ] || continue

        printf '%s\n' "$_dir/$_search"
        return 0
    done

    return 1
)

# usage - write the help text to standard error and terminate the script
# with exit status 1. Reads $PN, $VER and $findargs.
usage () {
    # Assemble the complete text first, then emit it in one piece.
    _usage_text="$PN - find duplicate files, $VER
usage: $PN [-lv] path [path ...] [findargs]
    -l:  long output format
    -v:  print progress messages

The path specifications can be directories or files. \"findargs\" are
parameters to the find(1) command, the default is \"$findargs\".

The short output format lists the number of duplicate files,
and a list of files separated by TAB all in one line.

The long output format lists the number of duplicates and
the checksum in one line, followed by a colon. After that the file
names follow, each on a separate line. An empty line terminates the list."

    echo "$_usage_text" >&2
    exit 1
}

# msg - write every argument as a separate line to standard error, each
# prefixed with the program name ($PN) and a colon.
msg () {
    for _msg_line in "$@"
    do
        echo "$PN: $_msg_line" >&2
    done
}

# fatal - report the given message lines using msg, then terminate the
# script with exit status 1.
fatal () {
    msg "$@"
    exit 1
}

###############################################################################
# add_size_column - prefix each file name with its size (in bytes)
#
# Input:  path names on standard input, one path each line
# Output: one line "size/path" for each path, e.g. "123/./testdata.txt"
#
# A path for which ls(1) prints nothing (e.g. because the file vanished
# in the meantime) is skipped.
###############################################################################

add_size_column () {
    # Note that reading input line-by-line usually is very slow.
    # "IFS=" only applies to the "read" command: the line is read
    # unchanged, and the IFS setting of the caller is never touched.
    while IFS= read -r path
    do
        # The fifth column of the "ls -l" output is taken as the size.
        size=`ls -ld "$path" | awk '{ print $5 }'`
        [ -n "$size" ] || continue

        # Use a slash ("/") separator, because it cannot appear
        # within a file name (except as a directory delimiter).
        # printf "%s" writes the path verbatim, so the name needs no
        # quoting and never becomes part of an AWK program.
        printf '%s/%s\n' "$size" "$path"
    done
}

###############################################################################
# checksum_files - create checksum of a file's content
#
# Input: list of path names on standard input, one path each line
# Output: alternating path name, checksum of path
#
# The checksum is calculated by the program named in $SUM. A path is
# only written together with its checksum line: if $SUM fails or prints
# nothing (e.g. for an unreadable file), the path is left out, so the
# output always consists of complete path/checksum line pairs.
###############################################################################

checksum_files () {
    (
        # Runs in a subshell to keep the loop variables away from the
        # caller. "IFS=" makes "read" return the line unchanged.
        while IFS= read -r path
        do
            # Run the checksum program first. Printing the path before
            # knowing whether a checksum line follows would break the
            # strict path/checksum alternation for all following files.
            sumline=`"$SUM" "$path"` || continue
            [ -n "$sumline" ] || continue

            printf '%s\n%s\n' "$path" "$sumline"
        done
    )
}

###############################################################################
# Start of main script
###############################################################################

# Parse the options with the "getopts" built-in of the shell. It leaves
# every remaining argument intact, while re-splitting the output of
# getopt(1) would break path names containing whitespace or glob
# characters. "getopts" stops at the first non-option argument, so
# find(1) arguments following the paths (e.g. "-name") are not touched.
# The leading ":" of the option string suppresses the built-in error
# messages; unknown options are answered with the usage text instead.

LongOutput=false
Verbose=false
while getopts :hlv _opt
do
    case "$_opt" in
        l)  LongOutput=true;;
        v)  Verbose=true;;
        *)  usage;;                     # -h, or an unknown option
    esac
done
shift $((OPTIND - 1))                   # Remove the options (and "--")

[ $# -lt 1 ] && usage                   # At least one path is required

# Find fastest AWK implementation (unless NAWK was preset); fall back to
# plain "awk" if none of the candidates is found in PATH.

[ -n "$NAWK" ] ||
    NAWK=`searchprog gawk || searchprog mawk || searchprog nawk || echo awk`

# Find a program to calculate the checksum. "md5sum" generates better
# checksums than BSD "sum", which still creates better checksums than
# System V "sum".

[ -n "$SUM" ] ||
    SUM=`searchprog md5sum || searchprog cksum || searchprog sum`

# Configure the input column numbers AWK should use as checksum. Be
# careful when changing the following lines, because quoting is
# important: "fieldlist" is inserted as an expression into an AWK
# program later.

case "$SUM" in
    *md5sum)
        fieldlist='$1'
        ;;
    *cksum | *sum)                      # "cksum": Solaris
        fieldlist='$1 " " $2'
        ;;
    *)
        fatal "cannot find \"md5sum\" or \"sum\""
        ;;
esac

[ -n "$FIND" ] ||
    FIND=`searchprog gfind || searchprog find`

# Check if the find(1) program supports the -printf "%s" action for
# printing the file size. This can more than double the speed of the
# script.

find_has_printf=false
"$FIND" /dev/null -printf "%s/%p\n" >/dev/null 2>&1 && find_has_printf=true

if [ $Verbose = true ]
then
    msg "using programs SUM=$SUM; NAWK=$NAWK; FIND=$FIND" \
        "find_has_printf=$find_has_printf"
fi

# Main pipeline:
#   1. list all candidate files as "size/path" lines
#   2. sort them by size
#   3. keep only files sharing their size with at least one other file
#   4. calculate a checksum for each remaining file
#   5. group the files by checksum and print all groups with two or
#      more members
# The exit status of the script is the exit status of the last AWK
# program of the pipeline.

if [ "$find_has_printf" = "true" ]
then
    # find(1) -printf format specifications:
    #   %s - file size (bytes)
    #   %p - file path

    # $findargs is intentionally unquoted: it holds several arguments.
    "$FIND" "$@" $findargs -printf "%s/%p\n"
        # Output example: 123/./testdata.txt
else
    "$FIND" "$@" $findargs -print |
        # Prepend the file size to the file name
        add_size_column
        # Output example: 123/./testdata.txt
fi |
    # Input example: 123/./testdata.txt

    # Sort by file size, ascending order
    sort -t/ -n |

    # Shell values are handed over using "-v" assignments instead of
    # being pasted into the AWK program text.
    "$NAWK" -v verbose_flag="$Verbose" -v progname="$PN" '
        # Print groups of files (at least two) with the same size

        # Return the input line without its leading "size/" field.
        function remove_sizefield(s) {
            sub(/^[0-9]*\//, "", s)
            return s
        }

        BEGIN {
            verbose = (verbose_flag == "true")
            FS = "/"
            prev_size = -1
            same_size_run = 0
            input_lines = 0
            output_lines = 0
        }
        {
            ++input_lines
            size = $1

            if ( size == prev_size ) {
                if ( !same_size_run ) {
                    # Start of a run of files with the same size:
                    # the previous file is the first member of the run.
                    same_size_run = 1
                    print remove_sizefield(prev_line)
                    ++output_lines
                }
                print remove_sizefield($0)
                ++output_lines
            } else {
                same_size_run = 0
            }

            prev_size = size
            prev_line = $0
        }
        END {
            if ( verbose ) {
                print progname ": INFO: " output_lines " of " input_lines \
                    " files need to be compared by file content." \
                    | "cat >&2"
            }
        }
    ' |

    # Create checksums on the (hopefully reduced) list of files
    # where at least two files have the same size

    # Input: path names, one each line
    checksum_files |
    # Output: alternating path name, checksum output lines

    # Group the files by checksum, and print all checksums with
    # two or more file names.
    "$NAWK" -v longoutput_flag="$LongOutput" '
        function replace_whitespace(s) {
            gsub(/[ \t]/, "_", s)
            return s
        }

        BEGIN {
            longoutput = (longoutput_flag == "true")
            filesep = longoutput ? "\n" : "\t"
        }
        NR % 2 == 1 {                   # odd lines: file name
            # The line number decides, not the value of the name: a
            # name that looks like the number zero (e.g. "0") would
            # count as "false" in a test like "!file".
            file = $0
            next
        }
        {                               # even lines: checksum
            idx = '"$fieldlist"'
            if ( idx in files ) {
                files[idx] = files[idx] filesep file
            } else {
                files[idx] = file
            }
            ++count[idx]
        }
        END {
            dups = 0
            for (idx in files) {
                if ( count[idx] > 1 ) {
                    ++dups
                    if ( longoutput ) {
                        print "duplicates",count[idx], "x",
                                replace_whitespace(idx) ":"
                        n = split(files[idx], f, filesep)
                        for ( i=1; i<=n; ++i ) print f[i]
                        print ""
                    } else {
                        print count[idx] filesep files[idx]
                    }
                }
            }
            exit (dups == 0)    # zero if duplicate found
        }
    '

# exits with "0" if duplicates were found, "1" otherwise