:
##########################################################################
# Title      :  finddup - find duplicate files
# Author     :  Heiner Steven
# Date       :  2002-09-16
# Requires   :  sum
# Category   :  File Utilities
# SCCS-Id.   :  @(#) finddup 1.6 18/05/03
##########################################################################
# Description
#       Prints a list of duplicate files. Files are assumed to
#       be identical if they have the same checksum as calculated
#       by the $SUM program below (preferred: "md5sum").
#
# Note
#    o  Cannot handle files with embedded LF (ASCII 10) characters
#    o  If a large number of files has the same checksum, they
#       probably have a file size of 0 bytes.
#    o  For improved speed the program sorts files by file
#       size, and only compares files with equal size.
#       This additional work only pays off for large files.
#
# Exit status
#       0 if at least one group of duplicates was found, 1 otherwise
#       (also 1 after a usage or fatal error).
##########################################################################

PN=`basename "$0"`                      # Program name
VER='1.6'

# Set the following variables to disable the search for the fastest program:
#SUM=
#NAWK=
#FIND=

# Default arguments appended to every find(1) invocation
findargs="-type f"

###############################################################################
# searchprog - search program using search PATH
# usage: searchprog program
#
# Prints the full path of the first executable regular file named
# "program" found in $PATH and returns 0; returns 1 if there is none.
###############################################################################

searchprog () {
    _search=$1
    # A leading or trailing empty PATH component denotes the current
    # directory; make it explicit before splitting the list at ":".
    for _dir in `echo "$PATH" | sed "s/^:/.:/;s/:\$/:./;s/:/ /g"`
    do
        # "-x" alone is also true for searchable directories
        [ -f "$_dir/$_search" ] && [ -x "$_dir/$_search" ] || continue
        echo "$_dir/$_search"
        return 0
    done
    return 1
}

###############################################################################
# usage - print the usage message to standard error and exit with status 1
###############################################################################

usage () {
    echo >&2 "$PN - find duplicate files, $VER
usage: $PN [-lv] path [path ...] [findargs]
    -l:  long output format
    -v:  print progress messages

The path specifications can be directories or files. \"findargs\" are
parameters to the find(1) command, the default is \"$findargs\".

The short output format lists the number of duplicate files, and a list
of files separated by TAB all in one line.
The long output format lists the number of duplicates and the checksum in
one line, followed by a colon. After that the file names follow, each
on a separate line. An empty line terminates the list."
    exit 1
}

###############################################################################
# msg   - print each argument as one line on standard error, prefixed
#         with the program name
# fatal - like msg, but terminates the script with exit status 1
###############################################################################

msg () {
    for MsgLine
    do echo "$PN: $MsgLine"
    done >&2
}

fatal () { msg "$@"; exit 1; }

###############################################################################
# add_size_column - prefix each file name with its size (in bytes)
#
# Input:  list of path names on standard input, one path each line
# Output: one line "size/path" for each path "ls -ld" could report on
#
# Only used if find(1) has no "-printf" action.
###############################################################################

add_size_column () {
    # Note that reading input line-by-line usually is very slow.
    # "IFS=" and "-r" keep leading/trailing blanks and backslashes intact.
    while IFS= read -r path
    do
        # Column 5 of the "ls -ld" output is the file size in bytes.
        size=`ls -ld "$path" | "${NAWK:-awk}" '{ print $5 }'`

        # No output from ls(1) (e.g. the file vanished): skip the file
        [ -n "$size" ] || continue

        # Use a slash ("/") separator, because it cannot appear
        # within a file name (except as a directory delimiter).
        # The path is printed by the shell itself, so it never has to
        # be quoted for embedding into AWK program text.
        printf '%s/%s\n' "$size" "$path"
    done
}

###############################################################################
# checksum_files - create checksum of a file's content
#
# Input:  list of path names on standard input, one path each line
# Output: alternating path name, checksum of path
#
# A path is only printed together with its checksum line. Files the
# checksum program cannot read are reported on standard error and left
# out, so that the strict path/checksum alternation the consumer relies
# on is never broken.
###############################################################################

checksum_files () {
    while IFS= read -r path
    do
        # Feed the file on standard input instead of passing its name:
        # the checksum line then never depends on the file name (GNU
        # md5sum prefixes the checksum with "\" if the name contains a
        # backslash, which would hide duplicates).
        if sum=`"$SUM" < "$path"` && [ -n "$sum" ]
        then
            printf '%s\n%s\n' "$path" "$sum"
        else
            msg "WARNING: cannot calculate checksum, ignoring: $path"
        fi
    done
}

###############################################################################
# Start of main script
###############################################################################

LongOutput=false
Verbose=false

# Use the "getopts" builtin: it leaves the remaining arguments untouched,
# so path names containing whitespace survive, and option processing
# stops at the first path (find(1) arguments like "-name" may follow).
while getopts :hlv Opt
do
    case "$Opt" in
        l)  LongOutput=true;;
        v)  Verbose=true;;
        *)  usage;;                     # -h or an unknown option
    esac
done
shift `expr $OPTIND - 1`

[ $# -lt 1 ] && usage                   # At least one path is required

# Find fastest AWK implementation
: ${NAWK:=`searchprog gawk || searchprog mawk || searchprog nawk || echo awk`}

# Find a program to calculate the checksum. "md5sum" generates better
# checksums than BSD "sum", which still creates better checksums than
# System V "sum".
: ${SUM:=`searchprog md5sum || searchprog cksum || searchprog sum`}

# Configure the input column numbers AWK should use as checksum. Be
# careful when changing the following lines, because quoting is
# important.
case "$SUM" in
    *md5sum)    fieldlist='$1';;
    *cksum)     fieldlist='$1 " " $2';;         # Solaris
    *sum)       fieldlist='$1 " " $2';;
    *)          fatal "cannot find \"md5sum\" or \"sum\"";;
esac

: ${FIND:=`searchprog gfind || searchprog find`}
[ -n "$FIND" ] || fatal "cannot find \"find\""

# Check if the find(1) program supports the -printf "%s" action for
# printing the file size. This can more than double the speed of the
# script.
if "$FIND" /dev/null -printf "%s/%p\n" >/dev/null 2>&1
then find_has_printf=true
else find_has_printf=false
fi

[ $Verbose = true ] &&
    msg "using programs SUM=$SUM; NAWK=$NAWK; FIND=$FIND" \
        "find_has_printf=$find_has_printf"

if [ "$find_has_printf" = "true" ]
then
    # find(1) -printf format specifications:
    #   %s - file size (bytes)
    #   %p - file path
    "$FIND" "$@" $findargs -printf "%s/%p\n"
    # Output example: 123/./testdata.txt
else
    "$FIND" "$@" $findargs -print |
        # Prepend the file size to the file name
        add_size_column
    # Output example: 123/./testdata.txt
fi |
    # Input example: 123/./testdata.txt
    # Sort by file size, ascending order
    sort -t/ -n |
    "$NAWK" '
        # Print groups of files (at least two) with the same size

        # Strip the leading "size/" prefix from an input line
        function remove_sizefield(s) {
            sub(/^[0-9]*\//, "", s)
            return s
        }

        BEGIN {
            verbose = ("'"$Verbose"'" == "true")
            FS = "/"
            prev_size = -1
            same_size_run = 0
            input_lines = 0
            output_lines = 0
        }

        {
            ++input_lines
            size = $1
            filename = $0

            if ( size == prev_size ) {
                if ( !same_size_run ) {
                    # Start of run of files with same size: the
                    # previous file belongs to the run, too
                    same_size_run = 1
                    print remove_sizefield(prev_filename)
                    print remove_sizefield(filename)
                    output_lines += 2
                } else {
                    print remove_sizefield(filename)
                    ++output_lines
                }
            } else {
                same_size_run = 0
            }
            prev_size = size
            prev_filename = filename
        }

        END {
            if ( verbose+0 ) {
                print "'"$PN"': INFO: " output_lines+0 " of " input_lines+0 \
                    " files need to be compared by file content." \
                    | "cat >&2"
            }
        }
    ' |
    # Create checksums on the (hopefully reduced) list of files
    # where at least two files have the same size
    # Input: path names, one each line
    checksum_files |
    # Output: alternating path name, checksum output lines
    # Group the files by checksum, and print all checksums with
    # two or more file names.
    "$NAWK" '
        # Make a checksum printable as one word
        function replace_whitespace(s) {
            gsub(/[ \t]/, "_", s)
            return s
        }

        BEGIN {
            longoutput = ("'"$LongOutput"'" == "true")
            if ( longoutput ) {
                filesep = "\n"
            } else {
                filesep = "\t"
            }
        }

        # Odd lines carry the path name. Line parity is used instead of
        # testing the variable, because a path like "0" tests as false.
        NR % 2 == 1 {
            file = $0
            next
        }

        # Even lines carry the checksum of the path read just before
        {
            idx = '"$fieldlist"'
            if ( idx in files ) {
                files[idx] = files[idx] filesep file
            } else {
                files[idx] = file
            }
            ++count[idx]
        }

        END {
            dups = 0
            for ( idx in files ) {
                if ( count[idx] < 2 ) continue
                ++dups
                if ( longoutput ) {
                    print "duplicates",count[idx], "x", replace_whitespace(idx) ":"
                    n = split(files[idx], f, filesep)
                    for ( i = 1; i <= n; ++i ) print f[i]
                    print ""
                } else {
                    print count[idx] filesep files[idx]
                }
            }
            exit (dups == 0)            # zero if duplicate found
        }
    '
# exits with "0" if duplicates were found, "1" otherwise