See also: Heapify

Back to doug's directory

File information
Filename: hashduplicates.py Uploaded: Fri, 18th Sep 2009 00:02:00
Size (bytes): 5.5 KiB md5 checksum: 26571bbef98d2ed9e7b0766e5fdc3542
Uploader doug Download: hashduplicates.py
Description:

Find all files with duplicate checksums in a named directory tree, using any of several hash functions. Useful for finding files with identical contents across your whole file system in a fast and relatively reliable way. I say "relatively reliable" because it is possible for two files with different contents to have the same checksum. Were the tool to compare files byte for byte, walking your entire file system would take considerably longer, unless you had a lot more RAM than any one person should have or your file system is tiny.

#!/usr/bin/env python
 
"""
Hash duplication finder, hashduplicates.py
Copyright (C) 2009 Douglas Lawrie
 
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
 
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
 
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
 
 
import os, hashlib, sys, getopt, stat
 
 
# Hash constructors the user can select by name with -f / --hash-func.
hashFunctions = {
	"md5":    hashlib.md5,
	"sha1":   hashlib.sha1,
	"sha224": hashlib.sha224,
	"sha256": hashlib.sha256,
	"sha384": hashlib.sha384,
	"sha512": hashlib.sha512,
}

# The only values FileChecksum.SetType accepts as a match type.
matchTypes = ["complete", "possible", "none"]
 
 
class IllegalMatchType(Exception):
	"""Raised when a match type that is not listed in matchTypes is requested."""
 
class FileChecksum:
	"""
	One hashed file: its path, its checksum, its size and its match type.

	The match type starts out as "none" and can be changed to any name
	listed in matchTypes. Instances are ordered by match type, so sorting
	a list of them groups the entries of each type together.
	"""

	def __init__(self, filename, checksum):
		"""
		filename -- path of the file; its size is looked up with
		            os.path.getsize, which raises OSError when the file
		            does not exist or cannot be accessed
		checksum -- checksum computed by the caller, stored unchanged
		"""
		self.filename  = filename
		self.checksum  = checksum
		self.size      = os.path.getsize(filename)
		self.matchtype = "none"

	def SetType(self, newType):
		"""
		Set the match type to newType.

		Raises IllegalMatchType when newType is not listed in matchTypes.
		"""
		if newType not in matchTypes:
			raise IllegalMatchType("%s is not a legal type. Legal types are: %s" % (newType, ", ".join(matchTypes)))
		self.matchtype = newType

	def SetPossibleMatch(self):
		"""Mark this file as a "possible" match."""
		self.SetType("possible")

	def SetExactMatch(self):
		"""Mark this file as a "complete" match."""
		self.SetType("complete")

	def __cmp__(self, y):
		"""
		Three-way comparison by match type: returns -1, 0 or 1.

		Written without the cmp() builtin, which Python 3 does not have.
		"""
		return (self.matchtype > y.matchtype) - (self.matchtype < y.matchtype)

	def __lt__(self, y):
		"""
		Order by match type.

		Python 3 never calls __cmp__, so without this method sorting a
		list of instances raises TypeError there.
		"""
		return self.matchtype < y.matchtype
 
def usage(name = sys.argv[0]):
	"""
	Print the command line help to standard output.

	name -- program name shown on the "Usage:" line; defaults to the value
	        of sys.argv[0] at the time this function was defined

	The help ends with the names of all entries in hashFunctions, followed
	by an empty line.
	"""
	helpLines = [
		"Usage: %s [OPTIONS]" % name,
		"Options:",
		"-h  --help              Show this help.",
		"-p  --path PATH         Walk a particular directory tree. The default is '.'.",
		"-v                      Be verbose.",
		"-l  --include-symlinks  Resolve links and treat them as separate files. ",
		"                        This is useful for finding all paths to the same file too.",
		"                        This option is turned off by default.",
		"-e  --examine LENGTH    Uses only the first LENGTH bytes to compute the checksum.",
		"-i  --ignore-io-errors  Don't print file IO errors.",
		"-s                      Include sizes in the result output.",
		"-f  --hash-func FUNC    A named hash function to use.",
		"Available hash functions:",
		"  " + "\n  ".join(hashFunctions.keys()),
		"",
	]
	# One print of the joined lines writes the same text as one print per
	# line; the trailing "" entry produces the closing empty line.
	print("\n".join(helpLines))
 
# Parse the command line. A trailing ":" (short form) or "=" (long form)
# marks an option that takes an argument. When getopt rejects the command
# line, print its message and the help, then exit with status 2.
shortOptions = "hp:f:vlise:"
longOptions  = [
	"help",
	"path=",
	"include-symlinks",
	"ignore-io-errors",
	"hash-func=",
	"examine=",
]

try:
	opts, args = getopt.getopt(sys.argv[1:], shortOptions, longOptions)
except getopt.GetoptError as err:
	print(str(err))
	usage()
	sys.exit(2)
 
 
 
# Defaults; each can be overridden by a command line option below.
path          = "."          # root of the directory tree to walk
verbose       = 0            # incremented once for every -v given
showSizes     = False        # -s: include file sizes in the report
maxlength     = -1           # -e: bytes to hash per file; -1 hashes the whole file
traverseLinks = False        # -l: also hash files reached through symlinks
hashFunc      = hashlib.md5  # -f: hash constructor taken from hashFunctions
ignoreIOError = False        # -i: do not print file IO errors

for name, value in opts:
	if name == "-v":
		verbose += 1

	elif name == "-s":
		showSizes = True

	elif name in ("-h", "--help"):
		usage()
		sys.exit()

	elif name in ("-p", "--path"):
		path = value

	elif name in ("-i", "--ignore-io-errors"):
		ignoreIOError = True

	# getopt reports a long option under its full declared name, which is
	# "--hash-func". Testing for "--hash" never matched, so the long form
	# ended up in the "unhandled option" branch below.
	elif name in ("-f", "--hash-func"):
		if value not in hashFunctions:
			print("%s is not the name of a valid hash function!" % value)
			usage()
			sys.exit(4)
		hashFunc = hashFunctions[value]

	elif name in ("-e", "--examine"):
		try:
			maxlength = int(value)
		except ValueError:
			print("%s is not a valid integer!" % value)
			usage()
			sys.exit(3)

	elif name in ("-l", "--include-symlinks"):
		traverseLinks = True

	else:
		# Reaching this means an option was declared to getopt without a
		# branch here. Raise explicitly: an assert statement would be
		# stripped when Python runs with -O.
		raise AssertionError("unhandled option")
 
 
def _hashFile(fd, hashFactory, limit, chunkSize = 1024 * 1024):
	"""
	Hash the contents of an open binary file in fixed-size chunks.

	fd          -- file object opened for binary reading
	hashFactory -- hash constructor such as hashlib.md5
	limit       -- maximum number of bytes to hash; negative means no limit
	chunkSize   -- number of bytes requested per read

	Returns a (hexdigest, bytesHashed) tuple. Feeding the hash chunk by
	chunk gives the same digest as hashing all of the data at once, but
	never holds more than chunkSize bytes of the file in memory.
	"""
	hasher = hashFactory()
	hashed = 0
	while limit < 0 or hashed < limit:
		wanted = chunkSize
		if limit >= 0:
			wanted = min(chunkSize, limit - hashed)
		chunk = fd.read(wanted)
		if not chunk:
			break
		hasher.update(chunk)
		hashed += len(chunk)
	return hasher.hexdigest(), hashed


knownFileHashes   = {}  # checksum -> list of every FileChecksum seen with it
matchedFileHashes = {}  # checksum -> that same list, once it holds two or more files

for dirpath, dirnames, filenames in os.walk(path):
	for filename in filenames:
		fullpath = os.path.join(dirpath, filename)

		# isfile() is already False for a path that does not exist, so no
		# separate exists() test is needed.
		if not os.path.isfile(fullpath):
			continue
		if os.path.islink(fullpath) and not traverseLinks:
			continue

		if verbose:
			print("Checking %s" % fullpath)

		try:
			fd = open(fullpath, "rb")
		except (IOError, OSError) as err:
			if not ignoreIOError:
				sys.stderr.write("Failed to open %s: %s\n" % (fullpath, err.strerror or err))
			continue

		# A failure while reading, or while FileChecksum looks up the file
		# size, skips this file instead of aborting the whole scan. The
		# file is closed whether or not reading succeeds.
		try:
			try:
				checksum, datalen = _hashFile(fd, hashFunc, maxlength)
			finally:
				fd.close()
			match = FileChecksum(fullpath, checksum)
		except (IOError, OSError) as err:
			if not ignoreIOError:
				sys.stderr.write("Failed to read %s: %s\n" % (fullpath, err.strerror or err))
			continue

		# The checksum describes the entire file when no length limit was
		# given or when the limit did not cut the file short; otherwise
		# only the hashed prefix is known to be equal.
		if maxlength == -1 or datalen == match.size:
			match.SetExactMatch()
		else:
			match.SetPossibleMatch()

		if checksum not in knownFileHashes:
			knownFileHashes[checksum] = [match]
			continue

		sameChecksum = knownFileHashes[checksum]
		if verbose:
			print("%s matches %d other files" % (fullpath, len(sameChecksum)))
		sameChecksum.append(match)
		# Keep entries grouped by match type. The sort is stable, so
		# files of the same type stay in the order they were found.
		sameChecksum.sort(key = lambda entry: entry.matchtype)
		matchedFileHashes[checksum] = sameChecksum
 
 
# Report every checksum that was seen on more than one file. The files of
# a checksum are listed under a heading that is printed each time the
# match type changes from one entry to the next.
if not matchedFileHashes:
	print("There are no duplicates.")
else:
	print("The following files are duplicates:")
	for checksum in matchedFileHashes:
		print("Checksum: %s" % checksum)
		currentHeading = "none"
		for entry in matchedFileHashes[checksum]:
			if entry.matchtype != currentHeading:
				currentHeading = entry.matchtype
				print("  %s matches:" % currentHeading.capitalize())
			line = "    %s" % entry.filename
			if showSizes:
				line = "    %s (%d bytes)" % (entry.filename, entry.size)
			print(line)
 
 
 
 
 
 
RSS
Powered by Debian, Guinness, and excessive quantities of caffeine and sugar.