File information | |||
---|---|---|---|
Filename: | hashduplicates.py | Uploaded: | Fri, 18th Sep 2009 00:02:00 |
Size (bytes): | 5.5 KiB | md5 checksum: | 26571bbef98d2ed9e7b0766e5fdc3542 |
Uploader | doug | Download: | hashduplicates.py |
Description: |
Find files with duplicate checksums in a named directory tree, using a selectable hash function. Useful for finding files with identical contents across all of your file system in a fast and relatively reliable way. I say "relatively reliable" because it is possible for two files with different contents to have the same checksum. Were the tool to compare files byte for byte, walking your entire file system would take considerably longer, unless you had a lot more RAM than any one person should have or your file system is tiny. |
#!/usr/bin/env python
"""
Hash duplication finder, hashduplicates.py
Copyright (C) 2009 Douglas Lawrie

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""

import os
import hashlib
import sys
import getopt
import stat  # unused here; kept because it was part of the original import list

# Hash constructors selectable with -f / --hash-func, keyed by their CLI name.
hashFunctions = {"md5": hashlib.md5,
                 "sha1": hashlib.sha1,
                 "sha224": hashlib.sha224,
                 "sha256": hashlib.sha256,
                 "sha384": hashlib.sha384,
                 "sha512": hashlib.sha512}

# Legal values for FileChecksum.matchtype.
matchTypes = ["complete", "possible", "none"]

# Bytes requested per read() while hashing, so a large file is never held
# in memory in one piece.
CHUNK_SIZE = 1024 * 1024


class IllegalMatchType(Exception):
    """Raised by FileChecksum.SetType for a value not listed in matchTypes."""
    pass


class FileChecksum:
    """A file path with its checksum, its size on disk and its match type.

    The match type starts as "none" and is changed through SetType,
    SetPossibleMatch or SetExactMatch.  Instances order by match type
    string, which places "complete" entries before "possible" ones.
    """

    def __init__(self, filename, checksum):
        """Record filename and checksum and look up the file's size.

        Raises whatever os.path.getsize raises when the file cannot be
        examined.
        """
        self.filename = filename
        self.checksum = checksum
        self.size = os.path.getsize(filename)
        self.matchtype = "none"

    def SetType(self, newType):
        """Set the match type; raise IllegalMatchType if it is not legal."""
        if newType not in matchTypes:
            raise IllegalMatchType("%s is not a legal type. Legal types are: %s"
                                   % (newType, ", ".join(matchTypes)))
        self.matchtype = newType

    def SetPossibleMatch(self):
        """Mark this file as matching only on the examined prefix."""
        self.SetType("possible")

    def SetExactMatch(self):
        """Mark this file as matching on its complete contents."""
        self.SetType("complete")

    def __cmp__(self, y):
        """Python 2 ordering hook: compare by match type string."""
        # Spelled without cmp(), which does not exist on Python 3.
        return (self.matchtype > y.matchtype) - (self.matchtype < y.matchtype)

    def __lt__(self, y):
        """Ordering hook used by sort(); same ordering as __cmp__."""
        return self.matchtype < y.matchtype


def usage(name = sys.argv[0]):
    """Print the command line help, including the available hash names."""
    print("Usage: %s [OPTIONS]" % name)
    print("Options:")
    print("-h --help Show this help.")
    print("-p --path PATH Walk a particular directory tree. The default is '.'.")
    print("-v Be verbose.")
    print("-l --include-symlinks Resolve links and treat them as separate files. ")
    print(" This is useful for finding all paths to the same file too.")
    print(" This option is turned off by default.")
    print("-e --examine LENGTH Uses only the first LENGTH bytes to compute the checksum.")
    print("-i --ignore-io-errors Don't print file IO errors.")
    print("-s Include sizes in the result output.")
    print("-f --hash-func FUNC A named hash function to use.")
    print("Available hash functions:")
    # Sorted so the help text does not depend on dictionary ordering.
    print(" " + "\n ".join(sorted(hashFunctions)))
    print("")


def parseOptions(argv):
    """Parse command line arguments (without the program name).

    Returns a dict with the keys "path", "verbose", "showSizes",
    "maxlength", "traverseLinks", "hashFunc" and "ignoreIOError".

    Exits the process with status 2 on an unknown option, 3 on a
    non-integer --examine value and 4 on an unknown hash function name;
    -h/--help prints the usage text and exits with status 0.
    """
    try:
        opts, args = getopt.getopt(argv, "hp:f:vlise:",
                                   ["help", "path=", "include-symlinks",
                                    "ignore-io-errors", "hash-func=",
                                    "examine="])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(2)

    settings = {"path": ".",
                "verbose": 0,
                "showSizes": False,
                "maxlength": -1,
                "traverseLinks": False,
                "hashFunc": hashlib.md5,
                "ignoreIOError": False}

    for name, value in opts:
        if name == "-v":
            settings["verbose"] += 1
        elif name == "-s":
            settings["showSizes"] = True
        elif name in ("-h", "--help"):
            usage()
            sys.exit()
        elif name in ("-p", "--path"):
            settings["path"] = value
        elif name in ("-i", "--ignore-io-errors"):
            settings["ignoreIOError"] = True
        # getopt reports long options by their full declared name, so the
        # name to test for is "--hash-func" (the original tested "--hash",
        # which never matched and sent the option to the fallback below).
        elif name in ("-f", "--hash-func"):
            if value not in hashFunctions:
                print("%s is not the name of a valid hash function!" % value)
                usage()
                sys.exit(4)
            settings["hashFunc"] = hashFunctions[value]
        elif name in ("-e", "--examine"):
            try:
                settings["maxlength"] = int(value)
            except ValueError:
                print("%s is not a valid integer!" % value)
                usage()
                sys.exit(3)
        elif name in ("-l", "--include-symlinks"):
            settings["traverseLinks"] = True
        else:
            # Defensive: every option declared to getopt is handled above.
            raise AssertionError("unhandled option")

    return settings


def hashFile(fd, hashFunc, maxlength=-1):
    """Hash the contents of an open binary file object in chunks.

    fd        -- file-like object opened for binary reading
    hashFunc  -- hashlib-style constructor (called with no arguments)
    maxlength -- hash at most this many bytes; a negative value means
                 the whole file

    Returns a tuple (hexdigest, number of bytes hashed).
    """
    digest = hashFunc()
    total = 0
    while maxlength < 0 or total < maxlength:
        if maxlength < 0:
            want = CHUNK_SIZE
        else:
            want = min(CHUNK_SIZE, maxlength - total)
        chunk = fd.read(want)
        if not chunk:
            break
        digest.update(chunk)
        total += len(chunk)
    return digest.hexdigest(), total


def findDuplicates(path, hashFunc=hashlib.md5, maxlength=-1,
                   traverseLinks=False, ignoreIOError=False, verbose=0):
    """Walk path and group regular files that share a checksum.

    path          -- root of the directory tree to walk
    hashFunc      -- hashlib-style constructor used for the checksums
    maxlength     -- only the first maxlength bytes are hashed; negative
                     means whole files
    traverseLinks -- when true, symbolic links to files are hashed too
    ignoreIOError -- when true, open/read failures are not reported
    verbose       -- when non-zero, progress is printed

    Returns a dict mapping checksum to a list of FileChecksum objects,
    containing only checksums shared by two or more files.  Each list is
    ordered by match type, keeping discovery order within a type.  Files
    that cannot be opened or read are skipped.
    """
    knownFileHashes = {}
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in filenames:
            fullpath = os.path.join(dirpath, filename)
            if not os.path.isfile(fullpath):
                continue
            if os.path.islink(fullpath) and not traverseLinks:
                continue
            if verbose:
                print("Checking %s" % fullpath)

            try:
                fd = open(fullpath, "rb")
            except EnvironmentError as err:
                if not ignoreIOError:
                    sys.stderr.write("Failed to open %s: %s\n"
                                     % (fullpath, err.strerror or err))
                continue

            try:
                # The with block closes the handle even if reading fails.
                with fd:
                    checksum, datalen = hashFile(fd, hashFunc, maxlength)
                match = FileChecksum(fullpath, checksum)
            except EnvironmentError as err:
                if not ignoreIOError:
                    sys.stderr.write("Failed to read %s: %s\n"
                                     % (fullpath, err.strerror or err))
                continue

            # The match is exact when the whole file went into the checksum.
            if maxlength < 0 or datalen == match.size:
                match.SetExactMatch()
            else:
                match.SetPossibleMatch()

            others = knownFileHashes.setdefault(checksum, [])
            if others and verbose:
                print("%s matches %d other files" % (fullpath, len(others)))
            others.append(match)

    duplicates = {}
    for checksum, matches in knownFileHashes.items():
        if len(matches) > 1:
            # A single stable sort gives the same order as re-sorting after
            # every append.
            matches.sort(key=lambda entry: entry.matchtype)
            duplicates[checksum] = matches
    return duplicates


def printReport(duplicates, showSizes=False):
    """Print the duplicate groups returned by findDuplicates.

    duplicates -- dict mapping checksum to a list of FileChecksum objects
    showSizes  -- when true, each file name is followed by its size
    """
    if not duplicates:
        print("There are no duplicates.")
        return

    print("The following files are duplicates:")
    for checksum, matches in duplicates.items():
        print("Checksum: %s" % (checksum))
        lastMatchType = "none"
        for match in matches:
            # Print a heading each time the match type changes.
            if match.matchtype != lastMatchType:
                print(" %s matches:" % (match.matchtype.capitalize()))
                lastMatchType = match.matchtype
            if showSizes:
                print(" %s (%d bytes)" % (match.filename, match.size))
            else:
                print(" %s" % (match.filename))


def main(argv=None):
    """Command line entry point; argv defaults to sys.argv[1:]."""
    if argv is None:
        argv = sys.argv[1:]
    settings = parseOptions(argv)
    duplicates = findDuplicates(settings["path"],
                                settings["hashFunc"],
                                settings["maxlength"],
                                settings["traverseLinks"],
                                settings["ignoreIOError"],
                                settings["verbose"])
    printReport(duplicates, settings["showSizes"])


if __name__ == "__main__":
    main()