#!/usr/bin/python
"""Replace duplicate files under a directory tree with hard links.

Usage: pass the directory to scan as the first command-line argument; the
current directory is scanned when no argument is given.

Two files are treated as duplicates when they have the same MD5 digest,
size, permission mode, owner, group and device.  Contents are NOT compared
byte-for-byte, so the result is only as trustworthy as MD5 is for the data
being scanned.

The code runs unchanged on Python 2.6+ and Python 3.
"""
# NOTE: glob, mmap and stat are no longer used below; the imports are kept
# so that anything relying on them as attributes of this module still works.
import glob
import hashlib
import mmap
import os
import stat
import subprocess
import sys

# Files of this size (in bytes) or larger are never hashed or linked.
MAX_CHECK_SIZE = 1073741824

# Number of bytes read per step while hashing a file.
_READ_CHUNK = 1024 * 1024


def _hardlink(existing, new):
    """Create the hard link *new* pointing at the file *existing*.

    Uses os.link when the interpreter provides it.  Python 2 on Windows has
    no os.link, so there the ``fsutil hardlink create`` command is run
    instead.

    Raises:
        OSError: if the link could not be created.  The fsutil fallback
            checks the exit status, so a failed command is never mistaken
            for success.
    """
    link = getattr(os, "link", None)
    if link is not None:
        link(existing, new)
        return
    # An argument list (no shell) keeps quotes or other shell metacharacters
    # in file names from being interpreted.
    with open(os.devnull, "w") as devnull:
        status = subprocess.call(
            ["fsutil", "hardlink", "create", new, existing],
            stdout=devnull, stderr=devnull)
    if status != 0:
        raise OSError("fsutil hardlink create failed with status %d: %s"
                      % (status, new))


def _samefile(first, second):
    """Return True when both paths refer to the same file.

    Where os.path.samefile does not exist (Python 2 on Windows) the answer
    cannot be determined and False is returned.
    """
    samefile = getattr(os.path, "samefile", None)
    if samefile is None:
        return False
    return samefile(first, second)


class Kechikechi(object):
    """Scan directory trees and hard-link files that duplicate earlier ones.

    Attributes:
        fileMap: maps a duplicate-detection key to the first file seen
            with that key.
        totalReduce: total size in bytes of the files replaced by links.
        linkedCnt: 1-based ordinal of the next link to be made, i.e. the
            number of links created so far plus one.
        totalCnt: number of regular files examined so far.
    """

    def __init__(self):
        self.fileMap = {}
        self.totalReduce = 0
        # Deliberately 1-based: it is shown as the ordinal of the link in
        # progress, and main() subtracts one for the final report.
        self.linkedCnt = 1
        self.totalCnt = 0

    def __getFileMD5(self, filename):
        """Return a hashlib MD5 object fed with the contents of *filename*.

        The file is read in fixed-size chunks, so memory use does not grow
        with the file size.

        Raises:
            IOError/OSError: if the file cannot be opened or read.
        """
        digest = hashlib.md5()
        with open(filename, "rb") as stream:
            for chunk in iter(lambda: stream.read(_READ_CHUNK), b""):
                digest.update(chunk)
        return digest

    def __getBaseFilename(self, filename, digest, st):
        """Return the file already registered as identical to *filename*.

        When no such file exists, *filename* is registered as the base for
        later duplicates and None is returned.

        Args:
            filename: path of the file being examined.
            digest: hash object holding the digest of the file contents.
            st: os.stat() result for the file.
        """
        # st_dev is part of the key because hard links cannot cross
        # devices; st_size cheaply narrows what counts as identical.
        key = (digest.digest(), st.st_mode, st.st_uid, st.st_gid,
               st.st_size, st.st_dev)
        if key in self.fileMap:
            return self.fileMap[key]
        self.fileMap[key] = filename
        return None

    def __createLink(self, base, filename, tmp):
        """Replace *filename* with a hard link to *base*.

        The link is first created as "<filename>.<tmp>" and then moved over
        *filename*, so *filename* is untouched when linking fails.

        Raises:
            OSError: if the link cannot be created or moved into place.
        """
        tmpname = "%s.%s" % (filename, tmp)
        _hardlink(base, tmpname)
        try:
            if hasattr(os, "replace"):
                os.replace(tmpname, filename)
            elif os.name == "nt":
                # Without os.replace, rename on Windows refuses to
                # overwrite an existing file.
                os.remove(filename)
                os.rename(tmpname, filename)
            else:
                # POSIX rename replaces the target atomically.
                os.rename(tmpname, filename)
        except OSError:
            # Drop the stray temporary link, but only while the original is
            # still in place; otherwise the temporary link is the one
            # remaining copy at this location.
            if os.path.lexists(filename):
                try:
                    os.remove(tmpname)
                except OSError:
                    pass
            raise

    def __makeFileMap(self, filename):
        """Examine one regular file and link it to an earlier duplicate.

        Empty files and files of MAX_CHECK_SIZE bytes or more are counted
        but otherwise ignored.  Files that cannot be read or linked are
        reported on stdout and skipped, so one bad file does not end the
        scan.
        """
        self.totalCnt += 1
        try:
            st = os.stat(filename)
        except OSError as e:
            print("cannot stat %s: %s" % (filename, e))
            return
        size = st.st_size
        if size == 0 or size >= MAX_CHECK_SIZE:
            return

        try:
            md5digest = self.__getFileMD5(filename)
        except (IOError, OSError) as e:
            print("cannot read %s: %s" % (filename, e))
            return

        baseFilename = self.__getBaseFilename(filename, md5digest, st)
        if not baseFilename:
            return

        print("%d / %d: %s (%d : %d)" % (self.linkedCnt, self.totalCnt,
                                         filename, size, self.totalReduce))
        # No newline here: the outcome ("=", "OK" or "Error: ...") is
        # appended to the same output line.
        sys.stdout.write(" ==> %s  " % baseFilename)
        try:
            if _samefile(filename, baseFilename):
                print("=")
                return
            self.__createLink(baseFilename, filename, md5digest.hexdigest())
        except (IOError, OSError) as e:
            print("Error: %s" % e)
            return
        print("OK")
        self.totalReduce += size
        self.linkedCnt += 1

    def makeMap(self, dirname):
        """Recursively scan *dirname*, hard-linking duplicate files.

        Symbolic links are never followed or replaced.  Entries whose name
        starts with "." are skipped.  A directory that cannot be listed is
        reported on stdout and skipped.
        """
        print("Directory: %s" % dirname)
        try:
            # listdir (not glob) so that "[", "]", "*" and "?" in directory
            # names are taken literally; sorted for a repeatable order.
            names = sorted(os.listdir(dirname))
        except OSError as e:
            print("cannot list %s: %s" % (dirname, e))
            return
        for name in names:
            if name.startswith("."):
                continue
            path = os.path.join(dirname, name)
            if os.path.islink(path):
                continue
            if os.path.isdir(path):
                self.makeMap(path)
            elif os.path.isfile(path):
                self.__makeFileMap(path)


def main():
    """Scan the directory named on the command line and print a summary.

    The current directory is scanned when no argument is given.
    """
    ic = Kechikechi()
    # Explicit length check: wrapping the scan in "except IndexError" would
    # also trap unrelated IndexErrors raised while scanning.
    if len(sys.argv) > 1:
        root = sys.argv[1]
    else:
        root = "."
    ic.makeMap(root)
    print("Total scaned files: %d files" % ic.totalCnt)
    print("Total reduced files: %d files, %d bytes"
          % (ic.linkedCnt - 1, ic.totalReduce))


if __name__ == '__main__':
    main()