#!/usr/bin/python
"""Replace duplicate files in a directory tree with hard links.

Every regular file that is non-empty and smaller than MAX_CHECK_SIZE is
hashed with MD5.  The first file seen for a given (digest, mode, uid, gid)
combination is hard-linked into a base directory; every later file with
the same combination is replaced by a hard link to that base entry.

Usage: script [DIRECTORY]   (DIRECTORY defaults to ".")
"""
import errno
import glob
import hashlib
import mmap
import os
import stat
import sys

# Files of this size in bytes (1 GiB) or larger are never hashed or linked.
MAX_CHECK_SIZE = 1073741824


class Kechikechi(object):
    """Walk directories and hard-link files with identical content.

    Attributes:
        baseDir: directory that holds the canonical copy of each file.
        fileMap: empty dict created by the constructor; not used by any
            method of this class.
        totalReduce: sum of the sizes of all files replaced by links.
        linkedCnt: number of files replaced by links, plus one.
        totalCnt: number of regular files examined, plus one.
    """

    def __init__(self, baseDir="kechikechibase"):
        """Create a scanner storing canonical copies under *baseDir*."""
        self.baseDir = baseDir
        self.fileMap = {}
        self.totalReduce = 0
        # Both counters start at 1; subtract 1 to get the real totals.
        self.linkedCnt = 1
        self.totalCnt = 1

    def __getFileMD5(self, filename):
        """Return an MD5 hash object computed over the content of *filename*.

        Raises:
            IOError/OSError: if the file cannot be opened or mapped; a
                "not found" message is printed before re-raising.
        """
        size = os.stat(filename)[stat.ST_SIZE]
        try:
            f = open(filename, "rb")
            try:
                if size == 0:
                    # mmap refuses to map an empty file.
                    return hashlib.md5()
                content = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
                try:
                    return hashlib.md5(content)
                finally:
                    content.close()
            finally:
                f.close()
        except IOError:
            print("not found %s" % filename)
            raise

    def __getBaseFilename(self, filename, digest, st):
        """Return the existing base entry for *filename*, or register one.

        The base path is built from the hex digest plus the mode, uid and
        gid taken from *st*, so only files agreeing on all four share an
        entry.

        Returns:
            The base path if it already existed; None if *filename* was
            just linked in as the new base entry (or could not be linked).
        """
        md5hex = digest.hexdigest()
        key = "%s%d_%d_%d" % (md5hex, st[stat.ST_MODE],
                              st[stat.ST_UID], st[stat.ST_GID])
        baseFilename = "%s/%s/%s/%s" % (self.baseDir, md5hex[0:2],
                                        md5hex[2:4], key)
        if os.path.exists(baseFilename):
            return baseFilename
        try:
            os.link(filename, baseFilename)
        except OSError as e:
            # ENOENT normally means the bucket directory is missing:
            # create it and retry.  Any other link failure is ignored and
            # the file is simply left unregistered.
            if e.errno == errno.ENOENT:
                bucket = os.path.dirname(baseFilename)
                if not os.path.isdir(bucket):
                    os.makedirs(bucket)
                os.link(filename, baseFilename)
        return None

    def __createLink(self, base, filename, tmp):
        """Replace *filename* with a hard link to *base*.

        The link is first created as "<filename>.<tmp>" and then renamed
        over *filename*; os.rename replaces an existing destination
        atomically on POSIX, so *filename* never disappears.
        """
        tmpName = "%s.%s" % (filename, tmp)
        os.link(base, tmpName)
        try:
            os.rename(tmpName, filename)
        except OSError:
            # Do not leave the temporary link behind.
            if os.path.lexists(tmpName):
                os.remove(tmpName)
            raise

    def __makeFileMap(self, filename):
        """Process one regular file: register it or replace it by a link."""
        self.totalCnt += 1
        st = os.stat(filename)
        size = st[stat.ST_SIZE]
        if size == 0 or size >= MAX_CHECK_SIZE:
            return
        md5digest = self.__getFileMD5(filename)
        baseFilename = self.__getBaseFilename(filename, md5digest, st)
        if not baseFilename:
            return
        # The result ("=", "OK" or "Error") is appended to this same line.
        # The doubled space reproduces the output of the former Python 2
        # 'print "... ==> ",' statement.
        sys.stdout.write("%d / %d: %s (%d : %d) ==>  "
                         % (self.linkedCnt, self.totalCnt, filename, size,
                            self.totalReduce))
        if os.path.samefile(filename, baseFilename):
            # Already a link to the base entry.
            print("=")
            return
        try:
            self.__createLink(baseFilename, filename, md5digest.hexdigest())
        except Exception:
            print("Error")
            raise
        print("OK")
        self.totalReduce += size
        self.linkedCnt += 1

    def makeMap(self, dirname):
        """Recursively process every non-hidden entry below *dirname*.

        Symbolic links are skipped, directories are descended into and
        regular files are hashed and linked.  Entries whose name starts
        with "." are ignored.
        """
        print("Directory: %s" % dirname)
        try:
            names = os.listdir(dirname)
        except OSError:
            # A missing or unreadable directory yields no entries.
            names = []
        for name in names:
            if name.startswith("."):
                continue
            filename = os.path.join(dirname, name)
            if os.path.islink(filename):
                continue
            if os.path.isdir(filename):
                self.makeMap(filename)
            elif os.path.isfile(filename):
                self.__makeFileMap(filename)


def main():
    """Scan the directory given as first argument (default ".")."""
    target = sys.argv[1] if len(sys.argv) > 1 else "."
    ic = Kechikechi()
    ic.makeMap(target)
    # Both counters start at 1, hence the "- 1".
    print("Total scaned files: %d files" % (ic.totalCnt - 1))
    print("Total reduced files: %d files, %d bytes"
          % (ic.linkedCnt - 1, ic.totalReduce))


if __name__ == '__main__':
    main()