同步操作将从 mktime/python-learn 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
#!/usr/bin/env python2
"""Find files that are probably duplicated inside a directory tree.

date: 2015-07-16 12:56:27
author: withrock
mail: withfaker@gmail.com

desc: this tool looks for content-repeated files in a directory tree.
It does not hash file contents by default, because computing MD5 over big
files is slow. Instead `calc_hash` builds a cheap fingerprint from the file
size and the file name only, so the reported groups are *candidates* for
duplication, not proven duplicates. `calc_hash_slow` is provided for callers
that want a real content hash. Suggestions for a faster content hash are
welcome.

The code is written to run unchanged on both Python 2 and Python 3.
"""
import os
import os.path
from os.path import isfile, isdir, join, getsize, basename
from sys import argv, exit
import getopt
from hashlib import md5

# Unit labels used by pretty_size, smallest first; each step is a factor 1024.
_SIZE_UNITS = ("Bytes", "Kibs", "Mibs", "Gibs", "Tibs")


def calc_hash(filepath, size):
    """Return a cheap fingerprint built from the file size and base name.

    The file content is never read: two files with the same name and the same
    size get the same fingerprint regardless of the directory they live in.

    :param filepath: path of the file; only its base name is used.
    :param size: file size in bytes.
    :return: hexadecimal MD5 digest of the text "<size>-<basename>".
    """
    data = str(size) + "-" + basename(filepath)
    # hashlib only accepts bytes. On Python 2 a plain str already is bytes;
    # text (Python 3 str / Python 2 unicode) has to be encoded first.
    # "surrogateescape" keeps undecodable file names hashable on Python 3.
    if not isinstance(data, bytes):
        data = data.encode("utf-8", "surrogateescape")
    return md5(data).hexdigest()


def calc_hash_slow(filepath, size):
    """Return the MD5 digest of the full content of *filepath*.

    :param filepath: path of the file to read.
    :param size: unused; kept so the signature matches `calc_hash`.
    :return: hexadecimal MD5 digest of the file content.
    :raises IOError/OSError: if the file cannot be opened or read.
    """
    buffer_size = 1024 * 1024 * 2
    digest = md5()
    # The context manager closes the handle even when a read fails.
    with open(filepath, "rb") as handle:
        for chunk in iter(lambda: handle.read(buffer_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def pretty_size(size):
    """Format a byte count as a string with two decimals and a unit label.

    Values below 1024 are shown in "Bytes"; larger values are divided by 1024
    per unit step up to "Tibs", which is also used for anything bigger.
    """
    value = float(size)
    for unit in _SIZE_UNITS[:-1]:
        if value < 1024.0:
            return "%.2f %s" % (value, unit)
        value /= 1024.0
    return "%.2f %s" % (value, _SIZE_UNITS[-1])


# Module-level result table filled by insert_data:
# {
#     hash1 --> [(path, filesize), ...],
#     hash2 --> [(path, filesize), ...],
#     ...
# }
data_set = {}


def insert_data(data):
    """Record one file entry in `data_set`.

    :param data: dict of the form {'hash': h, h: (path, pretty_size)}; the
        entry stored under the key named by data['hash'] is appended to the
        list kept for that hash.
    """
    _hash = data['hash']
    data_set.setdefault(_hash, []).append(data[_hash])


def analyze_data():
    """Print every group in `data_set` that holds more than one file."""
    for _hash in data_set:
        entries = data_set[_hash]
        if len(entries) > 1:
            print("-" * 40)
            for path, size in entries:
                # Single pre-formatted argument: prints the same text on
                # Python 2 (print statement) and Python 3 (print function).
                print("\t%s (%s)" % (path, size))


def find_repeat(p, big_size):
    """Scan the tree rooted at *p* and register files larger than *big_size*.

    Every regular file whose size is strictly greater than *big_size* is
    fingerprinted with `calc_hash` and stored through `insert_data`.
    Unreadable directories and files that vanish during the scan are skipped.

    :param p: directory to scan; a path that is not a directory yields nothing.
    :param big_size: size threshold in bytes (exclusive).
    """
    # os.walk is iterative (no recursion limit), skips unreadable directories
    # and does not descend into symlinked directories, which avoids cycles.
    for root, _dirs, files in os.walk(p):
        for name in files:
            fp = join(root, name)
            if not isfile(fp):
                # Broken symlinks, sockets, fifos...
                continue
            try:
                file_size = getsize(fp)
            except OSError:
                continue
            if file_size > big_size:
                _hash = calc_hash(fp, file_size)
                _size = pretty_size(file_size)
                insert_data({'hash': _hash, _hash: (fp, _size)})


def usage(status=0):
    """Print the command-line help and exit with *status* (default 0)."""
    print('''python find_repeat.py [options]
    -d, --dir       the dir to find
    -m, --minsize   find size large than minsize
    -h, --help      show help
    -v, --version   show version''')
    exit(status)


def main(args=None):
    """Parse command-line options, run the scan and print the report.

    :param args: option list without the program name; defaults to
        sys.argv[1:].
    """
    if args is None:
        args = argv[1:]
    # None (not "") marks "not given", so that "-m 0" is a valid threshold.
    big_size = None
    query_dir = ""
    try:
        # "version" must be listed here, otherwise --version is rejected
        # before the loop below can handle it.
        options, _rest = getopt.getopt(
            args, "hvd:m:", ["help", "version", "dir=", "minsize="])
    except getopt.GetoptError:
        usage(2)
    for o, v in options:
        if o in ("-h", "--help"):
            usage()
        if o in ("-v", "--version"):
            print("0.0.1 --by withrock")
            exit(0)
        if o in ("-d", "--dir"):
            query_dir = v
            if not isdir(query_dir):
                print("dir invalid.")
                usage(2)
        if o in ("-m", "--minsize"):
            try:
                big_size = int(v)
            except ValueError:
                big_size = None
            if big_size is None or big_size < 0:
                print("minsize invalid.")
                usage(2)
    if big_size is None or not query_dir:
        usage(2)
    find_repeat(query_dir, big_size)
    analyze_data()


if __name__ == '__main__':
    main()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。