#!/usr/bin/env python
# $Id: de_dupe.py,v 1.36 2014/09/03 22:43:45 jdeifik Exp $
# de_dupe.py - deduplicates a filesystem, based on duplicate files
# Copyright Jeff turbo Deifik 2010.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import print_function

desc = [
    'Name: de_dupe - takes one or more search directories.',
    'Looks for files in the directories that are duplicates and',
    'replaces the duplicates with hard links.',
    'example: de_dupe.py --total --minsize=1000000 foo-dir...'
]

import os
import sys
import stat
from optparse import OptionParser
import filecmp
import jefflib


# Comparison function used by jefflib.Uniq: orders (name, inode) tuples
# by their second field, the inode number.
def tuple_cmp(a, b):
    if a[1] > b[1]:
        return 1
    elif a[1] < b[1]:
        return -1
    else:
        return 0


# Looks through source, trying to find a match inside of it.
# If found, temporarily rename the file, make a hard link, then delete
# the renamed file.
def look_through_one_table(source):
    bytes_saved = 0
    # Used for logging progress
    progress_ind.Reset()
    print('\nLooking for duplicates')
    if Options.big:
        sorted_list = list(sorted(source.keys(), reverse=True))
    else:
        sorted_list = list(sorted(source.keys()))
    # Loop through each distinct file size
    for size in sorted_list:
        source_files = source[size]     # List of all files with the same size
        # Sort by inode
        s_source_files = sorted(source_files, key=lambda foo: foo[1])
        # Uniqify based on inode
        u_source_files = jefflib.Uniq(s_source_files, cmp=tuple_cmp)
        if len(u_source_files) < 2:     # Can't match if less than 2 files
            continue
        # Compute md5sum on the first block of each file
        m_source_files = md5sum_s(u_source_files)
        leng = len(m_source_files)
        for i in range(0, leng - 1):    # Check all files of the same size
            f = m_source_files[i]
            # filecmp.cmp uses an undocumented cache, which can run out of
            # memory, therefore I am clearing it periodically
            filecmp._cache = {}
            # Against all other files of the same size, after index i
            for ff in m_source_files[(i + 1):]:
                # If the md5sums don't match, we are done (as they are sorted)
                if f[1] != ff[1]:
                    break
                res = core(size + bytes_saved, f[0], ff[0])
                if res == 2:
                    bytes_saved += size
        del source[size]                # Save some memory


# The core file comparison and linking code.
# Return 0 if the files don't match.
# Return 1 if the files match, but both f and ff already had other hard
# links, so no disk space was freed.
# Return 2 if the files match and f or ff was not already a hard link
# (disk space saved).
def core(siz, f, ff):
    f_s = os.stat(f)
    ff_s = os.stat(ff)
    f_link = f_s[stat.ST_NLINK]
    ff_link = ff_s[stat.ST_NLINK]
    ret = 0
    # Verify the inodes differ
    if f_s[stat.ST_INO] != ff_s[stat.ST_INO]:
        progress_ind.P_I()
        res = filecmp.cmp(f, ff, False)
        if res:
            # Make the link to the file having the highest link count
            if ff_link >= f_link:
                res = file_to_link(link_file=f, keep_file=ff)
            else:
                res = file_to_link(link_file=ff, keep_file=f)
            if res:
                ret += 1
                if Options.verbose:
                    print('Files match', f, ff)
                if f_link == 1 or ff_link == 1:
                    # Woo-hoo, disk space is saved!
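                    # (A link count of 1 means the replaced file's data
                    # blocks are actually freed once it becomes a hard
                    # link; with nlink > 1 the data survives elsewhere.)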
                    ret += 1
                    if Options.total:
                        progress_ind.Reset()
                        print(jefflib.Split_Thousands(siz))
    return ret


# Rename link_file to something clever, make a hard link named link_file
# to keep_file, then delete the renamed file.
def file_to_link(link_file, keep_file):
    (head, tail) = os.path.split(link_file)
    # os.path.join keeps the name relative when link_file has no directory
    # part (head + '/' would wrongly produce a path under the root)
    tmp_name = os.path.join(head, 'de_dupe-' + tail)
    ret = False
    # Rename link_file to tmp_name
    if jefflib.Try_To_Rename(link_file, tmp_name, False):
        # Make the hard link
        if jefflib.Try_To_Link(keep_file, link_file, False):
            # Delete the renamed link_file
            if jefflib.Try_To_Remove(tmp_name, False):
                ret = True
    return ret


# Compute an md5sum on the first block of each file.
# Return a list of (file_name, md5sum) tuples, sorted by md5sum.
def md5sum_s(tuples):
    new_lis = []
    for tup in tuples:
        md = jefflib.Md5sum_On_First_Block_Of_File(tup[0])
        new_lis.append((tup[0], md))
    # Sort by md5sum
    new_lis = sorted(new_lis, key=lambda foo: foo[1])
    return new_lis


# Top level code
progress_ind = jefflib.Progress_Indicator(1000, 1)
filecmp.BUFSIZE = 1024 * 1024   # Bigger buffer size for cmp

# Command line parsing code
Parser = OptionParser()
Parser.add_option("--minsize", dest="minsize", action="store", type="int",
                  default=512, help="specify min size")
Parser.add_option("--maxsize", dest="maxsize", action="store", type="int",
                  default=0, help="specify max size")
Parser.add_option("-t", "--total", dest="total", action="store_true",
                  help="print total bytes saved")
Parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                  help="verbose output")
Parser.add_option("-b", "--big", dest="big", action="store_true",
                  default=False, help="big to small")
Parser.add_option("-u", "--usage", dest="use", action="store_true",
                  default=False, help="Print usage and exit")
(Options, Args) = Parser.parse_args()

if Options.use:
    jefflib.Usage(desc)
if len(Args) == 0:
    print("Need to specify one or more directories", file=sys.stderr)
    jefflib.Usage(desc)
dirs = Args

search_table = jefflib.files_and_sizes(dirs, Options.minsize, Options.maxsize,
                                       progress_ind.P_I, ignore_links=False)
look_through_one_table(search_table)
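
# Example invocations (directory names below are illustrative, not part
# of the program):
#   ./de_dupe.py --total --minsize=1000000 foo-dir
#   ./de_dupe.py --verbose --big foo-dir bar-dir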