#!/usr/bin/env python3
# Copyright Jeff turbo Deifik 2003, 2004, 2005 All rights reserved
# $Id: md5gen.py,v 1.16 2025/06/06 13:41:49 jdeifik Exp $
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Jul-25-2003 JTD Wrote
# Jun-27-2005 JTD Fixed bug that processes each dir twice
# Jun-28-2005 JTD Made multithreaded, large potential speedup
# Jun-29-2005 JTD Wait for all threads to finish before exiting
# Jan-03-2016 JTD Removed threaded code, fixed mail code to work with
#                 multiple directories.
# Jan-09-2016 JTD Print each directory name by default (-q) to suppress
# Apr-10-2018 JTD Converted to python3
# Jul-26-2022 JTD Made order of appending directories alphabetical

import os
import sys
import difflib
import time
import argparse
from typing import *

import jefflib

desc: List[str] = [
    'Usage: md5gen.py [-h] [-q] [-u] [-v] root-dir ...',
    '-h prints help message',
    '-q quiet (do not print directory names by default)',
    '-u prints this usage message',
    '-v verbose mode',
    'Recurse through all directories starting at root directory root-dir',
    'Generate a md5 file, called jtd_MD5SUM of all plain text files',
    'If there is an existing md5 file, diff the current sums with the old ones',
    'Rename the existing md5 file (if found) to old_jtd_MD5SUM'
]

# Constants
BLOCKSIZE = 1024*1024
CKNAME = b'jtd_MD5SUM'
OLD_CKNAME = b'old_' + CKNAME

# Set to True to get timestamped trace output on stderr.
DEBUG = False

# Directory entries that are never descended into (mostly Windows / cygwin
# system directories, plus CVS metadata and /proc).
SKIP_NAMES = frozenset([
    b'RECYCLER', b'$RECYCLE.BIN', b'$RECYCLE.bin',
    b'System Volume Information', b'System_Volume_Information',
    b'CVS', b'cygdrive', b'Diskeeper', b'proc',
])


def debug_print(st: object) -> None:
    """Write a timestamped trace line to stderr when DEBUG is enabled.

    Does nothing unless the module-level DEBUG flag is True.  Output goes to
    stderr (always a text stream) because stdout is rebound to its binary
    buffer part-way through start-up, and because callers pass both strings
    and lists of bytes.
    """
    if not DEBUG:
        return
    text = st if isinstance(st, str) else repr(st)
    # time.clock() was removed in Python 3.8; perf_counter() is its successor.
    sys.stderr.write(f'{time.perf_counter():.6f} DEBUG {text}\n')


def verbose_print(*bs: str) -> None:
    """Print the given messages on one line, but only in verbose mode.

    Each message is converted with jefflib.String_to_Bytes and followed by a
    single space; the line is terminated with a newline.  Expects sys.stdout
    to already be the binary stream installed by the top-level code.
    """
    if Opts.verbose:
        for be in bs:
            sys.stdout.write(jefflib.String_to_Bytes(be) + b' ')
        sys.stdout.write(b'\n')


def do_md5gen(the_dirs: List[bytes]) -> None:
    """Walk every directory tree rooted at an entry of the_dirs.

    Each directory is handed to process_the_directory(), then its
    subdirectories are queued immediately after it, in sorted order, so a
    whole subtree is finished before the next root is started.

    the_dirs is used as the work list and is empty when this returns.
    Entries that cannot be listed (not a directory, vanished, permission
    denied) are skipped silently.
    """
    while the_dirs:
        debug_print('top of while loop')
        debug_print(the_dirs)

        # Work on current root directory
        cur_dir = the_dirs.pop(0)
        if os.path.isdir(cur_dir):
            process_the_directory(cur_dir)

        # Try to find subdirectories to process.  os.listdir() returns names
        # in arbitrary order, so sort to get the alphabetical traversal.
        try:
            names = sorted(os.listdir(cur_dir))
        except OSError:
            continue

        # Avoid producing b'//name' when walking from the filesystem root.
        prefix = b'/' if cur_dir == b'/' else cur_dir + b'/'
        candidates = [prefix + name for name in names
                      if name not in SKIP_NAMES]
        # Insert in alphabetical order ahead of the remaining work.
        the_dirs[0:0] = [path for path in candidates if os.path.isdir(path)]


def _print_differences(old_text: List[bytes], md5list: List[bytes]) -> None:
    """Write the changed lines of a context diff between the two sum lists.

    Unchanged context lines, the '***' / '---' header and range lines, and
    the bare b'- \\n' marker are filtered out so only real changes are shown.
    """
    sys.stdout.write(b'Differences are:\n')
    for r in difflib.diff_bytes(difflib.context_diff, old_text, md5list):
        if len(r) > 1 and r[:1] == b' ':
            continue
        if len(r) > 3 and r.startswith((b'***', b'---')):
            continue
        if r == b'- \n':
            continue
        sys.stdout.write(r)


def process_the_directory(dir_name: bytes) -> None:
    """Compute md5 sums for one directory and compare with the stored ones.

    dir_name is a full directory path.  Nothing is computed or written for a
    directory without plain files.  Otherwise:

    * no CKNAME file present: report it and write a new one;
    * CKNAME present and identical: nothing is written;
    * CKNAME present but different: print the differences, move the existing
      file to OLD_CKNAME (replacing any previous one) and write a new CKNAME.

    Relies on jefflib helpers whose exact contracts are not visible here;
    Md5sum_On_File is assumed to return one bytes line without a trailing
    newline and Cmp_Lists to return a true value when the lists are equal.
    """
    if not Opts.quiet:
        sys.stdout.flush()
        sys.stdout.write(b'Processing directory ' + dir_name + b'\n')
    debug_print(b'TOP OF PROCESS_THE_DIRECTORY ' + dir_name)

    (_unused, plain_file_list) = jefflib.Plain_Files_In_Directory(dir_name)
    if not plain_file_list:
        return

    full_ckname = dir_name + b'/' + CKNAME
    full_old_ckname = dir_name + b'/' + OLD_CKNAME

    # This is where almost all the cpu time is spent.  The md5sum files
    # themselves are excluded, and the result is kept in sorted form.
    md5list = sorted(
        jefflib.Md5sum_On_File(dir_name + b'/' + plain, plain) + b'\n'
        for plain in plain_file_list
        if plain not in (CKNAME, OLD_CKNAME)
    )

    if CKNAME not in plain_file_list:
        sys.stdout.write(b'directory ' + dir_name +
                         b' missing md5sum file !!!!\n')
        jefflib.List_To_File(full_ckname, md5list, True)
        return

    # Read in the existing md5sum file and compare in sorted order.
    verbose_print("found md5sum file")
    old_text = jefflib.File_To_List(full_ckname, True)
    old_text.sort()
    if jefflib.Cmp_Lists(md5list, old_text):
        verbose_print("md5sum's match")
        return

    sys.stdout.write(b'Differences Found !!!! directory is: ' +
                     dir_name + b'\n')
    _print_differences(old_text, md5list)

    # Keep the previous sums as OLD_CKNAME.  os.replace() overwrites an
    # existing destination on every platform, so no separate remove is
    # needed.
    os.replace(full_ckname, full_old_ckname)
    jefflib.List_To_File(full_ckname, md5list, True)


# Top level code
if __name__ == '__main__':
    # Command line parsing code
    Parser = argparse.ArgumentParser()
    Parser.add_argument("-v", "--verbose", action="store_true",
                        dest="verbose", default=False,
                        help="enable verbose messages")
    Parser.add_argument("-q", "--quiet", action="store_true",
                        dest="quiet", default=False,
                        help="do not print each directory name")
    Parser.add_argument("-u", "--usage", dest="use", action="store_true",
                        help="Print usage and exit")
    Parser.add_argument("args", nargs='*', help="the real arguments")
    Opts = Parser.parse_args()

    if Opts.use:
        jefflib.Usage(desc)

    if len(Opts.args) == 0:
        jefflib.Usage(desc)
    else:
        # Make args into binary strings
        dirs = jefflib.List_String_to_Bytes(Opts.args)
        debug_print(dirs)
        # Put stdout in binary mode; flush first so text already written
        # through the text layer is not reordered or lost.
        sys.stdout.flush()
        sys.stdout = sys.stdout.buffer
        do_md5gen(dirs)