I wrote my very first Python program to answer a Super User question. I'm wondering how the application can be rewritten to make it as pythonic as possible.
Unit test
    import unittest
    import os
    import shutil

    from DeleteDuplicateInFolderA import DeleteDuplicateInFolderA


    class DeleteDuplicateInFolderATest(unittest.TestCase):
        def __init__(self, *args, **kwargs):
            super(DeleteDuplicateInFolderATest, self).__init__(*args, **kwargs)
            self._base_directory = r"c:\temp\test"
            self._path_A = self._base_directory + r"\A"
            self._path_B = self._base_directory + r"\B"

        def create_folder_and_create_some_files(self, path, filename_list):
            if os.path.exists(path):
                shutil.rmtree(path)
            os.makedirs(path)
            for filename in filename_list:
                open(os.path.join(path, filename), "w+").close()

        def setUp(self):
            # Create folders and files for testing
            self.create_folder_and_create_some_files(self._path_A, ["1.txt", "2.txt"])
            self.create_folder_and_create_some_files(self._path_B, ["2.txt", "3.txt"])

        def tearDown(self):
            for path in [self._path_A, self._path_B, self._base_directory]:
                if os.path.exists(path):
                    shutil.rmtree(path)

        def test_duplicate_file_gets_deleted(self):
            # Arrange
            app = DeleteDuplicateInFolderA(self._path_A, self._path_B, is_dry_run=False)
            # Act
            app.delete_duplicates_in_folder_A()
            # Assert
            self.assertFalse(os.path.isfile(self._path_A + r"\2.txt"), "File 2.txt has not been deleted.")

        def test_duplicate_file_gets_not_deleted_in_mode_dryrun(self):
            # Arrange
            app = DeleteDuplicateInFolderA(self._path_A, self._path_B, is_dry_run=True)
            # Act
            app.delete_duplicates_in_folder_A()
            # Assert
            self.assertTrue(os.path.isfile(self._path_A + r"\2.txt"), "File 2.txt should not have been deleted in mode '--dryrun'")


    def main():
        unittest.main()


    if __name__ == '__main__':
        main()
Application
    #!/usr/bin/python
    import sys
    import os


    class DeleteDuplicateInFolderA(object):
        """Given two paths A and B, the application determines which files are in
        path A which are also in path B and then deletes the duplicates from
        path A.

        If the "dry run" flag is set to 'false', files are deleted. Otherwise
        they are only displayed but not deleted.
        """

        def __init__(self, path_A, path_B, is_dry_run=True):
            self._path_A = path_A
            self._path_B = path_B
            self._is_dry_run = is_dry_run

        def get_filenames_in_folder(self, folder_path):
            only_files = []
            for (dirpath, dirnames, filenames) in os.walk(folder_path):
                only_files.extend(filenames)
            return only_files

        def print_files(sel, heading, files):
            print(heading)
            if len(files) == 0:
                print(" none")
            else:
                for file in files:
                    print(" {}".format(file))

        def delete_duplicates_in_folder_A(self):
            only_files_A = self.get_filenames_in_folder(self._path_A)
            only_files_B = self.get_filenames_in_folder(self._path_B)
            files_of_A_that_are_in_B = [file for file in only_files_A if file in only_files_B]

            self.print_files("Files in {}".format(self._path_A), only_files_A)
            self.print_files("Files in {}".format(self._path_B), only_files_B)

            if self._is_dry_run:
                self.print_files("These files would be deleted: ", [os.path.join(self._path_A, file) for file in files_of_A_that_are_in_B])
            else:
                print("Deleting files:")
                for filepath in [os.path.join(self._path_A, file) for file in files_of_A_that_are_in_B]:
                    print(" {}".format(filepath))
                    os.remove(filepath)


    if __name__ == "__main__":
        if len(sys.argv) == 4:
            is_dry_run_argument = sys.argv[3]
            if not is_dry_run_argument == "--dryrun":
                print("The 3rd argument must be '--dryrun' or nothing.")
            else:
                app = DeleteDuplicateInFolderA(sys.argv[1], sys.argv[2], is_dry_run=True)
        else:
            app = DeleteDuplicateInFolderA(sys.argv[1], sys.argv[2], is_dry_run=False)

        app.delete_duplicates_in_folder_A()
3 Answers
First of all, make sure you use some tools to check your code (e.g. Pylint or Prospector).
And a few comments:
Use os.path.join:

    self._path_A = os.path.join(self._base_directory, "A")

vs

    self._path_A = self._base_directory + r"\A"
Rely on the truthiness of containers when checking whether they have content:

    if not files:

vs

    if len(files) == 0:
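As a quick illustration of the idiom (my own snippet, not from the original code): empty containers are falsy, so no explicit length check is needed.

    files = []
    if not files:            # an empty list is falsy
        print(" none")

    files = ["1.txt", "2.txt"]
    if files:                # a non-empty list is truthy
        print(" {} file(s) found".format(len(files)))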
Use the above-mentioned tools to find obvious typos:

    def print_files(self, heading, files):

vs

    def print_files(sel, heading, files):
Do not name unused results:

    for (dirpath, _, filenames) in os.walk(folder_path):

vs

    for (dirpath, dirnames, filenames) in os.walk(folder_path):
You do not seem to take any subdirectories into account:

    for (dirpath, dirnames, filenames) in os.walk(folder_path):
        only_files.extend(filenames)

which you probably want to handle, because you later try to remove the files you collected:

    for filepath in [os.path.join(self._path_A, file) for file in files_of_A_that_are_in_B]:

One way to deal with this is sketched below.
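A minimal sketch of what that could look like (my own adaptation of the questioner's method, assuming duplicates should be matched by their path relative to the scanned folder):

    def get_filenames_in_folder(self, folder_path):
        only_files = []
        for dirpath, _, filenames in os.walk(folder_path):
            # keep the path relative to the scanned folder, e.g. r"sub\2.txt"
            relative_dir = os.path.relpath(dirpath, folder_path)
            for filename in filenames:
                only_files.append(os.path.join(relative_dir, filename))
        return only_files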
Wrap the top-level code into a function:

    def main():
        if len(sys.argv) == 4:
            is_dry_run_argument = sys.argv[3]
            if not is_dry_run_argument == "--dryrun":
                print("The 3rd argument must be '--dryrun' or nothing.")
            else:
                app = DeleteDuplicateInFolderA(sys.argv[1], sys.argv[2], is_dry_run=True)
        else:
            app = DeleteDuplicateInFolderA(sys.argv[1], sys.argv[2], is_dry_run=False)

        app.delete_duplicates_in_folder_A()


    if __name__ == '__main__':
        main()

vs

    if __name__ == '__main__':
        if len(sys.argv) == 4:
            is_dry_run_argument = sys.argv[3]
            if not is_dry_run_argument == "--dryrun":
                print("The 3rd argument must be '--dryrun' or nothing.")
            else:
                app = DeleteDuplicateInFolderA(sys.argv[1], sys.argv[2], is_dry_run=True)
        else:
            app = DeleteDuplicateInFolderA(sys.argv[1], sys.argv[2], is_dry_run=False)

        app.delete_duplicates_in_folder_A()
To make things more Pythonic, you should follow PEP 8, as it is the standard for Pythonic code.
I personally would change get_filenames_in_folder, as it gets all child files, even ones inside sub-folders. So I'd make it a generator that yields files with paths relative to its folder, i.e. .\spam.py and foo\bar.py. To achieve this you just need to add os.path.relpath and then os.path.join the result with the file name. This results in:
    def sub_files(folder):
        relpath = os.path.relpath
        join = os.path.join
        for path, _, files in os.walk(folder):
            relative = relpath(path, folder)
            for file in files:
                yield join(relative, file)
Your way of finding the duplicates, or as you call them files_of_A_that_are_in_B, is OK. But I'd go for an easier-to-read approach: build sets and take the intersection, set(files_a) & set(files_b). Nice and easy to read.
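A tiny demonstration (my example, using the file names from the question's test setup):

    files_a = ["1.txt", "2.txt"]
    files_b = ["2.txt", "3.txt"]

    duplicates = set(files_a) & set(files_b)
    print(duplicates)  # {'2.txt'}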
I don't like that you print different things with and without is_dry_run. Instead I'd make it print the duplicates if there are any, otherwise nothing.

I also don't like that any file with the same name counts as a duplicate. I can't think of good names when I want a temporary Python file, so I end up with tmp, temp, tmp.py, temp.py, and sometimes even temp.temp.py! Those files are not duplicates, they just share a name. So I'd use filecmp. And so I'd use the following function to do the same as you:
    def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
        folders = [folder_a, folder_b]
        files = [set(sub_files(folder)) for folder in folders]
        duplicates, *_ = groups = [files[0] & files[1]]
        if not shallow:
            duplicates, *_ = groups = filecmp.cmpfiles(*folders, duplicates)
        print('\n\n'.join(
            '{}:\n{}'.format(name, '\n'.join(' {}'.format(file) for file in files))
            for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
            if files
        ))
        if not dry_run:
            join = os.path.join
            remove = os.remove
            for file in duplicates:
                remove(join(folder_a, file))
Which uses very little actual code.
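As a usage sketch (my example, with hypothetical paths), it can be called like this:

    # dry run: only reports the duplicates, nothing is removed
    remove_duplicates(r"c:\temp\test\A", r"c:\temp\test\B", dry_run=True)

    # real run: duplicates found in folder A are deleted from folder A
    remove_duplicates(r"c:\temp\test\A", r"c:\temp\test\B")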
After this, I'd change how you do your __main__. Instead of rolling your own argument parser, you can just use argparse. Trust me, it's really simple to set up and use. Instead of what you're doing, you can just do:
    if __name__ == '__main__':
        parser = argparse.ArgumentParser()
        parser.add_argument('folder_a', help='folder to delete from')
        parser.add_argument('folder_b', help='folder to find duplicates in')
        parser.add_argument('-d', '--dry', help='dry-run the program', action="store_true")
        parser.add_argument('-s', '--shallow', help='only check file names for duplicates',
                            action="store_true")
        args = parser.parse_args()
        remove_duplicates(args.folder_a, args.folder_b, shallow=args.shallow, dry_run=args.dry)
And you don't have to care about wrong input, as it'll never get to you. The only thing I think is ugly is the print I'm using, but since the alternative is either slow or a lot longer, I'm fine with it. And possibly the duplicates, *_ hack.
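For comparison, the longer plain-loop version that the print replaces could look roughly like this (my sketch, producing roughly the same output):

    for group, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors')):
        if not group:
            continue
        print('{}:'.format(name))
        for file in group:
            print(' {}'.format(file))
        print()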
And so it could be:
    import os
    import filecmp
    import argparse


    def sub_files(folder):
        relpath = os.path.relpath
        join = os.path.join
        for path, _, files in os.walk(folder):
            relative = relpath(path, folder)
            for file in files:
                yield join(relative, file)


    def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
        folders = [folder_a, folder_b]
        files = [set(sub_files(folder)) for folder in folders]
        duplicates, *_ = groups = [files[0] & files[1]]
        if not shallow:
            duplicates, *_ = groups = filecmp.cmpfiles(*folders, duplicates)
        print('\n\n'.join(
            '{}:\n{}'.format(name, '\n'.join(' {}'.format(file) for file in files))
            for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
            if files
        ))
        if not dry_run:
            join = os.path.join
            remove = os.remove
            for file in duplicates:
                remove(join(folder_a, file))


    if __name__ == '__main__':
        parser = argparse.ArgumentParser()
        parser.add_argument('folder_a', help='folder to delete from')
        parser.add_argument('folder_b', help='folder to find duplicates in')
        parser.add_argument('-d', '--dry', help='dry-run the program', action="store_true")
        parser.add_argument('-s', '--shallow', help='only check file names for duplicates',
                            action="store_true")
        args = parser.parse_args()
        remove_duplicates(args.folder_a, args.folder_b, shallow=args.shallow, dry_run=args.dry)
The first "duplicates" line should read
duplicates, *_ = groups = [files[0] , files[1]]
My extended code: move duplicates to the trash instead of deleting them.
    # Ferdiga
    # moves duplicate files in folder_a to trash
    # 20220625
    # credits to
    # https://codereview.stackexchange.com/questions/133333/compare-files-by-name-in-two-folders-a-and-b-and-delete-duplicates-from-folder-a

    import os
    import filecmp
    import argparse
    from send2trash import send2trash


    def sub_files(folder):
        relpath = os.path.relpath
        join = os.path.join
        for path, _, files in os.walk(folder):
            relative = relpath(path, folder)
            for file in files:
                yield join(relative, file)


    def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
        Trash = "/Users/ferdinand/.Trash"
        folders = [folder_a, folder_b]
        files = [set(sub_files(folder)) for folder in folders]
        duplicates, *_ = groups = [files[0], files[1]]
        if not shallow:
            duplicates, *_ = groups = filecmp.cmpfiles(*folders, duplicates)
        print('\n\n'.join(
            '{}:\n{}'.format(name, '\n'.join(' {}'.format(file) for file in files))
            for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
            if files
        ))
        if not dry_run:
            join = os.path.join
            for file in duplicates:
                send2trash(join(folder_a, file))


    if __name__ == '__main__':
        parser = argparse.ArgumentParser()
        parser.add_argument('folder_a', help='folder to delete from')
        parser.add_argument('folder_b', help='folder to find duplicates in')
        parser.add_argument('-d', '--dry', help='dry-run the program', action="store_true")
        parser.add_argument('-s', '--shallow', help='only check file names for duplicates',
                            action="store_true")
        args = parser.parse_args()
        remove_duplicates(args.folder_a, args.folder_b, shallow=args.shallow, dry_run=args.dry)
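Note that send2trash is a third-party package (installable from PyPI, e.g. pip install Send2Trash); a minimal usage sketch:

    from send2trash import send2trash

    # moves the file to the platform's trash/recycle bin instead of deleting it permanently
    send2trash("some_duplicate.txt")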
print(...), not println(...), in your if not is_dry_... block. Python, not JavaScript ;)