I wrote my very first Python program to answer a Super User question. I'm wondering how the application can be rewritten to make it as pythonic as possible.
Unit test
    import unittest
    import os
    import shutil

    from DeleteDuplicateInFolderA import DeleteDuplicateInFolderA


    class DeleteDuplicateInFolderATest(unittest.TestCase):
        def __init__(self, *args, **kwargs):
            super(DeleteDuplicateInFolderATest, self).__init__(*args, **kwargs)
            self._base_directory = r"c:\temp\test"
            self._path_A = self._base_directory + r"\A"
            self._path_B = self._base_directory + r"\B"

        def create_folder_and_create_some_files(self, path, filename_list):
            if os.path.exists(path):
                shutil.rmtree(path)
            os.makedirs(path)
            for filename in filename_list:
                open(os.path.join(path, filename), "w+").close()

        def setUp(self):
            # Create folders and files for testing
            self.create_folder_and_create_some_files(self._path_A, ["1.txt", "2.txt"])
            self.create_folder_and_create_some_files(self._path_B, ["2.txt", "3.txt"])

        def tearDown(self):
            for path in [self._path_A, self._path_B, self._base_directory]:
                if os.path.exists(path):
                    shutil.rmtree(path)

        def test_duplicate_file_gets_deleted(self):
            # Arrange
            app = DeleteDuplicateInFolderA(self._path_A, self._path_B, is_dry_run=False)
            # Act
            app.delete_duplicates_in_folder_A()
            # Assert
            self.assertFalse(os.path.isfile(self._path_A + r"\2.txt"), "File 2.txt has not been deleted.")

        def test_duplicate_file_gets_not_deleted_in_mode_dryrun(self):
            # Arrange
            app = DeleteDuplicateInFolderA(self._path_A, self._path_B, is_dry_run=True)
            # Act
            app.delete_duplicates_in_folder_A()
            # Assert
            self.assertTrue(os.path.isfile(self._path_A + r"\2.txt"), "File 2.txt should not have been deleted in mode '--dryrun'")


    def main():
        unittest.main()


    if __name__ == '__main__':
        main()
Application
    #!/usr/bin/python
    import sys
    import os


    class DeleteDuplicateInFolderA(object):
        """Given two paths A and B, the application determines which files are in
        path A which are also in path B and then deletes the duplicates from
        path A.

        If the "dry run" flag is set to 'false', files are deleted. Otherwise
        they are only displayed but not deleted.
        """

        def __init__(self, path_A, path_B, is_dry_run=True):
            self._path_A = path_A
            self._path_B = path_B
            self._is_dry_run = is_dry_run

        def get_filenames_in_folder(self, folder_path):
            only_files = []
            for (dirpath, dirnames, filenames) in os.walk(folder_path):
                only_files.extend(filenames)
            return only_files

        def print_files(sel, heading, files):
            print(heading)
            if len(files) == 0:
                print(" none")
            else:
                for file in files:
                    print(" {}".format(file))

        def delete_duplicates_in_folder_A(self):
            only_files_A = self.get_filenames_in_folder(self._path_A)
            only_files_B = self.get_filenames_in_folder(self._path_B)
            files_of_A_that_are_in_B = [file for file in only_files_A if file in only_files_B]

            self.print_files("Files in {}".format(self._path_A), only_files_A)
            self.print_files("Files in {}".format(self._path_B), only_files_B)

            if self._is_dry_run:
                self.print_files("These files would be deleted: ", [os.path.join(self._path_A, file) for file in files_of_A_that_are_in_B])
            else:
                print("Deleting files:")
                for filepath in [os.path.join(self._path_A, file) for file in files_of_A_that_are_in_B]:
                    print(" {}".format(filepath))
                    os.remove(filepath)


    if __name__ == "__main__":
        if len(sys.argv) == 4:
            is_dry_run_argument = sys.argv[3]
            if not is_dry_run_argument == "--dryrun":
                print("The 3rd argument must be '--dryrun' or nothing.")
            else:
                app = DeleteDuplicateInFolderA(sys.argv[1], sys.argv[2], is_dry_run=True)
        else:
            app = DeleteDuplicateInFolderA(sys.argv[1], sys.argv[2], is_dry_run=False)

        app.delete_duplicates_in_folder_A()
3 Answers
First of all, make sure you use some tools to check your code (e.g. Pylint or Prospector).
And a few comments:
Use os.path.join:

    self._path_A = os.path.join(self._base_directory, "A")

vs

    self._path_A = self._base_directory + r"\A"
Rely on the truthiness of containers when checking whether they have content:

    if not files:

vs

    if len(files) == 0:
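As a quick illustration of the idiom (my own snippet, not from the original code): empty containers are falsy, so no explicit length check is needed.

    files = []
    if not files:            # an empty list is falsy
        print(" none")

    files = ["1.txt", "2.txt"]
    if files:                # a non-empty list is truthy
        print(" {} file(s) found".format(len(files)))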
Use the above-mentioned tools to find obvious typos:

    def print_files(self, heading, files):

vs

    def print_files(sel, heading, files):
Do not name unused results:

    for (dirpath, _, filenames) in os.walk(folder_path):

vs

    for (dirpath, dirnames, filenames) in os.walk(folder_path):
You do not seem to take any subdirectories into account:

    for (dirpath, dirnames, filenames) in os.walk(folder_path):
        only_files.extend(filenames)

which you probably want to handle, because you later try to remove the files you collected:

    for filepath in [os.path.join(self._path_A, file) for file in files_of_A_that_are_in_B]:

One way to deal with this is sketched below.
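A minimal sketch of what that could look like (my own adaptation of the questioner's method, assuming duplicates should be matched by their path relative to the scanned folder):

    def get_filenames_in_folder(self, folder_path):
        only_files = []
        for dirpath, _, filenames in os.walk(folder_path):
            # keep the path relative to the scanned folder, e.g. r"sub\2.txt"
            relative_dir = os.path.relpath(dirpath, folder_path)
            for filename in filenames:
                only_files.append(os.path.join(relative_dir, filename))
        return only_files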
Wrap the top-level code into a function:

    def main():
        if len(sys.argv) == 4:
            is_dry_run_argument = sys.argv[3]
            if not is_dry_run_argument == "--dryrun":
                print("The 3rd argument must be '--dryrun' or nothing.")
            else:
                app = DeleteDuplicateInFolderA(sys.argv[1], sys.argv[2], is_dry_run=True)
        else:
            app = DeleteDuplicateInFolderA(sys.argv[1], sys.argv[2], is_dry_run=False)

        app.delete_duplicates_in_folder_A()


    if __name__ == '__main__':
        main()

vs

    if __name__ == '__main__':
        if len(sys.argv) == 4:
            is_dry_run_argument = sys.argv[3]
            if not is_dry_run_argument == "--dryrun":
                print("The 3rd argument must be '--dryrun' or nothing.")
            else:
                app = DeleteDuplicateInFolderA(sys.argv[1], sys.argv[2], is_dry_run=True)
        else:
            app = DeleteDuplicateInFolderA(sys.argv[1], sys.argv[2], is_dry_run=False)

        app.delete_duplicates_in_folder_A()
To make things more Pythonic, you should follow PEP 8, as it is the standard for Pythonic code.
I personally would change get_filenames_in_folder, as it gets all child files, even ones inside sub-folders. So I'd make it a generator that yields files with paths relative to its folder, i.e. .\spam.py and foo\bar.py. To achieve this you just need to add os.path.relpath and then os.path.join the result with the file name. This results in:
    def sub_files(folder):
        relpath = os.path.relpath
        join = os.path.join
        for path, _, files in os.walk(folder):
            relative = relpath(path, folder)
            for file in files:
                yield join(relative, file)
Your way of finding the duplicates, or as you call them files_of_A_that_are_in_B, is OK. But I'd go for an easier-to-read approach: build sets and take the intersection, set(files_a) & set(files_b). Nice and easy to read.
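A tiny demonstration (my example, using the file names from the question's test setup):

    files_a = ["1.txt", "2.txt"]
    files_b = ["2.txt", "3.txt"]

    duplicates = set(files_a) & set(files_b)
    print(duplicates)  # {'2.txt'}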
I don't like that you print different things with and without is_dry_run. Instead I'd make it print the duplicates if there are any, otherwise nothing.

I also don't like that any file with the same name counts as a duplicate. I can't think of good names when I want a temporary Python file, so I end up with tmp, temp, tmp.py, temp.py, and sometimes even temp.temp.py! Those files are not duplicates, they just share a name. So I'd use filecmp. And so I'd use the following function to do the same as you:
    def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
        folders = [folder_a, folder_b]
        files = [set(sub_files(folder)) for folder in folders]
        duplicates, *_ = groups = [files[0] & files[1]]
        if not shallow:
            duplicates, *_ = groups = filecmp.cmpfiles(*folders, duplicates)
        print('\n\n'.join(
            '{}:\n{}'.format(name, '\n'.join(' {}'.format(file) for file in files))
            for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
            if files
        ))
        if not dry_run:
            join = os.path.join
            remove = os.remove
            for file in duplicates:
                remove(join(folder_a, file))
Which uses very little actual code.
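As a usage sketch (my example, with hypothetical paths), it can be called like this:

    # dry run: only reports the duplicates, nothing is removed
    remove_duplicates(r"c:\temp\test\A", r"c:\temp\test\B", dry_run=True)

    # real run: duplicates found in folder A are deleted from folder A
    remove_duplicates(r"c:\temp\test\A", r"c:\temp\test\B")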
After this, I'd change how you do your __main__. Instead of rolling your own argument parser, you can just use argparse. Trust me, it's really simple to set up and use. Instead of what you're doing, you can just do:
    if __name__ == '__main__':
        parser = argparse.ArgumentParser()
        parser.add_argument('folder_a', help='folder to delete from')
        parser.add_argument('folder_b', help='folder to find duplicates in')
        parser.add_argument('-d', '--dry', help='dry-run the program', action="store_true")
        parser.add_argument('-s', '--shallow', help='only check file names for duplicates',
                            action="store_true")
        args = parser.parse_args()
        remove_duplicates(args.folder_a, args.folder_b, shallow=args.shallow, dry_run=args.dry)
And you don't have to care about wrong input, as it'll never get to you. The only thing I think is ugly is the print I'm using, but since the alternative is either slow or a lot longer, I'm fine with it. And possibly the duplicates, *_ hack.
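For comparison, the longer plain-loop version that the print replaces could look roughly like this (my sketch, producing roughly the same output):

    for group, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors')):
        if not group:
            continue
        print('{}:'.format(name))
        for file in group:
            print(' {}'.format(file))
        print()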
And so it could be:
    import os
    import filecmp
    import argparse


    def sub_files(folder):
        relpath = os.path.relpath
        join = os.path.join
        for path, _, files in os.walk(folder):
            relative = relpath(path, folder)
            for file in files:
                yield join(relative, file)


    def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
        folders = [folder_a, folder_b]
        files = [set(sub_files(folder)) for folder in folders]
        duplicates, *_ = groups = [files[0] & files[1]]
        if not shallow:
            duplicates, *_ = groups = filecmp.cmpfiles(*folders, duplicates)
        print('\n\n'.join(
            '{}:\n{}'.format(name, '\n'.join(' {}'.format(file) for file in files))
            for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
            if files
        ))
        if not dry_run:
            join = os.path.join
            remove = os.remove
            for file in duplicates:
                remove(join(folder_a, file))


    if __name__ == '__main__':
        parser = argparse.ArgumentParser()
        parser.add_argument('folder_a', help='folder to delete from')
        parser.add_argument('folder_b', help='folder to find duplicates in')
        parser.add_argument('-d', '--dry', help='dry-run the program', action="store_true")
        parser.add_argument('-s', '--shallow', help='only check file names for duplicates',
                            action="store_true")
        args = parser.parse_args()
        remove_duplicates(args.folder_a, args.folder_b, shallow=args.shallow, dry_run=args.dry)
The first "duplicates" line should read
duplicates, *_ = groups = [files[0] , files[1]]
My extended code: move duplicates to the trash instead of deleting them.
    # Ferdiga
    # moves duplicate files in folder_a to trash
    # 20220625
    # credits to
    # https://codereview.stackexchange.com/questions/133333/compare-files-by-name-in-two-folders-a-and-b-and-delete-duplicates-from-folder-a

    import os
    import filecmp
    import argparse
    from send2trash import send2trash


    def sub_files(folder):
        relpath = os.path.relpath
        join = os.path.join
        for path, _, files in os.walk(folder):
            relative = relpath(path, folder)
            for file in files:
                yield join(relative, file)


    def remove_duplicates(folder_a, folder_b, shallow=False, dry_run=False):
        Trash = "/Users/ferdinand/.Trash"
        folders = [folder_a, folder_b]
        files = [set(sub_files(folder)) for folder in folders]
        duplicates, *_ = groups = [files[0], files[1]]
        if not shallow:
            duplicates, *_ = groups = filecmp.cmpfiles(*folders, duplicates)
        print('\n\n'.join(
            '{}:\n{}'.format(name, '\n'.join(' {}'.format(file) for file in files))
            for files, name in zip(groups, ('Duplicates', 'Non-Duplicates', 'Errors'))
            if files
        ))
        if not dry_run:
            join = os.path.join
            for file in duplicates:
                send2trash(join(folder_a, file))


    if __name__ == '__main__':
        parser = argparse.ArgumentParser()
        parser.add_argument('folder_a', help='folder to delete from')
        parser.add_argument('folder_b', help='folder to find duplicates in')
        parser.add_argument('-d', '--dry', help='dry-run the program', action="store_true")
        parser.add_argument('-s', '--shallow', help='only check file names for duplicates',
                            action="store_true")
        args = parser.parse_args()
        remove_duplicates(args.folder_a, args.folder_b, shallow=args.shallow, dry_run=args.dry)
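Note that send2trash is a third-party package (installable from PyPI, e.g. pip install Send2Trash); a minimal usage sketch:

    from send2trash import send2trash

    # moves the file to the platform's trash/recycle bin instead of deleting it permanently
    send2trash("some_duplicate.txt")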
print(...), not println(...), in your if not is_dry_... block. Python, not JavaScript ;)