I'm trying to do a one-way directory sync. Given a list of existing files in a dir, I'd like to make the files in the dir equal a new list of files. There will be subdirs under the dir. Because operating system calls are expensive, I'd rather minimize the number needed.
It's easy to just delete each file in the existing list but not on the new list, but that could leave empty subdirs. I could test for empty subdirs with OS calls, but as noted I'd like to avoid that. Similarly, I'd prefer removing dirs to first removing each file in the dir, then removing the empty dir.
I'm just operating on file names, not checking whether two files with the same name are the same or actually copying or deleting files or directories.
'''
Input:
- list of existing files
- revised list of files
Output:
- lists to be used to transform first list to second list
-- list of files to be added to existing dir
-- list of directories to be pruned
-- list of files to be deleted
'''
import os
import sys
def file_to_list(file):
return [x.strip() for x in open(file, 'r') if not x.startswith('#EXT')]
def one_minus_two(one, two):
return [x for x in one if x not in set(two)]
def reduce_dirs(delete_files, new_list):
new_delete = []
for file in delete_files:
parts = file.split('\\')
sub = ''
for i in range(len(parts)):
sub = os.path.join(sub, parts[i])
if sub == '':
sub = '\\'
count = 0
for song in new_list:
if song.startswith(sub):
count += 1
break
if count == 0:
new_delete.append(sub)
break
return list(set(new_delete))
def reduce_files(remove_dirs, delete_files):
rd = []
rf = []
for dir in remove_dirs:
if dir in delete_files:
rf.append(dir)
else:
rd.append(dir)
return rf, rd
def main():
old_file = sys.argv[1]
new_file = sys.argv[2]
old_list = file_to_list(old_file)
new_list = file_to_list(new_file)
add_files = one_minus_two(new_list, old_list)
print 'add_files', add_files
delete_files = one_minus_two(old_list, new_list)
print '\nraw delete list', delete_files # intermediate result
remove_items = reduce_dirs(delete_files, new_list)
print '\nreduced delete list', remove_items # intermediate result
rf, rd = reduce_files(remove_items, delete_files)
print '\ndelete files', rf
print '\nprune dirs', rd
if __name__ == '__main__':
main()
Sample list of existing files (old_files):
\dir\who\tommy\song1 \dir\who\tommy\song2 \dir\who\tommy\song3 \dir\rolling\beggars\song4 \dir\rolling\beggars\song5 \dir\rolling\beggars\song6 \dir\who\next\song7 \dir\who\next\song8 \dir\who\next\song9 \dir\pink\dark\song10 \dir\pink\dark\song11 \dir\pink\dark\song12 \dir\bach\orch\fugue\song13 \dir\bach\orch\fugue\song14 \dir\bach\orch\fugue\song15
Sample list of new_files:
\dir\rolling\beggars\song4 \dir\rolling\beggars\song5 \dir\rolling\beggars\song6 \dir\pink\dark\song10 \dir\pink\dark\song11 \dir\yes\closer\song16 \dir\yes\closer\song17 \dir\yes\closer\song18 \dir\king\court\song2 \dir\king\court\song4 \dir\king\court\song6
There are likely cases I'm ignoring with these simple examples.
I have the feeling I'm reinventing the wheel here.
1 Answer 1
Because operating system calls are expensive, I'd rather minimize the number needed
That's fine, and a great idea especially for dry-runs and testing.
It's easy to just delete each file in the existing list but not on the new list, but that could leave empty subdirs. I could test for empty subdirs with OS calls, but as noted I'd like to avoid that.
There is some risk here. What happens if a directory designated for deletion has files that did not appear in the new-list? I think at the least, the old list should be based on the true contents of the filesystem (outside of a unit test context). However, I don't demonstrate this.
not [...] actually copying or deleting files or directories.
The program implies only deletion, not copying.
Otherwise:
Convert from Python 2 to Python 3.
Add unit tests.
Don't hard-code for backslash path separators; use pathlib
properly instead.
I don't think there's value in all of your prints, only the final output. If you want to add optional debugging content, use logging
.
Use argparse
since this is supposed to be a command-line utility.
About your algorithm, things I like: it's non-recursive. Things I don't like as much: one_minus_two
is just reimplementing set subtraction; and it relies too much on string manipulation. It also differentiates between files and directories when I don't see a lot of value in doing that. count += 1
should terminate that loop.
As an alternative, I demonstrate a tree recursion algorithm. It's not clearly better or worse, but I can understand how it works better.
import argparse
import io
import typing
from collections import defaultdict
from pathlib import Path
def load_index(file: typing.TextIO) -> list[Path]:
return [
Path(line.rstrip())
for line in file
if not line.startswith('#EXT')
]
class Node:
__slots__ = 'children', 'dir', 'parent', 'to_drop', 'to_keep'
def __init__(self) -> None:
self.children: defaultdict[str, Node] = defaultdict(Node)
self.dir: Path | None = None
self.to_drop: set[Path] = set()
self.to_keep: set[Path] = set()
self.parent: Node | None = None
def add(self, paths: typing.Iterable[Path], keep: bool) -> None:
for path in paths:
# Traverse through children, get/create deepest leaf
leaf = self
for ancestor in path.parts[:-1]:
new_leaf = leaf.children[ancestor]
new_leaf.parent = leaf
leaf = new_leaf
# Add path to appropriate set
if keep:
leaf.to_keep.add(path)
elif path not in leaf.to_keep:
leaf.to_drop.add(path)
# Back-propagate path breadcrumbs
while leaf is not None and leaf.dir is None:
path = path.parent
leaf.dir = path
leaf = leaf.parent
def reduce(self) -> tuple[bool, set[Path]]:
# Get drop data for this node
has_kept = len(self.to_keep) > 0
to_drop = self.to_drop.copy()
# Get drop data recursed from child nodes
for child in self.children.values():
child_has_kept, child_drops = child.reduce()
if child_has_kept:
has_kept = True
to_drop |= child_drops
else:
to_drop.add(child.dir)
return has_kept, to_drop
def transform(old_file: typing.TextIO, new_file: typing.TextIO) -> set[Path]:
old_paths = load_index(old_file)
new_paths = load_index(new_file)
root = Node()
root.add(new_paths, keep=True)
root.add(old_paths, keep=False)
any_kept, top_drop = root.reduce()
return top_drop
def test() -> None:
with io.StringIO(
r'''\dir\who\tommy\song1
\dir\who\tommy\song2
\dir\who\tommy\song3
\dir\rolling\beggars\song4
\dir\rolling\beggars\song5
\dir\rolling\beggars\song6
\dir\who\next\song7
\dir\who\next\song8
\dir\who\next\song9
\dir\pink\dark\song10
\dir\pink\dark\song11
\dir\pink\dark\song12
\dir\bach\orch\fugue\song13
\dir\bach\orch\fugue\song14
\dir\bach\orch\fugue\song15
''') as old_file, io.StringIO(
r'''\dir\rolling\beggars\song4
\dir\rolling\beggars\song5
\dir\rolling\beggars\song6
\dir\pink\dark\song10
\dir\pink\dark\song11
\dir\yes\closer\song16
\dir\yes\closer\song17
\dir\yes\closer\song18
\dir\king\court\song2
\dir\king\court\song4
\dir\king\court\song6
''') as new_file:
top_drops = transform(old_file, new_file)
assert top_drops == {
Path('/dir/bach'),
Path('/dir/who'),
Path('/dir/pink/dark/song12'),
}
'''
Original output:
add_files ...
raw delete list ...
reduced delete list ['\\dir\\bach', '\\dir\\who', '\\dir\\pink\\dark\\song12']
delete files ['\\dir\\pink\\dark\\song12']
prune dirs ['\\dir\\bach', '\\dir\\who']
'''
def main() -> None:
parser = argparse.ArgumentParser(description='Produce a delete-set to deduplicate a directory')
parser.add_argument('old', type=open, help='File containing old file names')
parser.add_argument('new', type=open, help='File containing new file names')
args = parser.parse_args()
top_drops = transform(args.old, args.new)
print('\n'.join(str(p) for p in top_drops))
if __name__ == '__main__':
# test()
main()
\dir\pink\dark\song12
\dir\bach
\dir\who
shutil.rmtree(path)
... but I guess that would slow down so still not a viable option. 2) You could replace yourone_minus_two
by this:set(one) - set(two)
\$\endgroup\$