Python files sync

Question 1

I'm trying to do a one-way directory sync. Given a list of existing files in a dir, I'd like to make the files in the dir equal a new list of files. There will be subdirs under the dir. Because operating system calls are expensive, I'd rather minimize the number needed.

It's easy to just delete each file in the existing list but not on the new list, but that could leave empty subdirs. I could test for empty subdirs with OS calls, but as noted I'd like to avoid that. Similarly, I'd prefer removing dirs to first removing each file in the dir, then removing the empty dir.

I'm just operating on file names, not checking whether two files with the same name are the same or actually copying or deleting files or directories.

''' 
Input:
- list of existing files
- revised list of files
Output:
- lists to be used to transform first list to second list
-- list of files to be added to existing dir
-- list of directories to be pruned
-- list of files to be deleted
'''
import os
import sys
def file_to_list(file):
 return [x.strip() for x in open(file, 'r') if not x.startswith('#EXT')]
def one_minus_two(one, two):
 return [x for x in one if x not in set(two)]
def reduce_dirs(delete_files, new_list):
 new_delete = []
 for file in delete_files:
 parts = file.split('\\')
 sub = ''
 for i in range(len(parts)):
 sub = os.path.join(sub, parts[i])
 if sub == '':
 sub = '\\'
 count = 0
 for song in new_list:
 if song.startswith(sub):
 count += 1
 break
 if count == 0:
 new_delete.append(sub)
 break
 return list(set(new_delete))
def reduce_files(remove_dirs, delete_files):
 rd = []
 rf = []
 for dir in remove_dirs:
 if dir in delete_files:
 rf.append(dir)
 else:
 rd.append(dir)
 return rf, rd
def main(): 
 old_file = sys.argv[1]
 new_file = sys.argv[2]
 old_list = file_to_list(old_file)
 new_list = file_to_list(new_file)
 add_files = one_minus_two(new_list, old_list)
 print 'add_files', add_files
 delete_files = one_minus_two(old_list, new_list)
 print '\nraw delete list', delete_files # intermediate result
 remove_items = reduce_dirs(delete_files, new_list)
 print '\nreduced delete list', remove_items # intermediate result
 rf, rd = reduce_files(remove_items, delete_files)
 print '\ndelete files', rf
 print '\nprune dirs', rd
if __name__ == '__main__': 
 main()

Sample list of existing files (old_files):

\dir\who\tommy\song1
\dir\who\tommy\song2
\dir\who\tommy\song3
\dir\rolling\beggars\song4
\dir\rolling\beggars\song5
\dir\rolling\beggars\song6
\dir\who\next\song7
\dir\who\next\song8
\dir\who\next\song9
\dir\pink\dark\song10
\dir\pink\dark\song11
\dir\pink\dark\song12
\dir\bach\orch\fugue\song13
\dir\bach\orch\fugue\song14
\dir\bach\orch\fugue\song15

Sample list of new_files:

\dir\rolling\beggars\song4
\dir\rolling\beggars\song5
\dir\rolling\beggars\song6
\dir\pink\dark\song10
\dir\pink\dark\song11
\dir\yes\closer\song16
\dir\yes\closer\song17
\dir\yes\closer\song18
\dir\king\court\song2
\dir\king\court\song4
\dir\king\court\song6

There are likely cases I'm ignoring with these simple examples.

I have the feeling I'm reinventing the wheel here.

Question 2

Some comments: 1) Why are you afraid of leaving empty subdirs? You could just remove all the subdirectories with shutil.rmtree(path)... but I guess that would slow down so still not a viable option. 2) You could replace your one_minus_two by this: set(one) - set(two)

Question 3

This is just rsync -a --delete.

Question 4

Because operating system calls are expensive, I'd rather minimize the number needed

That's fine, and a great idea especially for dry-runs and testing.

It's easy to just delete each file in the existing list but not on the new list, but that could leave empty subdirs. I could test for empty subdirs with OS calls, but as noted I'd like to avoid that.

There is some risk here. What happens if a directory designated for deletion has files that did not appear in the new-list? I think at the least, the old list should be based on the true contents of the filesystem (outside of a unit test context). However, I don't demonstrate this.

not [...] actually copying or deleting files or directories.

The program implies only deletion, not copying.

Otherwise:

Convert from Python 2 to Python 3.

Add unit tests.

Don't hard-code for backslash path separators; use pathlib properly instead.

I don't think there's value in all of your prints, only the final output. If you want to add optional debugging content, use logging.

Use argparse since this is supposed to be a command-line utility.

About your algorithm, things I like: it's non-recursive. Things I don't like as much: one_minus_two is just reimplementing set subtraction; and it relies too much on string manipulation. It also differentiates between files and directories when I don't see a lot of value in doing that. count += 1 should terminate that loop.

As an alternative, I demonstrate a tree recursion algorithm. It's not clearly better or worse, but I can understand how it works better.

import argparse
import io
import typing
from collections import defaultdict
from pathlib import Path
def load_index(file: typing.TextIO) -> list[Path]:
 return [
 Path(line.rstrip())
 for line in file
 if not line.startswith('#EXT')
 ]
class Node:
 __slots__ = 'children', 'dir', 'parent', 'to_drop', 'to_keep'
 def __init__(self) -> None:
 self.children: defaultdict[str, Node] = defaultdict(Node)
 self.dir: Path | None = None
 self.to_drop: set[Path] = set()
 self.to_keep: set[Path] = set()
 self.parent: Node | None = None
 def add(self, paths: typing.Iterable[Path], keep: bool) -> None:
 for path in paths:
 # Traverse through children, get/create deepest leaf
 leaf = self
 for ancestor in path.parts[:-1]:
 new_leaf = leaf.children[ancestor]
 new_leaf.parent = leaf
 leaf = new_leaf
 # Add path to appropriate set
 if keep:
 leaf.to_keep.add(path)
 elif path not in leaf.to_keep:
 leaf.to_drop.add(path)
 # Back-propagate path breadcrumbs
 while leaf is not None and leaf.dir is None:
 path = path.parent
 leaf.dir = path
 leaf = leaf.parent
 def reduce(self) -> tuple[bool, set[Path]]:
 # Get drop data for this node
 has_kept = len(self.to_keep) > 0
 to_drop = self.to_drop.copy()
 # Get drop data recursed from child nodes
 for child in self.children.values():
 child_has_kept, child_drops = child.reduce()
 if child_has_kept:
 has_kept = True
 to_drop |= child_drops
 else:
 to_drop.add(child.dir)
 return has_kept, to_drop
def transform(old_file: typing.TextIO, new_file: typing.TextIO) -> set[Path]:
 old_paths = load_index(old_file)
 new_paths = load_index(new_file)
 root = Node()
 root.add(new_paths, keep=True)
 root.add(old_paths, keep=False)
 any_kept, top_drop = root.reduce()
 return top_drop
def test() -> None:
 with io.StringIO(
r'''\dir\who\tommy\song1
\dir\who\tommy\song2
\dir\who\tommy\song3
\dir\rolling\beggars\song4
\dir\rolling\beggars\song5
\dir\rolling\beggars\song6
\dir\who\next\song7
\dir\who\next\song8
\dir\who\next\song9
\dir\pink\dark\song10
\dir\pink\dark\song11
\dir\pink\dark\song12
\dir\bach\orch\fugue\song13
\dir\bach\orch\fugue\song14
\dir\bach\orch\fugue\song15
''') as old_file, io.StringIO(
r'''\dir\rolling\beggars\song4
\dir\rolling\beggars\song5
\dir\rolling\beggars\song6
\dir\pink\dark\song10
\dir\pink\dark\song11
\dir\yes\closer\song16
\dir\yes\closer\song17
\dir\yes\closer\song18
\dir\king\court\song2
\dir\king\court\song4
\dir\king\court\song6
''') as new_file:
 top_drops = transform(old_file, new_file)
 assert top_drops == {
 Path('/dir/bach'),
 Path('/dir/who'),
 Path('/dir/pink/dark/song12'),
 }
 '''
 Original output:
 
 add_files ...
 raw delete list ...
 reduced delete list ['\\dir\\bach', '\\dir\\who', '\\dir\\pink\\dark\\song12']
 delete files ['\\dir\\pink\\dark\\song12']
 prune dirs ['\\dir\\bach', '\\dir\\who']
 '''
def main() -> None:
 parser = argparse.ArgumentParser(description='Produce a delete-set to deduplicate a directory')
 parser.add_argument('old', type=open, help='File containing old file names')
 parser.add_argument('new', type=open, help='File containing new file names')
 args = parser.parse_args()
 top_drops = transform(args.old, args.new)
 print('\n'.join(str(p) for p in top_drops))
if __name__ == '__main__':
 # test()
 main()

\dir\pink\dark\song12
\dir\bach
\dir\who

Reinderien 71.2k5 gold badges76 silver badges257 bronze badges · Answer 1 · 2025-01-11 16:44:49Z

Because operating system calls are expensive, I'd rather minimize the number needed

That's fine, and a great idea especially for dry-runs and testing.

It's easy to just delete each file in the existing list but not on the new list, but that could leave empty subdirs. I could test for empty subdirs with OS calls, but as noted I'd like to avoid that.

There is some risk here. What happens if a directory designated for deletion has files that did not appear in the new-list? I think at the least, the old list should be based on the true contents of the filesystem (outside of a unit test context). However, I don't demonstrate this.

not [...] actually copying or deleting files or directories.

The program implies only deletion, not copying.

Otherwise:

Convert from Python 2 to Python 3.

Add unit tests.

Don't hard-code for backslash path separators; use pathlib properly instead.

I don't think there's value in all of your prints, only the final output. If you want to add optional debugging content, use logging.

Use argparse since this is supposed to be a command-line utility.

About your algorithm, things I like: it's non-recursive. Things I don't like as much: one_minus_two is just reimplementing set subtraction; and it relies too much on string manipulation. It also differentiates between files and directories when I don't see a lot of value in doing that. count += 1 should terminate that loop.

As an alternative, I demonstrate a tree recursion algorithm. It's not clearly better or worse, but I can understand how it works better.

import argparse
import io
import typing
from collections import defaultdict
from pathlib import Path
def load_index(file: typing.TextIO) -> list[Path]:
 return [
 Path(line.rstrip())
 for line in file
 if not line.startswith('#EXT')
 ]
class Node:
 __slots__ = 'children', 'dir', 'parent', 'to_drop', 'to_keep'
 def __init__(self) -> None:
 self.children: defaultdict[str, Node] = defaultdict(Node)
 self.dir: Path | None = None
 self.to_drop: set[Path] = set()
 self.to_keep: set[Path] = set()
 self.parent: Node | None = None
 def add(self, paths: typing.Iterable[Path], keep: bool) -> None:
 for path in paths:
 # Traverse through children, get/create deepest leaf
 leaf = self
 for ancestor in path.parts[:-1]:
 new_leaf = leaf.children[ancestor]
 new_leaf.parent = leaf
 leaf = new_leaf
 # Add path to appropriate set
 if keep:
 leaf.to_keep.add(path)
 elif path not in leaf.to_keep:
 leaf.to_drop.add(path)
 # Back-propagate path breadcrumbs
 while leaf is not None and leaf.dir is None:
 path = path.parent
 leaf.dir = path
 leaf = leaf.parent
 def reduce(self) -> tuple[bool, set[Path]]:
 # Get drop data for this node
 has_kept = len(self.to_keep) > 0
 to_drop = self.to_drop.copy()
 # Get drop data recursed from child nodes
 for child in self.children.values():
 child_has_kept, child_drops = child.reduce()
 if child_has_kept:
 has_kept = True
 to_drop |= child_drops
 else:
 to_drop.add(child.dir)
 return has_kept, to_drop
def transform(old_file: typing.TextIO, new_file: typing.TextIO) -> set[Path]:
 old_paths = load_index(old_file)
 new_paths = load_index(new_file)
 root = Node()
 root.add(new_paths, keep=True)
 root.add(old_paths, keep=False)
 any_kept, top_drop = root.reduce()
 return top_drop
def test() -> None:
 with io.StringIO(
r'''\dir\who\tommy\song1
\dir\who\tommy\song2
\dir\who\tommy\song3
\dir\rolling\beggars\song4
\dir\rolling\beggars\song5
\dir\rolling\beggars\song6
\dir\who\next\song7
\dir\who\next\song8
\dir\who\next\song9
\dir\pink\dark\song10
\dir\pink\dark\song11
\dir\pink\dark\song12
\dir\bach\orch\fugue\song13
\dir\bach\orch\fugue\song14
\dir\bach\orch\fugue\song15
''') as old_file, io.StringIO(
r'''\dir\rolling\beggars\song4
\dir\rolling\beggars\song5
\dir\rolling\beggars\song6
\dir\pink\dark\song10
\dir\pink\dark\song11
\dir\yes\closer\song16
\dir\yes\closer\song17
\dir\yes\closer\song18
\dir\king\court\song2
\dir\king\court\song4
\dir\king\court\song6
''') as new_file:
 top_drops = transform(old_file, new_file)
 assert top_drops == {
 Path('/dir/bach'),
 Path('/dir/who'),
 Path('/dir/pink/dark/song12'),
 }
 '''
 Original output:
 
 add_files ...
 raw delete list ...
 reduced delete list ['\\dir\\bach', '\\dir\\who', '\\dir\\pink\\dark\\song12']
 delete files ['\\dir\\pink\\dark\\song12']
 prune dirs ['\\dir\\bach', '\\dir\\who']
 '''
def main() -> None:
 parser = argparse.ArgumentParser(description='Produce a delete-set to deduplicate a directory')
 parser.add_argument('old', type=open, help='File containing old file names')
 parser.add_argument('new', type=open, help='File containing new file names')
 args = parser.parse_args()
 top_drops = transform(args.old, args.new)
 print('\n'.join(str(p) for p in top_drops))
if __name__ == '__main__':
 # test()
 main()

\dir\pink\dark\song12
\dir\bach
\dir\who

Stack Exchange Network

Python files sync

1 Answer 1

You must log in to answer this question.

Hot Network Questions

Python files sync

1 Answer 1

You must log in to answer this question.

Related

Hot Network Questions