Message 147128 - Python tracker

➜

This issue tracker has been migrated to GitHub, and is currently read-only.
For more information, see the GitHub FAQs in Python's Developer Guide.

In-reply-to
Author	ncoghlan
Recipients	eric.araujo, ncoghlan, vstinner
Date	2011年11月06日.00:43:32
SpamBayes Score	1.3814392e-09
Marked as misclassified	No
Message-id	<1320540213.89.0.365463113335.issue13229@psf.upfronthosting.co.za>

Content
I should probably update that posted recipe to my latest version (which adds "excluded_files" and "excluded_dirs" parameters). However, since I've been dealing with remote filesystems where os.listdir() and os.stat() calls from the local machine aren't possible lately, I also think we may need to reconsider how this is structured and look at the idea of building a more effective pipeline model that permits more efficient modes of interaction. Let's take 'os.walk' as the base primitive - the basis of the pipeline will always be an iterator that produces 3-tuples of a base name, a list of subdirectories and a list of files. The filtering pipeline elements will require that the underlying walk include "topdown=True" and pay attention to changes in the subdirectory list. Then consider the following possible pipeline elements: def filter_dirs(walk_iter, include_filters, exclude_filters=()): def should_include(dirname): return any(fnmatch(dirname, include) for include in include_filters) def should_exclude(dirname): return any(fnmatch(dirname, include) for exclude in exclude_filters) for dirpath, subdirs, files in walk_iter: subdirs[:] = [subdir for subdir in subdirs if should_include(subdir) and not should_exclude(subdir)] yield dirpath, subdirs, files def filter_files(walk_iter, include_filters, exclude_filters=()): def should_include(dirname): return any(fnmatch(dirname, include) for include in include_filters) def should_exclude(dirname): return any(fnmatch(dirname, include) for exclude in exclude_filters) for dirpath, subdirs, files in walk_iter: files[:] = [fname for fname in files if should_include(fname) and not should_exclude(fname)] yield dirpath, subdirs, files def limit_depth(walk_iter, depth): if depth < 0: msg = "Depth limit greater than 0 ({!r} provided)" raise ValueError(msg.format(depth)) sep = os.sep for top, subdirs, files in walk_iter: yield top, subdirs, files initial_depth = top.count(sep) if depth == 0: subdirs[:] = [] break for dirpath, subdirs, 
files in walk_iter: yield dirpath, subdirs, files current_depth = dirpath.count(sep) - initial_depth if current_depth >= depth: subdirs[:] = [] def detect_symlink_loops(walk_iter, onloop=None): if onloop is None: def onloop(path): msg = "Symlink {!r} refers to a parent directory, skipping\n" sys.stderr.write(msg.format(path)) sys.stderr.flush() for top, subdirs, files in walk_iter: yield top, subdirs, files real_top = os.path.abspath(os.path.realpath(top)) break for dirpath, subdirs, files in walk_iter: if os.path.islink(dirpath): # We just descended into a directory via a symbolic link # Check if we're referring to a directory that is # a parent of our nominal directory relative = os.path.relpath(dirpath, top) nominal_path = os.path.join(real_top, relative) real_path = os.path.abspath(os.path.realpath(dirpath)) path_fragments = zip(nominal_path.split(sep), real_path.split(sep)) for nominal, real in path_fragments: if nominal != real: break else: if not onloop(dirpath): subdirs[:] = [] continue yield dirpath, subdirs, files And pipeline terminators: def walk_dirs(walk_iter): for dirpath, subdirs, files in walk_iter: yield dirpath def walk_files(walk_iter): for dirpath, subdirs, files in walk_iter: for fname in files: yield os.path.join(dirpath, fname) def walk_all(walk_iter): for dirpath, subdirs, files in walk_iter: yield dirpath for fname in files: yield os.path.join(dirpath, fname) The pipeline terminators could then be combined with ordinary iterable consumers like comprehensions: base_walk = detect_symlink_loops(os.walk(os.path.abspath(base_dir, followlinks=True))) depth_limited_walk = limit_depth(base_walk, 2) filtered_walk = filter_dirs(filter_files(depth_limited_walk, ".py"), ".pyp") tree_info = {path, os.stat(path) for path in walk_all(filtered_walk)}

Content

I should probably update that posted recipe to my latest version (which adds "excluded_files" and "excluded_dirs" parameters).
However, since I've lately been dealing with remote filesystems where os.listdir() and os.stat() calls from the local machine aren't possible, I also think we may need to reconsider how this is structured and look at the idea of building a more effective pipeline model that permits more efficient modes of interaction.
Let's take 'os.walk' as the base primitive - the basis of the pipeline will always be an iterator that produces 3-tuples of a directory path, a list of subdirectories and a list of files. The filtering pipeline elements require that the underlying walk be run with "topdown=True" and that it pay attention to in-place changes to the subdirectory list.
Then consider the following possible pipeline elements:
def filter_dirs(walk_iter, *include_filters, exclude_filters=()):
    """Prune the subdirectory lists of an ``os.walk``-style iterator.

    A subdirectory name is kept only if it matches at least one pattern in
    *include_filters* and no pattern in *exclude_filters*. Patterns are
    matched with ``fnmatch`` (expected to be in scope via
    ``from fnmatch import fnmatch``).

    The ``subdirs`` list of each entry is modified in place *before* the
    entry is yielded, so a top-down walk underneath will not descend into
    the removed directories.

    Args:
        walk_iter: Iterable of ``(dirpath, subdirs, files)`` 3-tuples.
        *include_filters: Patterns a subdirectory name must match.
            With no patterns given, every subdirectory is removed.
        exclude_filters: Iterable of patterns that veto a subdirectory.

    Yields:
        The same ``(dirpath, subdirs, files)`` tuples, with ``subdirs``
        filtered.
    """
    def should_include(dirname):
        return any(fnmatch(dirname, include) for include in include_filters)

    def should_exclude(dirname):
        # The pattern variable must be ``exclude``; the original referenced
        # ``include`` here, which is not defined in this scope.
        return any(fnmatch(dirname, exclude) for exclude in exclude_filters)

    for dirpath, subdirs, files in walk_iter:
        # Slice assignment keeps the same list object the walker holds.
        subdirs[:] = [subdir for subdir in subdirs
                      if should_include(subdir) and not should_exclude(subdir)]
        yield dirpath, subdirs, files
def filter_files(walk_iter, *include_filters, exclude_filters=()):
    """Filter the file lists of an ``os.walk``-style iterator.

    A file name is kept only if it matches at least one pattern in
    *include_filters* and no pattern in *exclude_filters*. Patterns are
    matched with ``fnmatch`` (expected to be in scope via
    ``from fnmatch import fnmatch``).

    The ``files`` list of each entry is modified in place before the entry
    is yielded; ``subdirs`` is passed through untouched.

    Args:
        walk_iter: Iterable of ``(dirpath, subdirs, files)`` 3-tuples.
        *include_filters: Patterns a file name must match. With no
            patterns given, every file is removed.
        exclude_filters: Iterable of patterns that veto a file.

    Yields:
        The same ``(dirpath, subdirs, files)`` tuples, with ``files``
        filtered.
    """
    def should_include(fname):
        return any(fnmatch(fname, include) for include in include_filters)

    def should_exclude(fname):
        # The pattern variable must be ``exclude``; the original referenced
        # ``include`` here, which is not defined in this scope.
        return any(fnmatch(fname, exclude) for exclude in exclude_filters)

    for dirpath, subdirs, files in walk_iter:
        files[:] = [fname for fname in files
                    if should_include(fname) and not should_exclude(fname)]
        yield dirpath, subdirs, files
def limit_depth(walk_iter, depth):
    """Stop an ``os.walk``-style iterator from descending below *depth*.

    The first entry produced by *walk_iter* is treated as the top of the
    tree (depth 0). Every entry is yielded unchanged; once an entry at or
    beyond *depth* has been yielded, its ``subdirs`` list is emptied in
    place so that a top-down walk underneath does not descend further.
    Pruning happens *after* the yield, so the consumer still sees the
    subdirectory names of the deepest level.

    Depth is measured by counting ``os.sep`` in each directory path
    relative to the top path, ignoring trailing separators.

    Args:
        walk_iter: Iterable of ``(dirpath, subdirs, files)`` 3-tuples.
        depth: Maximum number of levels below the top to descend; 0 means
            only the top directory itself.

    Yields:
        The ``(dirpath, subdirs, files)`` tuples from *walk_iter*.

    Raises:
        ValueError: If *depth* is negative. Because this is a generator,
            the error is raised when iteration starts.
    """
    if depth < 0:
        msg = "Depth limit must be at least 0 ({!r} provided)"
        raise ValueError(msg.format(depth))
    sep = os.sep

    def depth_of(path):
        # Strip trailing separators so "top/" and "/" are not counted one
        # level deeper than they are relative to their children.
        return path.rstrip(sep).count(sep)

    # Use a single shared iterator so the second loop resumes after the
    # top entry even if the caller passed a re-iterable such as a list.
    walk_iter = iter(walk_iter)
    for top, subdirs, files in walk_iter:
        yield top, subdirs, files
        initial_depth = depth_of(top)
        if depth == 0:
            subdirs[:] = []
        break  # only the first entry is the top of the tree
    for dirpath, subdirs, files in walk_iter:
        yield dirpath, subdirs, files
        current_depth = depth_of(dirpath) - initial_depth
        if current_depth >= depth:
            subdirs[:] = []
def detect_symlink_loops(walk_iter, onloop=None):
    """Skip symlinked directories that point back at one of their parents.

    Intended to wrap a top-down walk that follows symlinks. The first
    entry from *walk_iter* is taken as the top of the tree and yielded
    as-is. For each later entry whose path is a symbolic link, the link's
    real path is compared, component by component, with its nominal path
    (the real top joined with the path relative to the top). If all
    compared components agree, the link is treated as a loop and
    ``onloop(dirpath)`` is called: a false result empties ``subdirs`` in
    place and suppresses the entry, a true result lets it through
    unchanged.

    NOTE: because the comparison uses the nominal path, a loop reached
    through an earlier, non-looping symlink may not be detected.

    Args:
        walk_iter: Iterable of ``(dirpath, subdirs, files)`` 3-tuples.
        onloop: Optional callable taking the looping path. The default
            writes a message to ``sys.stderr`` and returns ``None``, so
            the looping directory is skipped.

    Yields:
        The ``(dirpath, subdirs, files)`` tuples that were not skipped.
    """
    if onloop is None:
        def onloop(path):
            msg = "Symlink {!r} refers to a parent directory, skipping\n"
            sys.stderr.write(msg.format(path))
            sys.stderr.flush()
    # The original used ``sep`` without ever defining it.
    sep = os.sep
    # Use a single shared iterator so the second loop resumes after the
    # top entry even if the caller passed a re-iterable such as a list.
    walk_iter = iter(walk_iter)
    for top, subdirs, files in walk_iter:
        yield top, subdirs, files
        real_top = os.path.abspath(os.path.realpath(top))
        break  # only the first entry is the top of the tree
    for dirpath, subdirs, files in walk_iter:
        if os.path.islink(dirpath):
            # We just descended into a directory via a symbolic link.
            # Check if we're referring to a directory that is
            # a parent of our nominal directory.
            relative = os.path.relpath(dirpath, top)
            nominal_path = os.path.join(real_top, relative)
            real_path = os.path.abspath(os.path.realpath(dirpath))
            path_fragments = zip(nominal_path.split(sep), real_path.split(sep))
            # zip() stops at the shorter path, so this is a prefix test.
            is_loop = all(nominal == real for nominal, real in path_fragments)
            if is_loop and not onloop(dirpath):
                subdirs[:] = []
                continue
        yield dirpath, subdirs, files
And pipeline terminators:
def walk_dirs(walk_iter):
    """Yield only the directory path of each entry in a walk.

    Args:
        walk_iter: Iterable of ``(dirpath, subdirs, files)`` 3-tuples.

    Yields:
        Each ``dirpath``, in the order produced by *walk_iter*.
    """
    for dirpath, subdirs, files in walk_iter:
        yield dirpath
def walk_files(walk_iter):
    """Yield the full path of every file in a walk.

    Args:
        walk_iter: Iterable of ``(dirpath, subdirs, files)`` 3-tuples.

    Yields:
        ``os.path.join(dirpath, fname)`` for each file name, grouped by
        directory in the order produced by *walk_iter*.
    """
    for dirpath, subdirs, files in walk_iter:
        for fname in files:
            yield os.path.join(dirpath, fname)
def walk_all(walk_iter):
    """Yield every directory path and every file path in a walk.

    Args:
        walk_iter: Iterable of ``(dirpath, subdirs, files)`` 3-tuples.

    Yields:
        For each entry, the ``dirpath`` itself followed by
        ``os.path.join(dirpath, fname)`` for each of its files.
    """
    for dirpath, subdirs, files in walk_iter:
        yield dirpath
        for fname in files:
            yield os.path.join(dirpath, fname)
The pipeline terminators could then be combined with ordinary iterable consumers like comprehensions:
 # followlinks is an argument of os.walk(), not os.path.abspath().
 base_walk = detect_symlink_loops(os.walk(os.path.abspath(base_dir), followlinks=True))
 depth_limited_walk = limit_depth(base_walk, 2)
 filtered_walk = filter_dirs(filter_files(depth_limited_walk, "*.py"), "*.pyp")
 # Dict comprehension mapping each surviving path to its stat result.
 tree_info = {path: os.stat(path) for path in walk_all(filtered_walk)}

History
Date	User	Action	Args
2011年11月06日 00:43:33	ncoghlan	set	recipients: + ncoghlan, vstinner, eric.araujo
2011年11月06日 00:43:33	ncoghlan	set	messageid: <1320540213.89.0.365463113335.issue13229@psf.upfronthosting.co.za>
2011年11月06日 00:43:33	ncoghlan	link	issue13229 messages
2011年11月06日 00:43:32	ncoghlan	create

homepage