[Python-checkins] r46987 - sandbox/trunk/Doc/func-example.py

andrew.kuchling python-checkins at python.org
Fri Jun 16 02:42:16 CEST 2006


Author: andrew.kuchling
Date: Fri Jun 16 02:42:15 2006
New Revision: 46987
Added:
 sandbox/trunk/Doc/func-example.py (contents, props changed)
Log:
Add larger example for functional HOWTO; I'll dissect it in a section to be written
Added: sandbox/trunk/Doc/func-example.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/Doc/func-example.py	Fri Jun 16 02:42:15 2006
@@ -0,0 +1,199 @@
+#!/Users/andrewk/source/p/python/python.exe
+
+# Example: indexer that records info about all the files in a directory tree.
+
+import os, sys
+import itertools, functools
+import cPickle
+
+#
+# Management of the list of indexing functions.
+#
+
+# Registry of indexing functions, filled in by register().  Each key is a
+# filename extension including its leading dot (e.g. '.txt'); each value is
+# the function registered for that extension.
+_indexers = {}
+
+def register(ext, func):
+ """Registers the function 'func'
+
+ >>> is_indexable_filename('foo.jpg')
+ False
+ >>> register('jpg', None)
+ >>> is_indexable_filename('foo.jpg')
+ True
+ >>> _indexers.clear()
+ """
+ _indexers['.' + ext] = func
+
+def is_indexable_filename (fn):
+ """Returns true if there's an indexer available for the given filename.
+
+ >>> register('txt', None)
+ >>> is_indexable_filename('foo.txt')
+ True
+ >>> is_indexable_filename('foo.jpg')
+ False
+ """
+ base, ext = os.path.splitext(fn)
+ return _indexers.has_key(ext)
+
+def is_ignorable_directory (dirname):
+ """Return true if the directory with the given name shouldn't be scanned.
+
+ >>> is_ignorable_directory('.svn')
+ True
+ >>> is_ignorable_directory('text')
+ False
+ """
+ return (dirname in ('.svn', 'CVS'))
+
+def remove_punctuation (word):
+ """Removes leading and trailing punctuation characters from a word.
+ May return the empty string.
+
+ >>> remove_punctuation('test')
+ 'test'
+ >>> remove_punctuation('comma,')
+ 'comma'
+ >>> remove_punctuation('()')
+ ''
+ """
+ word = word.strip(',.?!"\'()[]#*\\')
+ return word
+
+#
+# Functions for indexing directories and files
+#
+
+def index (*args):
+ """Index the directory trees rooted at the specified paths.
+ Can take any number of arguments.
+ Returns the index data structure.
+ """
+ idx = load_index()
+ for path in args:
+ index_tree(idx, path)
+ save_index(idx)
+ return idx
+
+def index_tree (idx, path):
+ """Index the contents of the files in the directory tree rooted at 'path'.
+ """
+ for dirpath, dirnames, filenames in os.walk(path):
+ # Remove ignorable directories
+ for d in list(dirnames):
+ if is_ignorable_directory(d):
+ dirnames.remove(d)
+
+ # Discard uninteresting filenames
+ filenames = [fn for fn in filenames
+ if is_indexable_filename(fn)]
+
+ # Index files 
+ for fn in filenames:
+ full_path = os.path.join(dirpath, fn)
+ index_file(idx, full_path)
+
+def index_file (idx, path):
+ """Index the contents of a single file. It's assumed that
+ an indexing function will be found for the file's type.
+ """
+ assert is_indexable_filename(path)
+ base, ext = os.path.splitext(path)
+ 
+ indexer = _indexers[ext]
+ record_func = functools.partial(record, idx)
+ indexer(path, record_func)
+
+
+#
+# Index data structure
+#
+# The index is a big dictionary of dictionaries:
+# { word => { (filename, line number) => 1 } }
+# The inner dictionary serves as a set of the places where the word occurs.
+# 
+
+def lookup (idx, word):
+ """Return an iterator over the files and lines containing the requested
+ word.
+ """
+ for file, line in idx.get(word, []):
+ yield (file, line)
+ 
+def record (idx, word, path, line=None):
+ """Add an index entry for the given word, using the specified path
+ and line number. The line number can be None.
+
+ >>> record({}, 'word', '/path', None)
+ {'word': {('/path', None): 1}}
+ >>> record({}, 'word', '/path', 42)
+ {'word': {('/path', 42): 1}}
+ """
+ d = idx.setdefault(word, {})
+ key = (path, line)
+ if key not in d:
+ d[key] = 1
+ return idx
+
+def load_index ():
+ """Read index from disk.
+ """
+ index_filename = '/tmp/index'
+ if os.path.exists(index_filename):
+ input = open(index_filename, 'rb')
+ idx = cPickle.load(input)
+ input.close()
+ else:
+ idx = {}
+ 
+ return idx
+
+def save_index (idx):
+ """Write index to disk.
+ """
+ output = open('/tmp/index', 'wb')
+ cPickle.dump(idx, output, -1)
+ output.close()
+ 
+ import pprint
+ print len(idx), 'words in index'
+ #print idx
+ ##pprint.pprint(idx)
+
+#
+# File analysis functions
+#
+
+def text_inspector (input_file, record_func):
+ line_num = 1
+ for line in open(input_file, 'r'):
+ for word in line.split():
+ word = remove_punctuation(word.lower())
+ if word != '':
+ record_func(word, input_file, line_num)
+ line_num += 1
+
+
+if __name__ == '__main__':
+ if '-t' in sys.argv[1:]:
+ import doctest
+ doctest.testmod()
+ raise SystemExit
+
+ register('txt', text_inspector)
+ #register('jpg', jpg_inspector)
+ #register('gif', gif_inspector)
+
+ idx = index(*sys.argv[1:])
+
+ # Look up a word
+ for filename, line in lookup(idx, 'the'):
+ print filename, line
+ 
+
+# Exercises:
+# * Matching lines are output in random order. Output them in sorted order.
+# [5] (One-line change)
+# * Use itertools.groupby() for better output, i.e. file.txt: 1 3 4 5
+# [10]
+# * Remove file entries before adding new ones. [15]
+


More information about the Python-checkins mailing list

AltStyle によって変換されたページ (->オリジナル) /