#!/usr/bin/env python
import re
import nltk.corpus
def tokenize(text, **kw):
"""
Break down text into a set of unique lower case words.
"""
plain_words = nltk.corpus.abc.words()
stop_words = nltk.corpus.stopwords.words()
# Defaults
kw.setdefault('min_len', 4)
kw.setdefault('max_len', 0)
kw.setdefault('ignore_list', [])
kw['ignore_list'] += stop_words
ok_length = lambda token: [len(token) >= kw['min_len']
and kw['max_len'] <= kw['max_len']][0]
tokens = set([token.lower()
for token in re.findall('([A-z]+)+', text)
if ok_length(token) and
token not in kw['ignore_list']]).intersection(plain_words)
return tokens
## TESTING tokenize
TEXT = open('/bin/bash', 'r').read()
print tokenize(TEXT,
min_len=4,
max_len=0,
ignore_list=['code'])
My concerns:
- Is
tokenize()
too complex as one function? - Should I replace the list comprehensions?
- Does this regex
'([A-z]+)+'
need improving? - How could I make this code more idiomatic?
- Have I overlooked any faults in any of the logic?
Example output:
11:26 PM$ python Tokenize.py
set(['replacing', 'default', 'all', 'forget', 'chain', 'skip', 'global', 'dollar', 'splitting', 'existing', 'four', 'executing', 'go', 'follow', 'expressions', 'activates', 'saved', 'children', 'causes', 'row', 'whose', 'tv', 'graph', 'discard', 'send', 'environment', 'to', 'topic', 'program', 'marks', 'include', 'sent', 'allocate', 'division', 'random', 'slash', 'dynamic', 'reserved', 'removing', 'manipulated', 'every', 'nesting', 'decide', 'entries', 'locked', 'syntax', 'exact', 'condition', 'entire', 'redistribute', 'magic', 'exits', 'level', 'turns', 'array', 'exec', 'list', 'fewer', 'try', 'mm', 'quick', 'refer', 'upper', 'unexpected', 'force', 'portable', 'be', 'obsolete', 'sign', 'jump', 'consists', 'second', 'displays', 'insertion', 'pass', 'gm', 'even', 'index', 'errors', 'adds', 'sub', 'directive', 'near', 'supplied', 'current', 'seconds', 'waiting', 'version', 'new', 'movement', 'redirect', 'full', 'simultaneously', 'exchange', 'respectively', 'error', 'commercial', 'equals', 'reported', 'objects', 'let', 'undo', 'groups', 'erm', 'active', 'path', '[', 'diagnostic', 'appears', 'change', 'wait', 'digits', 'great', 'copyright', 'coerced', 'handlers', 'items', 'changed', 'allows', 'reports', 'ignoring', 'amount', 'resulting', 'menu', 'usually', 'history', 'makes', 'exited', 'missing', 'composed', 'named', 'via', 'useful', 'extra', 'prefer', 'logical', 'replace', 'visible', 'names', 'apply', 'unit', 'use', 'takes', 'working', 'nine', 'escapes', 'two', 'next', 'r', 'duplicate', 'handler', 'call', 'memory', 'scope', 'type', 'until', 'more', 'separated', 'successful', 'initial', 'operators', 'tested', 'flag', 'controlling', 'encountered', 'disabling', 'must', 'me', 'escaped', 'none', 'te', 'word', 'err', 'indicates', 'this', 'loops', 'work', 'mi', 'modified', 'abort', 'values', 'can', 'socket', 'following', 'making', 'closing', 'my', 'example', 'performed', 'control', 'del', 'prompt', 'links', 'give', 'process', 'lock', 'functions', 'share', 'accept', 'trap', 'high', 'effectively', 'tag', 'numbers', 'allowed', 'scheduling', 'counting', 'audible', 'information', 'rather', 'means', 'j', 'write', 'how', 'silent', 'instead', 'profile', 'map', 'fr', 'blocks', 'description', 'may', 'max', 'resumes', 'tries', 'disable', 'coming', 'date', 'horizontal', 'law', 'data', 'types', 'fo', 'a', 'ambiguous', 'short', 'physical', 'remember', 'third', 'whenever', 'maybe', 'lines', 'bugs', 'element', 'provide', 'expression', 'allow', 'decreasing', 'scroll', 'subsequently', 'operate', 'order', 'se', 'feed', 'breaking', 'interpretation', 'help', 'disables', 'move', 'displayed', 'interpreted', 'disabled', 'timing', 'suspend', 'un', 'differs', 'interpreter', 'still', 'pointer', 'positional', 'style', 'le', 'group', 'monitor', 'curly', 'shifts', 'lo', 'll', 'detected', 'lu', 'systems', 'listing', 'mail', 'hidden', 'main', 'pending', 'split', 'non', 'return', 'greater', 'output', 'matches', 'auto', 'runs', 'number', 'break', 'internally', 'blink', 'killed', 'matched', 'term', 'name', 'ifs', 'always', 'revert', 'identified', 'privileged', 'possibilities', 'applied', 'token', 'inequality', 'stopped', 'mode', 'arrow', 'each', 'found', 'cc', 'reset', 'preceded', 'square', 'invoked', 'generation', 'ed', 'chunk', 'hard', 'frames', 'expect', 'exceeded', 'eu', 'et', 'operation', 'bay', 'event', 'special', 'intended', 'large', 'shown', 'network', 'space', 'restricted', 'since', 'preserve', 'unknown', 'looking', 're', 'acting', 'flushing', 'exporting', 'print', 'got', 'rn', 'cause', 'occurs', 'common', 'foundation', 'turning', 'resume', 'free', 'standard', 'indices', 'base', 'execute', 'put', 'org', 'wanted', 'beginning', 'l', 'software', 'resumed', 'definition', 'g', 'created', 'locations', 'retrieving', 'messages', 'times', 'creates', 'turn', 'length', 'place', 'w', 'assumed', 'timed', 'onto', 'assign', 'first', 'origin', 'already', 'succeeds', 'omitted', 'variables', 'symbolic', 'primary', 'owned', 'one', 'restrict', 'hook', 'done', 'notify', 'suspended', 'blank', 'reached', 'message', 'open', 'braces', 'size', 'given', 'checked', 'exists', 'service', 'redirection', 'meaningful', 'top', 'behaves', 'accent', 'system', 'construct', 'priority', 'indicate', 'returns', 'listed', 'passed', 'typing', 'white', 'final', 'gives', 'shell', 'option', 'trapped', 'ch', 'completed', 'exactly', 'lists', 'copy', 'completes', 'specify', 'character', 'begins', 'b', 'target', 'quantum', 'instruction', 'enabled', 'depends', 'i', 'determined', 'bind', 'enables', 'declare', 'interactive', 'and', 'files', 'false', 'topics', 'turned', 'argument', 'dash', 'width', 'need', 'seen', 'any', 'contents', 'forced', 'zero', 'depending', 'self', 'note', 'also', 'internal', 'build', 'indexed', 'destroy', 'copied', 'brace', 'begin', 'added', 'unless', 'trace', 'normal', 'buffer', 'object', 'leave', 'regular', 'eight', 'printed', 'letter', 'termination', 'nothing', 'alpha', 'segment', 'associative', 'grave', 'appear', 'kg', 'foreground', 'clear', 'later', 'm', 'km', 'looked', 'bracket', 'keywords', 'pattern', 'normally', 'notion', 'selection', 'show', 'text', 'supported', 'brief', 'session', 'beg', 'conditional', 'find', 'completion', 'access', 'based', 'quoted', 'parameters', 'implementation', 'true', 'specified', 'assertion', 'controls', 'terminal', 'failed', 'only', 'inherited', 'override', 'query', 'local', 'columns', 'do', 'specifications', 'invoke', 'get', 'convert', 'de', 'stop', 'da', 'cannot', 'negative', 'words', 'reply', 'report', 'du', 'procedures', 'sorts', 'secondary', 'processes', 'resource', 'horizontally', 'fields', 'remove', 'calling', 'arrays', 'bad', 'processed', 'contain', 'release', 'x', 'fixed', 'automatic', 'flagged', 'ignored', 'set', 'dump', 'frame', 'prints', 'maximum', 'relative', 'see', 'result', 'successive', 'sequences', 'fails', 'evaluation', 'vertical', 'placed', 'ways', 'subsequent', 'currently', 'written', 'protected', 'neither', 'reading', 'conditions', 'checks', 'available', 'suppresses', 'jobs', 'parent', 'opening', 'modify', 'screen', 'sole', 'transpose', 'disallow', 'nd', 'job', 'succeed', 'selectively', 'key', 'interface', 'printing', 'optional', 'valid', 'hits', 'last', 'reverse', 'limits', 'many', 'region', 'la', 'according', 'minus', 'etc', 's', 'context', 'attributes', 'delete', 'whole', 'botched', 'otherwise', 'load', 'pre', 'permitted', 'co', 'extent', 'point', 'simple', 'effective', 'period', 'pop', 'cz', 'simply', 'unsuccessful', 'table', 'allocated', 'indefinite', 'suppressing', 'described', 'duo', 'addition', 'shells', 'create', 'three', 'mark', 'pc', 'treat', 'expected', 'entered', 'empty', 'define', 'generating', 'enable', 'corresponding', 'suppress', 'sufficient', 'search', 'else', 'child', 'an', 'assigning', 'present', '^', 'case', 'handling', 'license', 'these', 'plain', 'expanded', 'examine', 'value', 'n', 'while', 'replaced', 'behavior', 'shift', 'evaluates', 'di', 'property', 'precede', 'loop', 'seven', 've', 'resident', 'is', 'dumped', 'binding', 'it', 'equal', 'vu', 'in', 'ie', 'if', 'binary', 'containing', 'perform', 'make', 'attribute', 'member', 'read', 'arguments', 'freed', 'modification', 'document', 'events', 'resources', 'status', 'used', 'temporary', 'receives', 'keys', 'reporting', 'upon', 'effect', 'alert', 'action', 'running', 'levels', 'uses', 'user', 'characters', 'stack', 'expand', 'recent', 'lower', 'older', 'shared', 'changes', 'well', 'spent', 'options', 'patterns', 'without', 'flags', 'sets', 'y', 'position', 'the', 'left', 'comment', 'newest', 'sourced', 'less', 'percent', 'obtain', 'actions', 'assigned', 'stored', 'kill', 'immediately', 'followed', 'alternative', 'rotates', 'previous', 'adding', 'loading', 'generator', 'grouped', 'bell', 'guaranteed', 'except', 'signals', 'source', 'add', 'setting', 'combine', 'location', 'usage', 'input', 'reusable', 'interprets', 'remaining', 'match', 'take', 'real', 'tests', 'format', 'rules', 'evaluate', 'showing', 'unlimited', 'possible', 'five', 'background', 'using', 'bit', 'accepted', 'string', 'd', 'insert', 'appearing', 'like', 'success', 'sizes', 'signal', 'performing', 'manual', 'specific', 'exhausted', 'continue', 'hosts', 't', 'become', 'soft', 'attempting', 'right', 'old', 'often', 'sequence', 'oriented', 'creation', 'some', 'back', 'oh', 'export', 'evaluated', 'loaded', 'duration', 'multiple', 'matching', 'reasons', 'ignore', 'describing', 'for', 'notification', 'avoid', 'though', 'comments', 'disk', 'exit', 'select', 'provides', 'indication', 'leader', 'either', 'core', 'command', 'run', 'remembered', 'equivalent', 'processing', 'continuing', 'bi', 'expansion', 'utilities', 'host', 'display', 'offset', 'leftover', 'post', 'refers', 'by', 'comparison', 'pipeline', 'ok', 'would', 'getting', 'column', 'of', 'http', 'o', 'page', 'stamp', 'range', 'plus', 'stand', 'illegal', 'connected', 'os', 'or', 'block', 'op', 'contains', 'letters', 'previously', 'within', 'bound', 'son', 'en', 'determine', 'operator', 'accumulated', 'exchanges', 'terminated', 'statistics', 'additional', 'waits', ']', 'there', 'question', 'long', 'start', 'restricts', 'editor', 'way', 'forward', 'eg', 'combined', 'function', 'head', 'successfully', 'complete', 'form', 'hr', 'attempted', 'removes', 'commands', 'failure', 'manipulate', 'hi', 'link', 'newer', 'line', 'with', 'bug', 'he', 'count', 'entry', 'places', 'versions', 'whether', 'wish', 'caller', 'up', 'us', 'record', 'carriage', 'converted', 'limit', 'fetch', 'pm', 'similar', 'called', 'connect', 'detailed', 'storing', 'definitions', 'associated', 'ad', 'ag', 'defined', 'pseudo', 'universal', 'escape', 'incremental', 'al', 'general', 'consumed', 'single', 'warning', 'exist', 'at', 'file', 'home', 'importing', 'trailing', 'check', 'defines', 'echo', 'pipe', 'marking', 'remainder', 'no', 'when', 'virtual', 'started', 'other', 'outputs', 'test', 'you', 'acceptable', 'arithmetic', 'formats', 'elements', 'star', 'colon', 'separate', 'preceding', 'searched', 'reused', 'includes', 'generated', 'exported', 'variable', 'structure', 'opened', 'e', 'requires', 'required', 'mask', 'visual', 'strings', 'u', 'time', 'directory', 'backward', 'starting', 'original'])
1 Answer 1
- No,
tokenize
is not too complex for one function. It does a single operation: it tokenizes a string. - Your list comprehensions are fine. They are somewhat complex but are still readable and easily understandable.
Instead of using the regex
([A-z]+)+
you could simply use the 'shortcut' `\w+'. This regex will match one-to-many words.If you wanted your regex to recognize hyphenated words, a simple change will suffice:
\w(-?\w)*
- Your code is quite Pythonic. Your spacing and indentation is fine. The same goes for your variable names. If you have more concerns about style, consult PEP8, the official Python style guide.
There is a flaw in your logic though. Take your
lambda
expression:ok_length = lambda token: [len(token) >= kw['min_len'] and kw['max_len'] <= kw['max_len']][0]
Currently, this will except any string whose length is greater than the
min_length
this is because you havekw['max_len'] == kw['max_len']
which will always evaluateTrue
. You also have the conditionals switched around. This is how you should have the check:ok_length = lambda token: [kw['min_length'] <= len(token) <= kw['max_length']][0]
Now, the question you need to ask is, whether or not it pays off to use a
lambda
expression here. Is the functionality 'deserving' of alambda
expression? Or can the check simply be placed into the list comprehension a little later on? Here's what it would look like, I'll let you decide (I like this way):tokens = set([token.lower() for token in re.findall('([A-z]+)+', text) if kw['min_len'] <= len(token) <= kw['max_len'] and token not in kw['ignore_list']]).intersection(plain_words)
The next thing I would consider is your used of the
kw
dict. How needed is it? Could you just implement optional parameters in the function declaration? I like that implementation because this function does not (and probably won't) take an arbitrary number of keyword arguments. It will only take the 3. As a related side note, you have swapped yourmax_len
andmin_len
values. Currently,max_len < min_len
.Finally, I would remove the
intersection
function call when you assign thetokens
variable. The nametokens
and the context with which its generated implies thattokens
will hold all tokens that match the given regex. However, in your case, it only contains that are also inplain_words
. I would wait to do the intersection until thereturn
statement:return tokens.intersection(plain_words)
Here is 'my' version of the tokenize
function:
def tokenize(text, min_len=0, max_len=4, ignore_list=[]):
"""
Break down text into a set of unique lower case words.
"""
plain_words = nltk.corpus.abc.words()
stop_words = nltk.corpus.stopwords.words()
ignore_list += stop_words
tokens = set([token.lower()
for token in re.findall('\w+', text)
if min_len <= len(token) <= max_len and
token not in kw['ignore_list']])
return tokens.intersection(plain_words)