"""Micro-benchmark comparing two URL path collapse/split implementations.

Run as a script to print the per-pass cost of each implementation over
``test_vectors``.  Importing the module only defines the functions, the
vectors and the timers; it does not run the benchmark.
"""

import timeit


def _url_collapse_path_split_glenn(path):
    """Collapse a URL path and split it into (head, tail) in a single pass.

    Alternative to _url_collapse_path_split with the same contract: blank
    and '.' elements are dropped, '..' removes the preceding element, and
    the element after the final '/' is returned separately as the tail.

    Returns:
      A tuple of (head, tail).  Head always starts with '/'.  Tail is ''
      when the last element is empty, '.' or '..'.

    Raises:
      IndexError if too many '..' occur within the path.
    """
    # Similar to os.path.split(os.path.normpath(path)) but specific to URL
    # path semantics rather than local operating system semantics.
    path_parts = path.split('/')
    head_parts = []
    for part in path_parts[:-1]:
        if part == '..':
            head_parts.pop()  # IndexError if more '..' than prior parts
        elif part and part != '.':
            head_parts.append(part)
    # str.split() always returns at least one element, so the else branch
    # below is never taken; the logic is kept as-is because this function
    # is one of the two subjects being timed.
    if path_parts:
        tail_part = path_parts.pop()
        if tail_part:
            if tail_part == '..':
                # A trailing '..' names the parent directory, not a file.
                head_parts.pop()
                tail_part = ''
            elif tail_part == '.':
                tail_part = ''
    else:
        tail_part = ''
    return ('/' + '/'.join(head_parts), tail_part)


# TODO(gregory.p.smith): Move this into an appropriate library.
def _url_collapse_path_split(path):
    """Collapse a URL path and split it into (head, tail).

    Given a URL path, remove extra '/'s and '.' path elements and collapse
    any '..' references.

    Implements something akin to RFC-2396 5.2 step 6 to parse relative paths.

    Returns:
      A tuple of (head, tail) where tail is everything after the final /
      and head is everything before it.  Head will always start with a '/'
      and, if it contains anything else, never have a trailing '/'.

    Raises:
      IndexError if too many '..' occur within the path.
    """
    # Similar to os.path.split(os.path.normpath(path)) but specific to URL
    # path semantics rather than local operating system semantics.
    path_parts = []
    for part in path.split('/'):
        # Turn '.' into a blank element so the filter below discards it.
        if part == '.':
            path_parts.append('')
        else:
            path_parts.append(part)
    # Filter out blank non trailing parts before consuming the '..'.
    path_parts = [part for part in path_parts[:-1] if part] + path_parts[-1:]
    # path_parts always holds at least the trailing element here, so the
    # else branch is never taken; kept as-is because this function is one
    # of the two subjects being timed.
    if path_parts:
        tail_part = path_parts.pop()
    else:
        tail_part = ''
    head_parts = []
    for part in path_parts:
        if part == '..':
            head_parts.pop()  # IndexError if more '..' than prior parts
        else:
            head_parts.append(part)
    if tail_part and tail_part == '..':
        # A trailing '..' names the parent directory, not a file.
        head_parts.pop()
        tail_part = ''
    return ('/' + '/'.join(head_parts), tail_part)


# Maps each input path to the (head, tail) pair both implementations are
# expected to return.  The benchmark only uses the keys as inputs; it does
# not check results against the expected values.
test_vectors = {
    '': ('/', ''),
    '/': ('/', ''),
    '//': ('/', ''),
    '/\\': ('/', '\\'),
    '/.//': ('/', ''),
    'cgi-bin/file1.py': ('/cgi-bin', 'file1.py'),
    '/cgi-bin/file1.py': ('/cgi-bin', 'file1.py'),
    '/cgi-bin/file1.py/../../a': ('/', 'a'),
    '/cgi-bin/file1.py/a/b/c/../../d': ('/cgi-bin/file1.py/a', 'd'),
    'a': ('/', 'a'),
    '/a': ('/', 'a'),
    '//a': ('/', 'a'),
    './a': ('/', 'a'),
    './C:/': ('/C:', ''),
    '/a/b': ('/a', 'b'),
    '/a/b/': ('/a/b', ''),
    '/a/b/.': ('/a/b', ''),
    '/a/b/c/..': ('/a/b', ''),
    '/a/b/c/../d': ('/a/b', 'd'),
    '/a/b/c/../d/e/../f': ('/a/b/d', 'f'),
    '/a/b/c/../d/e/../../f': ('/a/b', 'f'),
    '/a/b/c/../d/e/.././././..//f': ('/a/b', 'f'),
    '/a/b/c/../d/e/../../../f': ('/a', 'f'),
    '/a/b/c/../d/e/../../../../f': ('/', 'f'),
    '/a/b/c/../d/e/../../../../f/..': ('/', ''),
    '/a/b/c/../d/e/../../../../f/../.': ('/', ''),
}


def orig():
    """Run _url_collapse_path_split once over every test vector path."""
    for path in test_vectors:
        _url_collapse_path_split(path)


def glenn():
    """Run _url_collapse_path_split_glenn once over every test vector path."""
    for path in test_vectors:
        _url_collapse_path_split_glenn(path)


# Building the timers is cheap; the timing runs happen only under __main__.
o = timeit.Timer(orig)
g = timeit.Timer(glenn)

if __name__ == "__main__":
    # Guarded so that importing this module does not spend seconds
    # benchmarking and does not write to stdout.
    print("o: %.2f usec/pass" % (1000000 * o.timeit(number=100000)/100000))
    print("g: %.2f usec/pass" % (1000000 * g.timeit(number=100000)/100000))