# textop.py
#
# A program that reads a large collection of text into memory
# and performs various operations on it.  This should work in both
# Python 2 and Python 3.  Use it for a performance comparison.
#
# Each test body runs inside a `timethis(label)` context manager imported
# from the companion `timethis` module (presumably it reports the elapsed
# time under that label -- confirm against timethis.py).  Results assigned
# inside the tests (line, lines, fields, s, ...) are intentionally unused:
# the operations exist only so they can be measured.

import re
import sys

from timethis import timethis

NSAMPLES = 10

# For Python 2, map range to xrange
try:
    range = xrange
except NameError:
    pass

# Read an Apache log file into memory and replicate it to make a large sample.
# The result should be a string with about 6 million characters in it.
# The file is opened in a `with` block so the handle is closed right after
# the read instead of being left for the garbage collector.
with open("access-log", "rt") as logfile:
    logdata = logfile.read() * 10

# Test 1: Memory use
print("Size %d bytes" % sys.getsizeof(logdata))

# Test 2: Finding all lines using find() and slicing
with timethis("find lines"):
    index = 0
    while index < len(logdata):
        nextindex = logdata.find("\n", index)
        if nextindex < 0:
            # No trailing newline: str.find() returned -1.  Without this
            # guard the slice would drop the last character and `index`
            # would reset to 0, restarting the scan forever.
            nextindex = len(logdata)
        line = logdata[index:nextindex]
        index = nextindex + 1

# Test 3 : Split into lines
with timethis("line splitting"):
    lines = logdata.splitlines()

# Test 4 : Splitting on whitespace
with timethis("whitespace splitting"):
    fields = logdata.split()

# Test 5 : Regex pattern matching.
# The pattern is compiled outside the timed region so only matching is timed.
ip_pattern = re.compile(r"\d+\.\d+\.\d+\.\d+")

with timethis("regex pattern matching"):
    unique_ips = set()
    for m in ip_pattern.finditer(logdata):
        unique_ips.add(m.group())

# Test 6 : Iterate by characters
with timethis("iterate by character"):
    for c in logdata:
        pass

# Test 7 : Replace text
with timethis("Replace characters"):
    s = logdata.replace(" ", ":")