I have a relatively simple project to parse some HTTP server logs using Python and SQLite. I wrote up the code but I'm always looking for tips on being a better Python scripter. Though this is a simple task and my code works as written, I was hoping for some pointers to improve my coding ability.
import re
import sqlite3
## Load in the log file
file = open("logs")
rawdata = file.read()
## Create a database
conn = sqlite3.connect("parsed_logs.sqlite3")
c = conn.cursor()
# Build the SQLite database if needed
#c.execute('''CREATE TABLE requests (ip text, date text, requested_url text, response_code int, referer text, agent text);''')
#conn.commit()
## Prepare data
lines = rawdata.split("\n")
for line in lines:
    #print line
    # Parse data from each line
    date = re.findall("\[.*?\]", line)
    date = re.sub("\[", "", date[0])
    date = re.sub("\]", "", date)
    quoted_data = re.findall("\".*?\"", line)
    #print quoted_data
    requested_url = quoted_data[0]
    referer = quoted_data[1]
    agent = quoted_data[2]
    unquoted_data_stream = re.sub("\".*?\"", "", line)
    unquoted_data = unquoted_data_stream.split(" ")
    ip = unquoted_data[0]
    response_code = unquoted_data[6]
    #print ip
    #print date
    #print requested_url
    #print response_code
    #print referer
    #print agent
    ## Insert elements into rows
    c.execute("INSERT INTO requests VALUES (?, ?, ?, ?, ?, ?)", [ip, date, requested_url, response_code, referer, agent])
    conn.commit()
## Check to see if it worked
for row in c.execute("SELECT count(*) from requests"):
    print row
Here is some sample data:
99.122.86.237 - - [14/Oct/2012:00:01:06 -0400] "GET /epic/running_epic_tier_cover_300w.jpg HTTP/1.1" 200 81804 "http://images.google.com/search?num=10&hl=en&site=&tbm=isch&source=hp&q=epic&oq=epic&gs_l=img.3..0l10.2603.421004470.5.5.0.0.0.0.137.412.4j1.5.0...0.0...1ac.1.Ycx4MqWP66w&biw=1024&bih=672&sei=mzd6UIjlKMrcqQHM3YGwAg" "Mozilla/5.0 (iPad; CPU OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A405 Safari/7534.48.3"
209.131.41.49 - - [14/Oct/2012:00:01:36 -0400] "GET /newsfeeds/boingboing.xml HTTP/1.1" 301 612 "-" "Yahoo Pipes 2.0"
66.249.73.139 - - [14/Oct/2012:00:01:48 -0400] "GET /editorials/67 HTTP/1.1" 404 419 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
180.76.5.181 - - [14/Oct/2012:00:01:52 -0400] "GET /review_462.html HTTP/1.1" 200 5982 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
208.94.116.217 - - [14/Oct/2012:00:01:58 -0400] "GET /index.xml HTTP/1.1" 200 112946 "-" "Fever/1.26 (Feed Parser; http://feedafever.com; Allow like Gecko)"
50.30.9.10 - - [14/Oct/2012:00:02:03 -0400] "GET /index.xml HTTP/1.1" 200 6335 "-" "Motorola"
66.228.33.168 - - [14/Oct/2012:00:02:11 -0400] "GET /feed/ HTTP/1.1" 301 469 "-" "Fever/1.26 (Feed Parser; http://feedafever.com; Allow like Gecko)"
66.228.33.168 - - [14/Oct/2012:00:02:11 -0400] "GET /index.xml/ HTTP/1.1" 404 383 "-" "Fever/1.26 (Feed Parser; http://feedafever.com; Allow like Gecko)"
209.131.41.48 - - [14/Oct/2012:00:02:16 -0400] "GET /newsfeeds/boingboing.xml HTTP/1.1" 301 612 "-" "Yahoo Pipes 2.0"
184.73.130.89 - - [14/Oct/2012:00:02:47 -0400] "GET /index.xml HTTP/1.1" 200 129755 "-" "Python-urllib/2.7"
74.125.176.24 - - [14/Oct/2012:00:02:51 -0400] "GET /index.xml HTTP/1.1" 301 444 "-" "Feedfetcher-Google; (+http://www.google.com/feedfetcher.html; feed-id=10448260103645447977)"
74.125.19.39 - - [14/Oct/2012:00:02:51 -0400] "GET /index.xml HTTP/1.1" 200 6335 "-" "Feedfetcher-Google; (+http://www.google.com/feedfetcher.html; feed-id=10448260103645447977)"
209.131.41.49 - - [14/Oct/2012:00:03:27 -0400] "GET /newsfeeds/boingboing.xml HTTP/1.1" 301 612 "-" "Yahoo Pipes 2.0"
173.199.114.115 - - [14/Oct/2012:00:03:28 -0400] "GET /robots.txt HTTP/1.1" 301 445 "-" "Mozilla/5.0 (compatible; AhrefsBot/4.0; +http://ahrefs.com/robot/)"
173.199.114.115 - - [14/Oct/2012:00:03:28 -0400] "GET /robots.txt HTTP/1.1" 404 381 "-" "Mozilla/5.0 (compatible; AhrefsBot/4.0; +http://ahrefs.com/robot/)"
180.76.5.140 - - [14/Oct/2012:00:03:43 -0400] "GET /loral/001432.html HTTP/1.1" 200 9211 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
98.139.134.97 - - [14/Oct/2012:00:03:49 -0400] "GET /newsfeeds/boingboing.xml HTTP/1.1" 301 612 "-" "Yahoo Pipes 2.0"
192.55.29.140 - - [14/Oct/2012:00:03:55 -0400] "GET /index.xml HTTP/1.1" 304 197 "-" "Motorola"
50.53.20.104 - - [14/Oct/2012:00:04:08 -0400] "GET /feed/ HTTP/1.1" 301 450 "-" "RSSOwl/2.1.2.201108131746 (Windows; U; en)"
50.53.20.104 - - [14/Oct/2012:00:04:09 -0400] "GET /index.xml/ HTTP/1.1" 404 381 "-" "RSSOwl/2.1.2.201108131746 (Windows; U; en)"
209.131.41.48 - - [14/Oct/2012:00:04:12 -0400] "GET /newsfeeds/boingboing.xml HTTP/1.1" 301 612 "-" "Yahoo Pipes 2.0"
184.18.38.106 - - [14/Oct/2012:00:04:15 -0400] "GET /index.xml HTTP/1.1" 304 216 "-" "Planet Spoon -- D&D +http://blacksun/planetspoon/dnd Planet/2.0 +http://www.planetplanet.org UniversalFeedParser/4.1 +http://feedparser.org/"
95.108.158.239 - - [14/Oct/2012:00:04:15 -0400] "GET /wp-content/uploads/2010/03/shoshanna.jpg HTTP/1.1" 200 41791 "-" "Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)"
91.206.113.167 - - [14/Oct/2012:00:04:21 -0400] "GET /robots.txt HTTP/1.1" 301 450 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
91.206.113.167 - - [14/Oct/2012:00:04:22 -0400] "GET /robots.txt HTTP/1.1" 404 381 "http://www.slyflourish.com/robots.txt" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
91.206.113.234 - - [14/Oct/2012:00:04:22 -0400] "GET / HTTP/1.1" 200 7381 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
65.55.24.245 - - [14/Oct/2012:00:04:25 -0400] "GET /robots.txt HTTP/1.1" 404 416 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
64.130.10.15 - - [14/Oct/2012:00:04:37 -0400] "GET /index.xml HTTP/1.1" 200 6354 "-" "UniversalFeedParser/4.1 +http://feedparser.org/"
66.249.73.234 - - [14/Oct/2012:00:04:38 -0400] "GET /Mikes_Rollerball_and_Foun.html HTTP/1.1" 200 24064 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
98.139.134.99 - - [14/Oct/2012:00:04:46 -0400] "GET /newsfeeds/boingboing.xml HTTP/1.1" 301 612 "-" "Yahoo Pipes 2.0"
217.146.191.19 - - [14/Oct/2012:00:05:07 -0400] "GET /newsfeeds/boingboing.xml HTTP/1.1" 301 612 "-" "Yahoo Pipes 2.0"
83.140.155.184 - - [14/Oct/2012:00:05:19 -0400] "GET /index.xml HTTP/1.1" 304 197 "-" "Bloglovin/1.0 (http://www.bloglovin.com; 1 subscribers)"
98.139.134.96 - - [14/Oct/2012:00:05:20 -0400] "GET /newsfeeds/boingboing.xml HTTP/1.1" 301 612 "-" "Yahoo Pipes 2.0"
184.72.46.156 - - [14/Oct/2012:00:05:27 -0400] "GET /feed/ HTTP/1.1" 301 469 "-" "RockMeltService"
184.72.46.156 - - [14/Oct/2012:00:05:27 -0400] "GET /index.xml/ HTTP/1.1" 404 400 "-" "RockMeltService"
85.234.131.115 - - [14/Oct/2012:00:05:31 -0400] "GET /index.xml HTTP/1.1" 304 253 "-" "Protopage/3.0 (http://www.protopage.com)"
69.10.177.43 - - [14/Oct/2012:00:05:37 -0400] "GET /comments/feed/ HTTP/1.1" 404 383 "-" "Motorola"
69.10.177.43 - - [14/Oct/2012:00:05:37 -0400] "GET /comments/feed/ HTTP/1.1" 404 383 "-" "Jakarta Commons-HttpClient/3.1"
24.116.85.10 - - [14/Oct/2012:00:05:42 -0400] "GET / HTTP/1.1" 200 7437 "http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CB8QFjAA&url=http%3A%2F%2Fslyflourish.com%2F&ei=Ezp6UJCjJYW3qAHDsYDYDQ&usg=AFQjCNGlwfi63WJY_SLVcnA2cPxzN2NsIQ" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
184.72.46.160 - - [14/Oct/2012:00:05:42 -0400] "GET /feed/ HTTP/1.1" 301 469 "-" "RockMeltService"
24.116.85.10 - - [14/Oct/2012:00:05:42 -0400] "GET /style.css HTTP/1.1" 200 1605 "http://slyflourish.com/" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
24.116.85.10 - - [14/Oct/2012:00:05:42 -0400] "GET /sfbook.jpg HTTP/1.1" 200 23087 "http://slyflourish.com/" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
24.116.85.10 - - [14/Oct/2012:00:05:42 -0400] "GET /running_epic_tier_cover_200w.jpg HTTP/1.1" 200 34283 "http://slyflourish.com/" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
184.72.46.160 - - [14/Oct/2012:00:05:42 -0400] "GET /index.xml/ HTTP/1.1" 404 400 "-" "RockMeltService"
24.116.85.10 - - [14/Oct/2012:00:05:42 -0400] "GET /favicon.ico HTTP/1.1" 404 437 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
69.10.179.173 - - [14/Oct/2012:00:05:43 -0400] "GET /index.xml HTTP/1.1" 304 197 "-" "Motorola"
Read dabeaz.com/generators/. There's a wonderful amount of information in there about how to deal with logs in a scalable manner. – Marcin, May 12, 2014 at 13:08
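For context, here is a minimal sketch of the generator-pipeline style that tutorial advocates; the helper names and the fields extracted are purely illustrative, not taken from the question:

import re

def read_lines(path):
    # Yield lines lazily so the whole log never sits in memory
    with open(path) as f:
        for line in f:
            yield line.rstrip("\n")

def parse(lines):
    # Pull out just the IP and status code from each line
    pattern = re.compile(r'^(\S+).*?" (\d{3}) ')
    for line in lines:
        match = pattern.search(line)
        if match:
            yield match.group(1), int(match.group(2))

# The stages compose without ever materializing the full file
error_count = sum(1 for ip, status in parse(read_lines("logs")) if status >= 400)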
2 Answers
It would probably be better if you didn't read the whole log file at once. You could try something like this instead:
with open('logs','r') as f:
    for line in f:
        #print line
        # Parse data from each line
        date = re.findall("\[.*?\]", line)
        ...
import re
import sqlite3
## Load in the log file
file = open("logs")
It's usually recommended to have any sort of logic inside a main function. As @Matt suggests, use with to make sure the file gets closed, and it's probably best to avoid reading the whole file in.
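A minimal sketch of that structure, assuming a hypothetical per-line helper (process is not from the original code):

import sqlite3

def main():
    conn = sqlite3.connect("parsed_logs.sqlite3")
    c = conn.cursor()
    # The file is closed automatically when the with block ends
    with open("logs") as f:
        for line in f:
            process(line, c)  # hypothetical: parse one line and insert it
    conn.commit()
    conn.close()

if __name__ == "__main__":
    main()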
rawdata = file.read()
## Create a database
conn = sqlite3.connect("parsed_logs.sqlite3")
c = conn.cursor()
# Build the SQLite database if needed
#c.execute('''CREATE TABLE requests (ip text, date text, requested_url text, response_code int, referer text, agent text);''')
#conn.commit()
Rather than comment this out, I'd add a CREATE_TABLE = False line at the top of the file and check that variable. That way you can just flip it to re-enable this piece of code.
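Something along these lines (a sketch only, reusing the conn and c from the original script):

CREATE_TABLE = False  # flip to True when running against a fresh database

if CREATE_TABLE:
    c.execute('''CREATE TABLE requests
                 (ip text, date text, requested_url text,
                  response_code int, referer text, agent text)''')
    conn.commit()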
## Prepare data
lines = rawdata.split("\n")
for line in lines:
for line in file gets you the same effect, without building the lines list in memory first.
    #print line
    # Parse data from each line
    date = re.findall("\[.*?\]", line)
.*? is the non-greedy version of .*; since each line has only one bracketed date field, it makes little difference here. More usefully, if you make the expression \[(.*?)\] then findall will only return the part in parentheses. That gets rid of the [ and ], saving the next two lines.
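For example, taking one of the sample lines from the question:

import re

line = '50.30.9.10 - - [14/Oct/2012:00:02:03 -0400] "GET /index.xml HTTP/1.1" 200 6335 "-" "Motorola"'
# The parentheses form a capture group, so findall returns only the inner text
date = re.findall(r"\[(.*?)\]", line)[0]
# date is now '14/Oct/2012:00:02:03 -0400', with no [ or ] left to strip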
    date = re.sub("\[", "", date[0])
    date = re.sub("\]", "", date)
If you're just replacing a character, it probably makes sense to use the string's replace method rather than a regular expression.
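That is, something like this (sketch):

# str.replace handles a fixed, single-character substitution without the regex machinery
date = date[0].replace("[", "").replace("]", "")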
    quoted_data = re.findall("\".*?\"", line)
    #print quoted_data
    requested_url = quoted_data[0]
    referer = quoted_data[1]
    agent = quoted_data[2]
    unquoted_data_stream = re.sub("\".*?\"", "", line)
    unquoted_data = unquoted_data_stream.split(" ")
    ip = unquoted_data[0]
    response_code = unquoted_data[6]
The problem is that it's a bit tricky to follow the logic of where things are coming from.
    #print ip
    #print date
    #print requested_url
    #print response_code
    #print referer
    #print agent
Please remove debug code.
    ## Insert elements into rows
    c.execute("INSERT INTO requests VALUES (?, ?, ?, ?, ?, ?)", [ip, date, requested_url, response_code, referer, agent])
    conn.commit()
## Check to see if it worked
for row in c.execute("SELECT count(*) from requests"):
    print row
Here's how I reworked your parsing code:
import csv
from StringIO import StringIO  # Python 2; on Python 3 this would be io.StringIO

file = StringIO(open("logs").read().replace('[','"').replace(']','"'))
reader = csv.reader(file, delimiter=' ')
for line in reader:
    ip, _, _, date, url, response_code, _, referer, agent = line
I replace the [ and ], which makes your log a valid space-delimited file that can be read using Python's csv module. From there, it's a simple task to unpack the parts into local variables.
I don't love the replacement trick here. But I'd probably develop a generic parsing system to handle both [] and " delimiters, and then return each row as a list and unpack it as I've done above.
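A rough sketch of what that could look like (this is my reading of the suggestion, not code from the answer); a single regular expression treats [...]-wrapped, "..."-wrapped, and bare tokens as the fields of each row:

import re

# Each field is either [bracketed], "quoted", or a bare run of non-space characters
FIELD = re.compile(r'\[([^\]]*)\]|"([^"]*)"|(\S+)')

def parse_line(line):
    # Return the fields in order, with the brackets and quotes already stripped
    return [m.group(1) or m.group(2) or m.group(3) or ""
            for m in FIELD.finditer(line)]

with open("logs") as f:
    for line in f:
        fields = parse_line(line)
        if len(fields) < 9:
            continue  # skip blank or malformed lines
        ip, _, _, date, request, response_code, _, referer, agent = fields[:9]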