In the interests of improving my Python coding skills, I wanted to post a program I recently built and get it critiqued by you fine folks. Please let me know where you think I might improve this program. I tried to stick to PEP8 and follow other standard conventions mentioned here.
### This program will filter a list of tweets by a certain
### threshold of retweets divided by followers
# Fetch tweets from Twitter list
# Store them in SQlite3
# Query database
# Output an HTML file with the results
# Clean database of data older than a month
import json
import re
import datetime
import time
import urllib
import sqlite3
params = {
    'threshold': 0.02, # retweet / follower threshold percentage.
    'db_file': '/blah/blah/blah/news_tweets.sqlite',
    'tweet_list_url': 'https://api.twitter.com/1/lists/statuses.json'\
        '?slug=my-news-sources&owner_screen_name=mshea&page=',
    'output_file': '/blah/blah/blah/news.html',
    'output_weekly_file': '/blah/blah/blah/weekly_news.html',
    'page_header': '''<!DOCTYPE html>
<meta name="viewport" content="user-scalable=yes, width=device-width">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<style>
body { font-family: Verdana, Geneva, sans-serif; color:#333; max-width:35em;
margin:auto; }
.score { font-size: .6em; color: #999; }
ul { list-style:none; }
/* Desktops and laptops ----------- */
@media only screen and (min-width : 321px) {
ul, h1, .updated { margin:0; padding:0; }
li { padding-left: 1.3em; padding-bottom: 1em; line-height: 1.6em;
text-indent: -1em; }
ul { list-style:none; }
h1 { font-weight: normal; font-size: 1.4em; padding-top: 1em;
padding-bottom: 1em; }
.updated { font-size:.8em; text-align: center; padding-bottom: 1em;}
}
/* Smartphones (portrait and landscape) ----------- */
@media only screen and (min-width : 320px) and (max-width : 480px) {
ul, h1, .updated { margin:0; padding:0; }
li, h1, .updated { border-top:1px #ddd solid; }
li { font-size:.9em; line-height:1.5em; padding:.5em; text-indent: 0; }
.updated { font-size:.8em; padding: .8em; text-align: center; }
h1 { font-size:1.2em; font-weight: normal; padding:.5em;
text-align: center; background: #eee;}
}
</style>
<title>News</title>
'''
}
conn = sqlite3.connect(params['db_file'])
# Filter strange character encodings to pure ascii.
def only_ascii(char):
    if ord(char) < 32 or ord(char) > 127: return ''
    else: return char
# Fetch tweets from the list and dump them into SQLite3
def fetch_tweets(tweet_list_url):
    jsonaggregate = []
    for jsonpagecount in range (1,30):
        fh = urllib.urlopen(tweet_list_url+str(jsonpagecount))
        data = fh.read()
        try:
            jsonaggregate += json.loads(data)
        except:
            print 'failed on page '+str(jsonpagecount)
        print 'parsing twitter json page '+str(jsonpagecount)
    print str(len(jsonaggregate))+ ' tweets parsed...'
    # Dump tweets to SQlite
    tweetinsertquery = conn.cursor()
    for item in jsonaggregate:
        tweet_time = time.strptime(item['created_at'],
                                   '%a %b %d %H:%M:%S +0000 %Y')
        timestring = time.strftime('%Y-%m-%dT%H:%M:%S', tweet_time)
        tweetinsertquery.execute('''
            insert or replace into tweets
            values (?, ?, ?, ?, ?, ?, ?, ?)
            ''',
            [
                item['id_str'],
                item['text'],
                timestring,
                item['favorited'],
                item['user']['screen_name'],
                item['retweet_count'],
                item['user']['followers_count'],
                item['user']['location']
            ]
        )
    conn.commit()
def link_text(text):
    return re.sub('http://[^ ,]*', lambda t: '<a href="%s">%s</a>'
                  % (t.group(0), t.group(0)), text)
def build_page():  # Pull tweets from the database
    daycache = ''
    first_header = 1
    tweetquery = conn.cursor()
    tweetquery.execute('''
        select *, ((retweet_count*100.0) / (follower_count*100.0))
        from tweets
        where (retweet_count*1.0 / follower_count*1.0 > (? / 100))
        and tweet like '%http%'
        and datetime(created_at) > date('now','-6 day')
        order by created_at desc;''', [params['threshold']])
    fileoutput = [params['page_header']]
    for result in tweetquery:
        id, tweet, created_at, favorited, screen_name, \
            retweet_count, follower_count, location, score = result
        time_struct = time.strptime(created_at, '%Y-%m-%dT%H:%M:%S')
        currentday = time.strftime('%A, %d %B',
                                   time.localtime(time.mktime(time_struct)-14400))
        if currentday != daycache:
            daycache = currentday
            if first_header != 1:  # flag so we don't add an extra </ul>
                fileoutput.append('</ul>\n')
            else:
                first_header = 0
            fileoutput.append('<h1>%s</h1>\n<ul>\n' % daycache)
        score = str(round(score*100, 3)).replace("0.",".")
        fileoutput.append('''<li><strong>%(screen_name)s:</strong>'''
                          ''' %(tweet)s <span class="score">%(score)s</span>'''
                          % {'screen_name': screen_name,
                             'tweet': filter(only_ascii, link_text(tweet)),
                             'score': score
                             })
    # Query for the top_weekly_tweets
    tweetquery.execute('''
        select *, ((retweet_count*100.0) / (follower_count*100.0))
        as value_rank
        from tweets
        where datetime(created_at) > date('now','-6 day')
        and tweet like '%http%'
        order by value_rank desc limit 50;
        ''')
    fileoutput.append('\n<h1>Top Weekly Links</h1>\n<ul>')
    for result in tweetquery:
        id, tweet, created_at, favorited, screen_name, \
            retweet_count, follower_count, location, score = result
        score = str(round(score*100, 3)).replace("0.",".")
        fileoutput.append('<li><strong>%(screen_name)s</strong>: '
                          '%(tweet)s <span class="score">%(score)s</span></li>\n'
                          % {'screen_name': screen_name,
                             'tweet': filter(only_ascii, link_text(tweet)),
                             'score': score
                             })
    fileoutput.append('</ul>\n<p class="updated">Updated %(updated)s</p>'
                      % {'updated': time.strftime("%d %B at %I:%M %p",
                                                  time.localtime())})
    with open(params['output_file'], "w") as outputfile:
        outputfile.write(''.join(fileoutput).encode("utf8"))
def purge_database():
    cleandatabase = conn.cursor()
    cleandatabase.execute('''
        delete from tweets
        where datetime(created_at) < date('now','-14 day');
        ''')
    cleandatabase.execute('vacuum;')
    conn.commit()
fetch_tweets(params['tweet_list_url'])
build_page()
purge_database()
1 Answer
### This program will filter a list of tweets by a certain
### threshold of retweets divided by followers
# Fetch tweets from Twitter list
# Store them in SQlite3
# Query database
# Output an HTML file with the results
# Clean database of data older than a month
import json
import re
import datetime
import time
import urllib
import sqlite3
params = {
    'threshold': 0.02, # retweet / follower threshold percentage.
    'db_file': '/blah/blah/blah/news_tweets.sqlite',
    'tweet_list_url': 'https://api.twitter.com/1/lists/statuses.json'\
        '?slug=my-news-sources&owner_screen_name=mshea&page=',
    'output_file': '/blah/blah/blah/news.html',
    'output_weekly_file': '/blah/blah/blah/weekly_news.html',

By Python convention, global constants should be ALL_CAPS. This is also a rather unusual way of handling parameters: typically we define global constants rather than a dict of parameters. I can't say there is anything really bad about the approach, but I don't see how it helps much either.
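For illustration, the same settings as plain module-level constants might look like this (a sketch; the names are just suggestions):

THRESHOLD = 0.02  # retweet / follower threshold percentage
DB_FILE = '/blah/blah/blah/news_tweets.sqlite'
OUTPUT_FILE = '/blah/blah/blah/news.html'
OUTPUT_WEEKLY_FILE = '/blah/blah/blah/weekly_news.html'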
    'page_header': '''<!DOCTYPE html>
<meta name="viewport" content="user-scalable=yes, width=device-width">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<style>
body { font-family: Verdana, Geneva, sans-serif; color:#333; max-width:35em;
margin:auto; }
.score { font-size: .6em; color: #999; }
ul { list-style:none; }
/* Desktops and laptops ----------- */
@media only screen and (min-width : 321px) {
ul, h1, .updated { margin:0; padding:0; }
li { padding-left: 1.3em; padding-bottom: 1em; line-height: 1.6em;
text-indent: -1em; }
ul { list-style:none; }
h1 { font-weight: normal; font-size: 1.4em; padding-top: 1em;
padding-bottom: 1em; }
.updated { font-size:.8em; text-align: center; padding-bottom: 1em;}
}
/* Smartphones (portrait and landscape) ----------- */
@media only screen and (min-width : 320px) and (max-width : 480px) {
ul, h1, .updated { margin:0; padding:0; }
li, h1, .updated { border-top:1px #ddd solid; }
li { font-size:.9em; line-height:1.5em; padding:.5em; text-indent: 0; }
.updated { font-size:.8em; padding: .8em; text-align: center; }
h1 { font-size:1.2em; font-weight: normal; padding:.5em;
text-align: center; background: #eee;}
}
</style>
<title>News</title>
'''
}

For that amount of markup I really suggest looking into using a template file.
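A minimal sketch with string.Template from the standard library, so nothing extra needs to be installed (the file name page_template.html and the $title placeholder are illustrative):

from string import Template

with open('page_template.html') as f:   # hypothetical template file
    page_template = Template(f.read())

header = page_template.substitute(title='News')  # fills in $title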
conn = sqlite3.connect(params['db_file'])
# Filter strange character encodings to pure ascii.
def only_ascii(char):
    if ord(char) < 32 or ord(char) > 127: return ''
    else: return char
# Fetch tweets from the list and dump them into SQLite3
def fetch_tweets(tweet_list_url):

This function is named fetch_tweets, but it also dumps the tweets into the database, so the name doesn't quite fit.

    jsonaggregate = []

I'd call that json_aggregate.
    for jsonpagecount in range (1,30):
        fh = urllib.urlopen(tweet_list_url+str(jsonpagecount))
        data = fh.read()

fh? That's not a very descriptive name.

        try:
            jsonaggregate += json.loads(data)

Why not use json.load(fh)?

        except:

Don't do this; catch the specific exceptions you want to handle here. As it is, you may hide other things going wrong.

            print 'failed on page '+str(jsonpagecount)
        print 'parsing twitter json page '+str(jsonpagecount)

No you aren't: by the time this prints, the page has already been parsed.

    print str(len(jsonaggregate))+ ' tweets parsed...'
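Putting those together, the loop might look something like this (a sketch, still Python 2 like the original; ValueError is what the json module raises on bad input):

    for page_number in range(1, 30):
        response = urllib.urlopen(tweet_list_url + str(page_number))
        print 'parsing twitter json page %d' % page_number
        try:
            # parse straight from the file-like object, no read() needed
            json_aggregate += json.load(response)
        except ValueError:
            print 'failed on page %d' % page_number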
    # Dump tweets to SQlite
    tweetinsertquery = conn.cursor()

It's a cursor, not a query.

    for item in jsonaggregate:
        tweet_time = time.strptime(item['created_at'],
                                   '%a %b %d %H:%M:%S +0000 %Y')
        timestring = time.strftime('%Y-%m-%dT%H:%M:%S', tweet_time)

I'd pull those last two lines out into a convert_timestamp function.
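Something like this (the name convert_timestamp is just a suggestion):

def convert_timestamp(created_at):
    # Twitter's 'Sat Jan 19 22:24:13 +0000 2013' format -> ISO 8601
    tweet_time = time.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y')
    return time.strftime('%Y-%m-%dT%H:%M:%S', tweet_time)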
        tweetinsertquery.execute('''
            insert or replace into tweets
            values (?, ?, ?, ?, ?, ?, ?, ?)
            ''',
            [
                item['id_str'],
                item['text'],
                timestring,
                item['favorited'],
                item['user']['screen_name'],
                item['retweet_count'],
                item['user']['followers_count'],
                item['user']['location']
            ]
        )
    conn.commit()

The parameters should really be a tuple, not a list. I'd also look into using executemany or prepared statements.
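With executemany the per-item loop collapses into a single call. A sketch, reusing the hypothetical convert_timestamp from above:

    rows = [(item['id_str'],
             item['text'],
             convert_timestamp(item['created_at']),
             item['favorited'],
             item['user']['screen_name'],
             item['retweet_count'],
             item['user']['followers_count'],
             item['user']['location'])
            for item in jsonaggregate]
    tweetinsertquery.executemany('''
        insert or replace into tweets
        values (?, ?, ?, ?, ?, ?, ?, ?)''', rows)
    conn.commit()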
def link_text(text):
    return re.sub('http://[^ ,]*', lambda t: '<a href="%s">%s</a>'
                  % (t.group(0), t.group(0)), text)
def build_page():  # Pull tweets from the database
    daycache = ''
    first_header = 1
    tweetquery = conn.cursor()
    tweetquery.execute('''
        select *, ((retweet_count*100.0) / (follower_count*100.0))
        from tweets
        where (retweet_count*1.0 / follower_count*1.0 > (? / 100))
        and tweet like '%http%'
        and datetime(created_at) > date('now','-6 day')
        order by created_at desc;''', [params['threshold']])
    fileoutput = [params['page_header']]
    for result in tweetquery:
        id, tweet, created_at, favorited, screen_name, \
            retweet_count, follower_count, location, score = result
        time_struct = time.strptime(created_at, '%Y-%m-%dT%H:%M:%S')
        currentday = time.strftime('%A, %d %B',
                                   time.localtime(time.mktime(time_struct)-14400))
        if currentday != daycache:
            daycache = currentday
            if first_header != 1:  # flag so we don't add an extra </ul>
                fileoutput.append('</ul>\n')
            else:
                first_header = 0
            fileoutput.append('<h1>%s</h1>\n<ul>\n' % daycache)
        score = str(round(score*100, 3)).replace("0.",".")
        fileoutput.append('''<li><strong>%(screen_name)s:</strong>'''
                          ''' %(tweet)s <span class="score">%(score)s</span>'''
                          % {'screen_name': screen_name,
                             'tweet': filter(only_ascii, link_text(tweet)),
                             'score': score
                             })
All of this hand-assembled HTML would be cleaner with a proper template, such as those provided by Mako.
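A rough sketch of what that could look like (the template text and variable names are illustrative, not the poster's actual markup):

from mako.template import Template

tweet_list = Template('''
<ul>
% for screen_name, tweet, score in tweets:
    <li><strong>${screen_name}:</strong> ${tweet}
        <span class="score">${score}</span></li>
% endfor
</ul>
''')

html = tweet_list.render(tweets=rows)  # rows: list of (name, text, score)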
    # Query for the top_weekly_tweets
    tweetquery.execute('''
        select *, ((retweet_count*100.0) / (follower_count*100.0))
        as value_rank
        from tweets
        where datetime(created_at) > date('now','-6 day')
        and tweet like '%http%'
        order by value_rank desc limit 50;
        ''')
    fileoutput.append('\n<h1>Top Weekly Links</h1>\n<ul>')
    for result in tweetquery:
        id, tweet, created_at, favorited, screen_name, \
            retweet_count, follower_count, location, score = result
        score = str(round(score*100, 3)).replace("0.",".")
        fileoutput.append('<li><strong>%(screen_name)s</strong>: '
                          '%(tweet)s <span class="score">%(score)s</span></li>\n'
                          % {'screen_name': screen_name,
                             'tweet': filter(only_ascii, link_text(tweet)),
                             'score': score
                             })
    fileoutput.append('</ul>\n<p class="updated">Updated %(updated)s</p>'
                      % {'updated': time.strftime("%d %B at %I:%M %p",
                                                  time.localtime())})
    with open(params['output_file'], "w") as outputfile:
        outputfile.write(''.join(fileoutput).encode("utf8"))
def purge_database():
    cleandatabase = conn.cursor()
    cleandatabase.execute('''
        delete from tweets
        where datetime(created_at) < date('now','-14 day');
        ''')
    cleandatabase.execute('vacuum;')
    conn.commit()
fetch_tweets(params['tweet_list_url'])
build_page()
purge_database()
Typically, it makes sense to put your root-level calls into a main function.
Example main function:
# this is the function that actually does your work
def main():
    fetch_tweets(params['tweet_list_url'])
    build_page()
    purge_database()

# this is true only if you directly execute this file;
# it's not true if you import the file.
if __name__ == '__main__':
    main()
- Thank you very much, Winston. This is very helpful. I'll look into templating, though my ISP doesn't let me install Python plugins, so I am limited in how much I can add. I made a list of tuples and then used executemany for the query. Someone else on a previous program mentioned using params that way for program parameters. Can you tell me more about the main function? What would it look like? – Mike Shea, Jan 24, 2013 at 18:20
- @MikeShea, you don't have to install the Python library; just copy the Python source files into the same directory and it will find them. I've added an example main. – Winston Ewert, Jan 24, 2013 at 20:18
- Thanks again, Winston. Can you describe the advantage of the main function you have there? Is it for security? Performance? Thanks again for all the help. I'll try out the templating. – Mike Shea, Jan 24, 2013 at 22:05
- @MikeShea, code will run slightly faster inside a function than at the global scope. It also prevents the variables from becoming global to the module, and it lets you import the module from another Python module without actually running it. – Winston Ewert, Jan 24, 2013 at 22:27