This is a script that creates a base dataframe from a SQLite database, adds data to it (also from SQLite), cleanses it, and formats it all into an Excel file. I feel like it is incredibly verbose and my code could be simplified quite a bit. One thing I definitely want to do is eliminate the direct references to the db, /Users/meter/code/mm/vid_score/test.db, so that this script can be run on any machine.

Any thoughts on simple changes I can make to decrease the complexity and amount of code?
import pandas as pd
import numpy as np
import sqlite3 as db
from contextlib import closing
from datetime import date, timedelta
ONE = date.today() - timedelta(1)
TWO = date.today() - timedelta(2)
SEVEN = date.today() - timedelta(8)

def video_base():
    with closing(db.connect('/Users/meter/code/mm/vid_score/test.db')) as con:
        df = pd.read_sql_query("SELECT * FROM video_played;", con)
    df['time'] = pd.to_datetime(df['time'], unit='s').dt.date
    df = df.drop('index', axis=1)

    df_one = df.loc[df['time'] == ONE]
    df_two = df.loc[df['time'] == TWO]
    df_seven = df.loc[df['time'] == SEVEN]

    df_one = df_one.groupby('video_id').agg({"ios_id": {"count_watched": np.count_nonzero,
                                                        "unique_watched": pd.Series.nunique},
                                             "feed_position": {"feed_position": np.average},
                                             "time_watched": {"time_watched": np.sum},
                                             "video_length": {"video_length": np.sum}})
    df_one.columns = df_one.columns.droplevel(0)
    df_one['avg_time_watched'] = df_one['time_watched'] / df_one['video_length']

    df_two = df_two.groupby('video_id').agg({"ios_id": {"count_watched": np.count_nonzero,
                                                        "unique_watched": pd.Series.nunique},
                                             "feed_position": {"feed_position": np.average},
                                             "time_watched": {"time_watched": np.sum},
                                             "video_length": {"video_length": np.sum}})
    df_two.columns = df_two.columns.droplevel(0)
    df_two = df_two.rename(columns={'count_watched': 'count_watched_yeterday'})

    df_seven = df_seven.groupby('video_id').agg({"ios_id": {"count_watched": np.count_nonzero,
                                                            "unique_watched": pd.Series.nunique},
                                                 "feed_position": {"feed_position": np.average},
                                                 "time_watched": {"time_watched": np.sum},
                                                 "video_length": {"video_length": np.sum}})
    df_seven.columns = df_seven.columns.droplevel(0)
    df_seven = df_seven.rename(columns={'count_watched': 'count_watched_seven'})

    video_base = pd.merge(df_one, df_two[['count_watched_yeterday']],
                          how='left', left_index=True, right_index=True)
    video_base = pd.merge(video_base, df_seven[['count_watched_seven']],
                          how='left', left_index=True, right_index=True)
    return video_base

def video_intent(video_base):
    # ITEM INFO #
    with closing(db.connect('/Users/meter/code/mm/vid_score/test.db')) as con:
        df = pd.read_sql_query("SELECT * FROM item_info_click;", con)
    df['time'] = pd.to_datetime(df['time'], unit='s').dt.date
    df = df.drop('index', axis=1)
    df = df.loc[df['time'] == ONE]
    df = df.groupby('video_id').agg({"ios_id": {"count_clicks": np.count_nonzero,
                                                "unique_clicks": pd.Series.nunique},
                                     "feed_position": {"feed_position": np.average}})
    df.columns = df.columns.droplevel(0)
    video_base['item_info_clicks'] = df['count_clicks']

    # FAVED #
    with closing(db.connect('/Users/meter/code/mm/vid_score/test.db')) as con:
        df = pd.read_sql_query("SELECT * FROM faved;", con)
    df['time'] = pd.to_datetime(df['time'], unit='s').dt.date
    df = df.drop('index', axis=1)
    df = df.loc[df['time'] == ONE]
    df = df.groupby('video_id').agg({"ios_id": {"count_faves": np.count_nonzero,
                                                "unique_clicks": pd.Series.nunique},
                                     "feed_position": {"feed_position": np.average}})
    df.columns = df.columns.droplevel(0)
    video_base['faves'] = df['count_faves']

    # REPLAYS #
    with closing(db.connect('/Users/meter/code/mm/vid_score/test.db')) as con:
        df = pd.read_sql_query("SELECT * FROM replay;", con)
    df['time'] = pd.to_datetime(df['time'], unit='s').dt.date
    df = df.drop('index', axis=1)
    df = df.loc[df['time'] == ONE]
    df = df.groupby('video_id').agg({"ios_id": {"replays": np.count_nonzero,
                                                "unique_clicks": pd.Series.nunique},
                                     "feed_position": {"feed_position": np.average}})
    df.columns = df.columns.droplevel(0)
    video_base['replays'] = df['replays']

    # ADD TO CART #
    with closing(db.connect('/Users/meter/code/mm/vid_score/test.db')) as con:
        df = pd.read_sql_query("SELECT * FROM add_to_cart;", con)
    df['time'] = pd.to_datetime(df['time'], unit='s').dt.date
    df = df.drop('index', axis=1)
    df = df.loc[df['time'] == ONE]
    df = df.groupby('video_id').agg({"ios_id": {"add_to_cart": np.count_nonzero,
                                                "unique_clicks": pd.Series.nunique},
                                     "feed_position": {"feed_position": np.average}})
    df.columns = df.columns.droplevel(0)
    video_base['add_to_cart'] = df['add_to_cart']

    # CAROUSEL #
    with closing(db.connect('/Users/meter/code/mm/vid_score/test.db')) as con:
        df = pd.read_sql_query("SELECT * FROM carousel;", con)
    df['time'] = pd.to_datetime(df['time'], unit='s').dt.date
    df = df.drop('index', axis=1)
    df = df.loc[df['time'] == ONE]
    df = df.groupby('video_id').agg({"ios_id": {"carousel": np.count_nonzero,
                                                "unique_clicks": pd.Series.nunique},
                                     "feed_position": {"feed_position": np.average}})
    df.columns = df.columns.droplevel(0)
    video_base['carousel'] = df['carousel']
    return video_base

def cleanup(video_raw):
    video_raw = video_raw.sort('count_watched', ascending=False)
    video_raw['percent_yesterday'] = (video_raw['count_watched'] - video_raw['count_watched_yeterday']) / video_raw[
        'count_watched_yeterday']
    video_raw['percent_seven'] = (video_raw['count_watched'] - video_raw['count_watched_seven']) / video_raw[
        'count_watched_seven']
    new_cols = ['count_watched', 'percent_yesterday', 'percent_seven',
                'unique_watched', 'avg_time_watched', 'feed_position',
                'replays', 'item_info_clicks', 'faves', 'add_to_cart', 'carousel']
    video_clean = video_raw[new_cols]
    video_clean = video_clean.rename(columns={'count_watched': 'Video Plays',
                                              'percent_yesterday': '% Change from Yest.',
                                              'percent_seven': '% Change from Last Week',
                                              'unique_watched': 'Unique Watchers',
                                              'avg_time_watched': 'Avg. Time Watched',
                                              'feed_position': 'Avg. Feed Position',
                                              'replays': 'Replays',
                                              'item_info_clicks': 'Item Info Click',
                                              'faves': 'Faved',
                                              'add_to_cart': 'Add To Cart',
                                              'carousel': 'Carousel'})
    return video_clean

def feed_position():
    with closing(db.connect('/Users/meter/code/mm/vid_score/test.db')) as con:
        df = pd.read_sql_query("SELECT * FROM video_played;", con)
    print(df)
    df['time'] = pd.to_datetime(df['time'], unit='s').dt.date
    df = df.drop('index', axis=1)
    df = df.loc[df['time'] == ONE]
    df = df[['ios_id', 'video_id', 'feed_position']]
    feed_data = df.pivot_table(index=['video_id'],
                               columns=['feed_position'],
                               values=['ios_id'],
                               aggfunc=np.count_nonzero)
    feed_data.columns = feed_data.columns.droplevel(0)
    return feed_data

def to_excel(video_report, feed):
    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer = pd.ExcelWriter('daily_report.xlsx', engine='xlsxwriter')
    # Convert the dataframe to an XlsxWriter Excel object.
    video_report.to_excel(writer, sheet_name='Video Overview', na_rep="-")
    # Get the xlsxwriter objects from the dataframe writer object.
    workbook = writer.book
    worksheet = writer.sheets['Video Overview']
    # Add some cell formats.
    integer = workbook.add_format({'num_format': '0', 'align': 'center'})
    decimal = workbook.add_format({'num_format': '0.00', 'align': 'center'})
    percentage = workbook.add_format({'num_format': '0.0%', 'align': 'center'})
    zebra = workbook.add_format({'bold': True})
    worksheet.set_column('B:B', 13, integer)
    worksheet.set_column('C:C', 17, percentage)
    worksheet.set_column('D:D', 19, percentage)
    worksheet.set_column('E:E', 15, integer)
    worksheet.set_column('F:F', 15, percentage)
    worksheet.set_column('G:G', 15, decimal)
    worksheet.set_column('H:H', 13, integer)
    worksheet.set_column('I:I', 13, integer)
    worksheet.set_column('J:J', 13, integer)
    worksheet.set_column('K:K', 13, integer)
    worksheet.set_column('L:L', 13, integer)
    worksheet.set_row(3, 20, zebra)
    feed.to_excel(writer, sheet_name='Feed Position', na_rep="-")
    workbook1 = writer.book
    worksheet1 = writer.sheets['Feed Position']
    integer = workbook1.add_format({'num_format': '0', 'align': 'center'})
    worksheet1.set_column('B:HU', 4, integer)
    writer.save()

def main():
    video_data = video_base()
    intent_added = video_intent(video_data)
    cleaned = cleanup(intent_added)
    feed_data = feed_position()
    to_excel(cleaned, feed_data)

if __name__ == "__main__":
    main()
Comment: @ferada This is a continuation of the last cleanup you helped me with. Any chance you have thoughts on this one? – metersk, Jun 1, 2015
1 Answer
So the good thing is it's PEP8 clean (except for the 80-character limit) and is readable. The `pandas` and `numpy` usage is very nice as well; the `groupby` statements look great, for example. But yeah, as you said, you have a lot of duplicated code.
I've uploaded the repo to GitHub, so you might want to clone that soonish.
Command line arguments
It looks like you can run this just as a filter, i.e. something like `python video.py test.db daily_report.xlsx`. If you want more convenience I'd suggest `argparse` to have a nicer interface. After that you could also add a configuration file to store those settings locally on each machine. However, for this post I'll restrict myself to the first option.
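If you do go the `argparse` route, a minimal sketch might look like the following (the argument names and help texts are just illustrative):

import argparse

def parse_args():
    # Illustrative argparse front end: positional arguments replace the
    # hard-coded database path and Excel filename.
    parser = argparse.ArgumentParser(description="Build the daily video report.")
    parser.add_argument("database", help="path to the SQLite database")
    parser.add_argument("output", help="path of the Excel report to write")
    return parser.parse_args()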
I don't see why you need to open/close the database so many times. I'd move `con` to `main` and reuse it as long as you need it. That also eliminates all but one occurrence of that filename. The Excel filename can be moved to `main` as well.

Afterwards both filenames should come from the command line instead, i.e. `sys.argv`, of course raising an error if you don't get enough arguments (or, you know, use `argparse` to do that for you). That now looks like this:
import sys

def main():
    if len(sys.argv) != 3:
        sys.exit("Need exactly two arguments: database and output file.")
    with closing(db.connect(sys.argv[1])) as con:
        video_data = video_base(con)
        intent_added = video_intent(con, video_data)
        feed_data = feed_position(con)
    cleaned = cleanup(intent_added)
    to_excel(cleaned, feed_data, sys.argv[2])
The `cleanup` call is moved out of the `closing` body since it doesn't need the database connection, so it makes sense to still close the database as early as possible.
Further refactoring
There's a typo where you used `yeterday` all the time.
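The fix is mechanical; for example, the rename in `video_base` becomes:

df_two = df_two.rename(columns={'count_watched': 'count_watched_yesterday'})

with the `count_watched_yeterday` references in `cleanup` updated to match.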
Now that that's done, let's eliminate more duplication.
The first thing I see is the dups in `video_base`; basically all you need to do here is factor out the two changing parameters:
def video_aggregations(df, day):
    df = df.loc[df['time'] == day]
    df = df.groupby('video_id').agg({"ios_id": {"count_watched": np.count_nonzero,
                                                "unique_watched": pd.Series.nunique},
                                     "feed_position": {"feed_position": np.average},
                                     "time_watched": {"time_watched": np.sum},
                                     "video_length": {"video_length": np.sum}})
    df.columns = df.columns.droplevel(0)
    return df
Called like:
df_one = video_aggregations(df, ONE)
df_one['avg_time_watched'] = df_one['time_watched'] / df_one['video_length']
Same thing for `video_intent` really. Factor out the constant parts, then call that five times. The only thing I'm not sure about here is the layout of the resulting data frames, so I'm more cautious than is probably necessary; you might be able to remove the `aggregation_name` parameter if the name isn't important:
def video_intent_aggregations(con, video_base, table, result_key, aggregation_name):
    df = pd.read_sql_query("SELECT * FROM %s;" % table, con)
    df['time'] = pd.to_datetime(df['time'], unit='s').dt.date
    df = df.drop('index', axis=1)
    df = df.loc[df['time'] == ONE]
    df = df.groupby('video_id').agg({"ios_id": {aggregation_name: np.count_nonzero,
                                                "unique_clicks": pd.Series.nunique},
                                     "feed_position": {"feed_position": np.average}})
    df.columns = df.columns.droplevel(0)
    video_base[result_key] = df[aggregation_name]

def video_intent(con, video_base):
    video_intent_aggregations(con, video_base, "item_info_click", "item_info_clicks", "count_clicks")
    video_intent_aggregations(con, video_base, "faved", "faves", "count_faves")
    video_intent_aggregations(con, video_base, "replay", "replays", "replays")
    video_intent_aggregations(con, video_base, "add_to_cart", "add_to_cart", "add_to_cart")
    video_intent_aggregations(con, video_base, "carousel", "carousel", "carousel")
    return video_base
In `cleanup` you can get the values from `video_raw` once, then reuse them:
watched = video_raw['count_watched']
yesterday = video_raw['count_watched_yesterday']
seven = video_raw['count_watched_seven']
video_raw['percent_yesterday'] = (watched - yesterday) / yesterday
video_raw['percent_seven'] = (watched - seven) / seven
And finally in `to_excel` you could extract the columns into a separate list, then call the method on each of them:
columns = [('B:B', 13, integer),
           ('C:C', 17, percentage),
           ('D:D', 19, percentage),
           ('E:E', 15, integer),
           ('F:F', 15, percentage),
           ('G:G', 15, decimal),
           ('H:H', 13, integer),
           ('I:I', 13, integer),
           ('J:J', 13, integer),
           ('K:K', 13, integer),
           ('L:L', 13, integer)]

for column in columns:
    worksheet.set_column(*column)
Assuming I made no mistake here I'd be content with this state. There are still opportunities to remove duplicate code, or factor out common code, or use more functional shortcuts, but it really is up to you if you want to do more in that respect. At the moment this is readable and maintainable enough IMO.
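For example, the read/convert/filter preamble still appears in `video_base`, `video_intent_aggregations` and `feed_position`; a possible helper (the name `load_events` is just illustrative) could factor it out:

def load_events(con, table, day):
    # Shared preamble: read a table, convert the Unix timestamp to a date,
    # drop the leftover 'index' column and keep only the rows for `day`.
    df = pd.read_sql_query("SELECT * FROM %s;" % table, con)
    df['time'] = pd.to_datetime(df['time'], unit='s').dt.date
    df = df.drop('index', axis=1)
    return df.loc[df['time'] == day]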