Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a8ac600

Browse files
changing name of event metadata script, turning cec long to wide into an audit script
1 parent be630d6 commit a8ac600

File tree

2 files changed

+110
-36
lines changed

2 files changed

+110
-36
lines changed

‎scripts/cec_long_to_wide.py‎

Lines changed: 16 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
1+
##
2+
## Generate a wide CEC table from a long CEC table.
3+
##
4+
15
import pandas as pd
2-
import numpy as np
36
import sqlalchemy
47

58
import os
69
import sys
710
import yaml
811

12+
import datetime as dt
13+
914
sys.path.insert(0, os.path.join(os.path.abspath('.'), 'scripts'))
1015

1116
from context import config
@@ -20,7 +25,7 @@
2025
'utf8mb4'), convert_unicode=True)
2126

2227
## get the users to skip
23-
non_users = ['test1', 'admin', 'tina', 'alex', 'ellen', 'ishita', 'andrea', 'karishma']
28+
non_users = ['test1', 'admin', 'tina', 'alex', 'ellen', 'ishita', 'andrea', 'karishma', 'adj1']
2429

2530
## get the disqualifying information rows
2631
disqualifying_variables = yaml.load(
@@ -45,7 +50,7 @@
4550
## get the query
4651
df_long = pd.read_sql(query, con = mysql_engine)
4752

48-
## there should be duplicates but here we are
53+
## there should not be duplicates but here we are
4954
df_long = df_long.drop_duplicates()
5055

5156
## remove test users
@@ -58,37 +63,12 @@
5863
## move text field into value if not null
5964
df_long['value'] = df_long.apply(lambda x: x['text'] if x['text'] is not None else x['value'], axis = 1)
6065

61-
## pivot
62-
columns = ['article-desc', 'desc', 'location', 'start-date']
66+
## pivot; variables with multiple values are joined with ';'
6367
indexes = ['event_id', 'coder_id', 'article_id', 'publication', 'pub_date', 'title']
64-
df_wide = df_long[df_long['variable'].isin(columns)].\
65-
pivot(index = indexes, columns = 'variable', values = 'value')
66-
67-
## rename a few things to be MySQL and SQLAlchemy friendly
68-
df_wide = df_wide.rename(columns = {'article-desc': 'article_desc', 'start-date': 'start_date'})
69-
70-
## reset indexes
71-
df_wide = df_wide.reset_index()
72-
73-
## replace empty values with NaN
74-
df_wide[df_wide == ''] = np.nan
75-
76-
## upload to MySQL
77-
df_wide.to_sql(name = 'event_metadata',
78-
con = mysql_engine,
79-
if_exists= 'replace',
80-
index = True,
81-
index_label = 'id',
82-
dtype = {
83-
'id': sqlalchemy.types.Integer(),
84-
'coder_id': sqlalchemy.types.Text(),
85-
'event_id': sqlalchemy.types.Integer(),
86-
'article_id': sqlalchemy.types.Integer(),
87-
'article_desc': sqlalchemy.types.UnicodeText(),
88-
'desc': sqlalchemy.types.UnicodeText(),
89-
'location': sqlalchemy.types.Text(),
90-
'start_date': sqlalchemy.types.Date(),
91-
'publication': sqlalchemy.types.Text(),
92-
'pub_date': sqlalchemy.types.Date(),
93-
'title': sqlalchemy.types.Text()
94-
})
68+
df_wide = pd.pivot_table(data = df_long,
69+
index = indexes,
70+
columns = 'variable',
71+
values = 'value',
72+
aggfunc = lambda x: ';'.join(x))
73+
74+
df_wide.to_csv('../exports/pivoted-events_{}.csv'.format(dt.datetime.now().strftime('%Y-%m-%d')))

‎scripts/generate_event_metadata.py‎

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import pandas as pd
2+
import numpy as np
3+
import sqlalchemy
4+
5+
import os
6+
import sys
7+
import yaml
8+
9+
sys.path.insert(0, os.path.join(os.path.abspath('.'), 'scripts'))
10+
11+
from context import config
12+
13+
## MySQL setup
14+
mysql_engine = sqlalchemy.create_engine(
15+
'mysql://%s:%s@localhost/%s?unix_socket=%s&charset=%s' %
16+
(config.MYSQL_USER,
17+
config.MYSQL_PASS,
18+
config.MYSQL_DB,
19+
config.MYSQL_SOCK,
20+
'utf8mb4'), convert_unicode=True)
21+
22+
## get the users to skip
23+
non_users = ['test1', 'admin', 'tina', 'alex', 'ellen', 'ishita', 'andrea', 'karishma']
24+
25+
## get the disqualifying information rows
26+
disqualifying_variables = yaml.load(
27+
open(os.path.join(os.path.abspath('..'), 'yes-no.yaml'), 'r'),
28+
Loader = yaml.BaseLoader)
29+
disqualifying_variables = [x[0] for x in disqualifying_variables['Disqualifying information']]
30+
31+
query = """SELECT
32+
event_id,
33+
u.username coder_id,
34+
variable,
35+
value,
36+
cec.text,
37+
am.id article_id,
38+
am.pub_date,
39+
am.publication,
40+
am.title
41+
FROM coder_event_creator cec
42+
LEFT JOIN article_metadata am ON (cec.article_id = am.id)
43+
LEFT JOIN user u ON (cec.coder_id = u.id)"""
44+
45+
## get the query
46+
df_long = pd.read_sql(query, con = mysql_engine)
47+
48+
## there should not be duplicates but here we are
49+
df_long = df_long.drop_duplicates()
50+
51+
## remove test users
52+
df_long = df_long[~df_long['coder_id'].isin(non_users)]
53+
54+
## get disqualified events and remove
55+
disqualified_events = df_long[df_long['variable'].isin(disqualifying_variables)].event_id.unique()
56+
df_long = df_long[~df_long['event_id'].isin(disqualified_events)]
57+
58+
## move text field into value if not null
59+
df_long['value'] = df_long.apply(lambda x: x['text'] if x['text'] is not None else x['value'], axis = 1)
60+
61+
## pivot
62+
columns = ['article-desc', 'desc', 'location', 'start-date']
63+
indexes = ['event_id', 'coder_id', 'article_id', 'publication', 'pub_date', 'title']
64+
df_wide = df_long[df_long['variable'].isin(columns)].\
65+
pivot(index = indexes, columns = 'variable', values = 'value')
66+
67+
## rename a few things to be MySQL and SQLAlchemy friendly
68+
df_wide = df_wide.rename(columns = {'article-desc': 'article_desc', 'start-date': 'start_date'})
69+
70+
## reset indexes
71+
df_wide = df_wide.reset_index()
72+
73+
## replace empty values with NaN
74+
df_wide[df_wide == ''] = np.nan
75+
76+
## upload to MySQL
77+
df_wide.to_sql(name = 'event_metadata',
78+
con = mysql_engine,
79+
if_exists= 'replace',
80+
index = True,
81+
index_label = 'id',
82+
dtype = {
83+
'id': sqlalchemy.types.Integer(),
84+
'coder_id': sqlalchemy.types.Text(),
85+
'event_id': sqlalchemy.types.Integer(),
86+
'article_id': sqlalchemy.types.Integer(),
87+
'article_desc': sqlalchemy.types.UnicodeText(),
88+
'desc': sqlalchemy.types.UnicodeText(),
89+
'location': sqlalchemy.types.Text(),
90+
'start_date': sqlalchemy.types.Date(),
91+
'publication': sqlalchemy.types.Text(),
92+
'pub_date': sqlalchemy.types.Date(),
93+
'title': sqlalchemy.types.Text()
94+
})

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /