Commit a8ac600

committed

changing name of event metadata script, turning cec long to wide into an audit script

1 parent be630d6 commit a8ac600 (Copy full SHA for a8ac600)

File tree

2 files changed

+110

-36

lines changed

scripts
- cec_long_to_wide.py
- generate_event_metadata.py

2 files changed

+110

-36

lines changed

`‎scripts/cec_long_to_wide.py‎`

Lines changed: 16 additions & 36 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,11 +1,16 @@`
	`1`	`+##`
	`2`	`+## Generate a wide CEC table from the long CEC table.`
	`3`	`+##`
	`4`	`+`
`1`	`5`	`import pandas as pd`
`2`		`-import numpy as np`
`3`	`6`	`import sqlalchemy`
`4`	`7`
`5`	`8`	`import os`
`6`	`9`	`import sys`
`7`	`10`	`import yaml`
`8`	`11`
	`12`	`+import datetime as dt`
	`13`	`+`
`9`	`14`	`sys.path.insert(0, os.path.join(os.path.abspath('.'), 'scripts'))`
`10`	`15`
`11`	`16`	`from context import config`
`@@ -20,7 +25,7 @@`
`20`	`25`	`'utf8mb4'), convert_unicode=True)`
`21`	`26`
`22`	`27`	`## get the users to skip`
`23`		`-non_users = ['test1', 'admin', 'tina', 'alex', 'ellen', 'ishita', 'andrea', 'karishma']`
	`28`	`+non_users = ['test1', 'admin', 'tina', 'alex', 'ellen', 'ishita', 'andrea', 'karishma', 'adj1']`
`24`	`29`
`25`	`30`	`## get the disqualifying information rows`
`26`	`31`	`disqualifying_variables = yaml.load(`
`@@ -45,7 +50,7 @@`
`45`	`50`	`## get the query`
`46`	`51`	`df_long = pd.read_sql(query, con = mysql_engine)`
`47`	`52`
`48`		`-## there should be duplicates but here we are`
	`53`	`+## there should not be duplicates but here we are`
`49`	`54`	`df_long = df_long.drop_duplicates()`
`50`	`55`
`51`	`56`	`## remove test users`
`@@ -58,37 +63,12 @@`
`58`	`63`	`## move text field into value if not null`
`59`	`64`	`df_long['value'] = df_long.apply(lambda x: x['text'] if x['text'] is not None else x['value'], axis = 1)`
`60`	`65`
`61`		`-## pivot`
`62`		`-columns = ['article-desc', 'desc', 'location', 'start-date']`
	`66`	`+## pivot, join variables with multiple values with ;`
`63`	`67`	`indexes = ['event_id', 'coder_id', 'article_id', 'publication', 'pub_date', 'title']`
`64`		`-df_wide = df_long[df_long['variable'].isin(columns)].\`
`65`		`- pivot(index = indexes, columns = 'variable', values = 'value')`
`66`		`-`
`67`		`-## rename a few things to be MySQL and SQLAlchemy friendly`
`68`		`-df_wide = df_wide.rename(columns = {'article-desc': 'article_desc', 'start-date': 'start_date'})`
`69`		`-`
`70`		`-## reset indexes`
`71`		`-df_wide = df_wide.reset_index()`
`72`		`-`
`73`		`-## replace empty values with NaN`
`74`		`-df_wide[df_wide == ''] = np.nan`
`75`		`-`
`76`		`-## upload to MySQL`
`77`		`-df_wide.to_sql(name = 'event_metadata',`
`78`		`- con = mysql_engine,`
`79`		`- if_exists= 'replace',`
`80`		`- index = True,`
`81`		`- index_label = 'id',`
`82`		`- dtype = {`
`83`		`- 'id': sqlalchemy.types.Integer(),`
`84`		`- 'coder_id': sqlalchemy.types.Text(),`
`85`		`- 'event_id': sqlalchemy.types.Integer(),`
`86`		`- 'article_id': sqlalchemy.types.Integer(),`
`87`		`- 'article_desc': sqlalchemy.types.UnicodeText(),`
`88`		`- 'desc': sqlalchemy.types.UnicodeText(),`
`89`		`- 'location': sqlalchemy.types.Text(),`
`90`		`- 'start_date': sqlalchemy.types.Date(),`
`91`		`- 'publication': sqlalchemy.types.Text(),`
`92`		`- 'pub_date': sqlalchemy.types.Date(),`
`93`		`- 'title': sqlalchemy.types.Text()`
`94`		`- })`
	`68`	`+df_wide = pd.pivot_table(data = df_long,`
	`69`	`+ index = indexes,`
	`70`	`+ columns = 'variable',`
	`71`	`+ values = 'value',`
	`72`	`+ aggfunc = lambda x: ';'.join(x))`
	`73`	`+`
	`74`	`+df_wide.to_csv('../exports/pivoted-events_{}.csv'.format(dt.datetime.now().strftime('%Y-%m-%d')))`

`‎scripts/generate_event_metadata.py‎`

Lines changed: 94 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,94 @@`
	`1`	`+import pandas as pd`
	`2`	`+import numpy as np`
	`3`	`+import sqlalchemy`
	`4`	`+`
	`5`	`+import os`
	`6`	`+import sys`
	`7`	`+import yaml`
	`8`	`+`
	`9`	`+sys.path.insert(0, os.path.join(os.path.abspath('.'), 'scripts'))`
	`10`	`+`
	`11`	`+from context import config`
	`12`	`+`
	`13`	`+## MySQL setup`
	`14`	`+mysql_engine = sqlalchemy.create_engine(`
	`15`	`+ 'mysql://%s:%s@localhost/%s?unix_socket=%s&charset=%s' %`
	`16`	`+ (config.MYSQL_USER,`
	`17`	`+ config.MYSQL_PASS,`
	`18`	`+ config.MYSQL_DB,`
	`19`	`+ config.MYSQL_SOCK,`
	`20`	`+ 'utf8mb4'), convert_unicode=True)`
	`21`	`+`
	`22`	`+## get the users to skip`
	`23`	`+non_users = ['test1', 'admin', 'tina', 'alex', 'ellen', 'ishita', 'andrea', 'karishma']`
	`24`	`+`
	`25`	`+## get the disqualifying information rows`
	`26`	`+disqualifying_variables = yaml.load(`
	`27`	`+ open(os.path.join(os.path.abspath('..'), 'yes-no.yaml'), 'r'),`
	`28`	`+ Loader = yaml.BaseLoader)`
	`29`	`+disqualifying_variables = [x[0] for x in disqualifying_variables['Disqualifying information']]`
	`30`	`+`
	`31`	`+query = """SELECT`
	`32`	`+ event_id,`
	`33`	`+ u.username coder_id,`
	`34`	`+ variable,`
	`35`	`+ value,`
	`36`	`+ cec.text,`
	`37`	`+ am.id article_id,`
	`38`	`+ am.pub_date,`
	`39`	`+ am.publication,`
	`40`	`+ am.title`
	`41`	`+FROM coder_event_creator cec`
	`42`	`+LEFT JOIN article_metadata am ON (cec.article_id = am.id)`
	`43`	`+LEFT JOIN user u ON (cec.coder_id = u.id)"""`
	`44`	`+`
	`45`	`+## get the query`
	`46`	`+df_long = pd.read_sql(query, con = mysql_engine)`
	`47`	`+`
	`48`	`+## there should not be duplicates but here we are`
	`49`	`+df_long = df_long.drop_duplicates()`
	`50`	`+`
	`51`	`+## remove test users`
	`52`	`+df_long = df_long[~df_long['coder_id'].isin(non_users)]`
	`53`	`+`
	`54`	`+## get disqualified events and remove`
	`55`	`+disqualified_events = df_long[df_long['variable'].isin(disqualifying_variables)].event_id.unique()`
	`56`	`+df_long = df_long[~df_long['event_id'].isin(disqualified_events)]`
	`57`	`+`
	`58`	`+## move text field into value if not null`
	`59`	`+df_long['value'] = df_long.apply(lambda x: x['text'] if x['text'] is not None else x['value'], axis = 1)`
	`60`	`+`
	`61`	`+## pivot`
	`62`	`+columns = ['article-desc', 'desc', 'location', 'start-date']`
	`63`	`+indexes = ['event_id', 'coder_id', 'article_id', 'publication', 'pub_date', 'title']`
	`64`	`+df_wide = df_long[df_long['variable'].isin(columns)].\`
	`65`	`+ pivot(index = indexes, columns = 'variable', values = 'value')`
	`66`	`+`
	`67`	`+## rename a few things to be MySQL and SQLAlchemy friendly`
	`68`	`+df_wide = df_wide.rename(columns = {'article-desc': 'article_desc', 'start-date': 'start_date'})`
	`69`	`+`
	`70`	`+## reset indexes`
	`71`	`+df_wide = df_wide.reset_index()`
	`72`	`+`
	`73`	`+## replace empty values with NaN`
	`74`	`+df_wide[df_wide == ''] = np.nan`
	`75`	`+`
	`76`	`+## upload to MySQL`
	`77`	`+df_wide.to_sql(name = 'event_metadata',`
	`78`	`+ con = mysql_engine,`
	`79`	`+ if_exists= 'replace',`
	`80`	`+ index = True,`
	`81`	`+ index_label = 'id',`
	`82`	`+ dtype = {`
	`83`	`+ 'id': sqlalchemy.types.Integer(),`
	`84`	`+ 'coder_id': sqlalchemy.types.Text(),`
	`85`	`+ 'event_id': sqlalchemy.types.Integer(),`
	`86`	`+ 'article_id': sqlalchemy.types.Integer(),`
	`87`	`+ 'article_desc': sqlalchemy.types.UnicodeText(),`
	`88`	`+ 'desc': sqlalchemy.types.UnicodeText(),`
	`89`	`+ 'location': sqlalchemy.types.Text(),`
	`90`	`+ 'start_date': sqlalchemy.types.Date(),`
	`91`	`+ 'publication': sqlalchemy.types.Text(),`
	`92`	`+ 'pub_date': sqlalchemy.types.Date(),`
	`93`	`+ 'title': sqlalchemy.types.Text()`
	`94`	`+ })`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit a8ac600

File tree

2 files changed

2 files changed

`‎scripts/cec_long_to_wide.py‎`

`‎scripts/generate_event_metadata.py‎`

0 commit comments