import os
import sys

import numpy as np
import pandas as pd
import sqlalchemy
import yaml

# Put the ``scripts`` directory (resolved against the current working
# directory) at the front of the import path so ``context`` can be found.
sys.path.insert(0, os.path.join(os.path.abspath('.'), 'scripts'))

from context import config

## MySQL setup
# Credentials, database name and socket path come from ``config``; the
# connection charset is fixed to utf8mb4.
_mysql_url = 'mysql://%s:%s@localhost/%s?unix_socket=%s&charset=%s' % (
    config.MYSQL_USER,
    config.MYSQL_PASS,
    config.MYSQL_DB,
    config.MYSQL_SOCK,
    'utf8mb4',
)
# NOTE(review): ``convert_unicode`` is a legacy engine flag -- confirm the
# installed SQLAlchemy version still accepts it.
mysql_engine = sqlalchemy.create_engine(_mysql_url, convert_unicode=True)
## get the users to skip
non_users = ['test1', 'admin', 'tina', 'alex', 'ellen', 'ishita', 'andrea', 'karishma']

## get the disqualifying information rows
# Read the YAML inside a context manager so the file handle is closed as soon
# as parsing finishes (the bare ``open(...)`` previously leaked it).
# The path is resolved against the parent of the current working directory.
with open(os.path.join(os.path.abspath('..'), 'yes-no.yaml'), 'r') as yaml_file:
    disqualifying_variables = yaml.load(yaml_file, Loader=yaml.BaseLoader)

# Keep only the first element of each entry listed under
# 'Disqualifying information'.
disqualifying_variables = [x[0] for x in disqualifying_variables['Disqualifying information']]
# Select every ``coder_event_creator`` row together with the matching article
# metadata and the coder's username. Both joins are LEFT JOINs, so rows
# without a matching article or user are still returned.
query = """SELECT
    event_id,
    u.username coder_id,
    variable,
    value,
    cec.text,
    am.id article_id,
    am.pub_date,
    am.publication,
    am.title
FROM coder_event_creator cec
LEFT JOIN article_metadata am ON (cec.article_id = am.id)
LEFT JOIN user u ON (cec.coder_id = u.id)"""

## get the query
df_long = pd.read_sql(sql=query, con=mysql_engine)
## there should not be duplicates but here we are
df_long = df_long.drop_duplicates()

## remove test users
df_long = df_long[~df_long['coder_id'].isin(non_users)]

## get disqualified events and remove
# An event is dropped entirely if any of its rows carries a disqualifying
# variable.
disqualified_events = df_long.loc[
    df_long['variable'].isin(disqualifying_variables), 'event_id'
].unique()
# Take an explicit copy so the column assignment below writes to an
# independent frame rather than to a filtered slice.
df_long = df_long[~df_long['event_id'].isin(disqualified_events)].copy()

## move text field into value if not null
# Vectorized replacement for the former row-wise ``apply``: wherever ``text``
# is present it replaces ``value``; otherwise ``value`` is kept. ``notnull``
# treats NaN as missing too (the old ``is not None`` check did not), and the
# expression also works when the frame has no rows.
df_long['value'] = df_long['text'].where(df_long['text'].notnull(), df_long['value'])
## pivot
# Variables that become columns of the wide table, and the fields that
# identify one output row.
columns = ['article-desc', 'desc', 'location', 'start-date']
indexes = ['event_id', 'coder_id', 'article_id', 'publication', 'pub_date', 'title']

_is_pivot_variable = df_long['variable'].isin(columns)
df_wide = (
    df_long[_is_pivot_variable]
    .pivot(index=indexes, columns='variable', values='value')
    ## rename a few things to be MySQL and SQLAlchemy friendly
    .rename(columns={'article-desc': 'article_desc', 'start-date': 'start_date'})
    ## reset indexes
    .reset_index()
)

## replace empty values with NaN
df_wide[df_wide == ''] = np.nan
## upload to MySQL
# Explicit SQL type for every column written to the ``event_metadata`` table.
_sql_types = sqlalchemy.types
_event_metadata_dtypes = {
    'id': _sql_types.Integer(),
    'coder_id': _sql_types.Text(),
    'event_id': _sql_types.Integer(),
    'article_id': _sql_types.Integer(),
    'article_desc': _sql_types.UnicodeText(),
    'desc': _sql_types.UnicodeText(),
    'location': _sql_types.Text(),
    'start_date': _sql_types.Date(),
    'publication': _sql_types.Text(),
    'pub_date': _sql_types.Date(),
    'title': _sql_types.Text(),
}

# ``if_exists='replace'`` replaces any existing table of the same name; the
# frame's index is written as the ``id`` column.
df_wide.to_sql(
    name='event_metadata',
    con=mysql_engine,
    if_exists='replace',
    index=True,
    index_label='id',
    dtype=_event_metadata_dtypes,
)
0 commit comments