I used this API to download some data.
The code is supposed to create a dataframe table using specific fields of an object, and it seems to work, but running slow.
I think it would run faster if I used something instead of json_nomarlize
.
def create_table(data: List[dict]) -> pd.DataFrame:
"""
Creates pd.DataFrame using `infocard` values,
calculates `income` using `["unified_source"]["step_11"][k]` values.
Note that `[k]` keys are randomly generated.
Please see sample data at the end of the post.
"""
infocard_container = []
step_11_container = []
for i in range(len(data)):
infocard_container.append(json_normalize(data[i]["infocard"]))
step_11_subcontainer = []
try: # if document contains ["unified_source"]["step_11"] field
for k in data[i]["unified_source"]["step_11"].keys():
s_df = json_normalize(data[i]["unified_source"]["step_11"][k])[
["person", "sizeIncome"]
]
income_sum = s_df.loc[s_df["person"].eq("1")].sum()["sizeIncome"]
step_11_subcontainer.append(income_sum)
except KeyError: # if not, NaN would be appended
step_11_subcontainer.append(np.nan)
step_11_container.append(step_11_subcontainer)
df = pd.concat(infocard_container)
df["income"] = [sum(i) for i in step_11_container]
assert len(data) == len(df)
return df
I need all data[0]["infocard"]
values, and within data[0]["unified_source"]["step_11"][k]
values, I need person
, sizeIncome
values (and source_ua_company_code
if possible - it's not in the code). I'm adding all sizeIncome
values if the person
== 1
Sample data (data[0]):
{'guid': 'nacp_3bb5b983-edd9...',
'infocard': {'first_name': 'NAME',
'patronymic': 'SNAME',
'last_name': 'SURNAME',
'office': '"OFFICE"',
'position': 'POSITION"',
'source': 'NACP',
'id': 'nacp_3bb5b983-edd9...',
'url': 'https://declarations.com.ua/declaration/nacp_3bb5b983-edd9...',
'document_type': 'Yearly',
'is_corrected': False,
'created_date': '2018-03-27T00:00:00',
'declaration_year': 2017},
'raw_source': {'url': 'https://public-api.nazk.gov.ua/v1/declaration/nacp_3bb5b983-edd9...'},
'unified_source': {'step_0': {'declarationType': '1',
'declarationYear1': '2017'},
'step_1': {'actual_cityType': '[hidden]',
'actual_country': '',
'actual_postCode': '[hidden]',
'actual_street': '[hidden]',
'actual_streetType': '[hidden]',
'changedName': False,
'city': '[hidden]',
'cityPath': '[hidden]',
'cityType': '[hidden]',
'city_extendedstatus': '1',
'corruptionAffected': 'No',
'country': '1',
'countryPath': '',
'district': '[hidden]',
'eng_actualAddress': '[hidden]',
'eng_actualPostCode': '[hidden]',
'eng_postCode': '',
'eng_sameRegLivingAddress': '[hidden]',
'firstname': 'NAME',
'housePartNum_extendedstatus': '1',
'lastname': 'SURNAME',
'middlename': 'SNAME',
'postCategory': '',
'postCategory_extendedstatus': '1',
'postCode': '[hidden]',
'postType': '',
'postType_extendedstatus': '1',
'previous_firstname': '',
'previous_lastname': '',
'previous_middlename': '',
'region': '[hidden]',
'responsiblePosition': 'Ні',
'sameRegLivingAddress': '[hidden]',
'street': '[hidden]',
'streetType': '[hidden]',
'ukr_actualAddress': '[hidden]',
'workPlace': 'workPlace',
'workPost': 'workPost',
'dnt_organization_group': 'n'},
'step_11': {'1493274700481': {'incomeSource': 'j',
'iteration': '1493274700481',
'objectType': 'salary',
'otherObjectType': '',
'person': '1',
'rights': {'1': {'citizen': '',
'eng_company_address': '',
'eng_company_code': '',
'eng_company_name': '',
'eng_firstname': '',
'eng_fullname': '',
'eng_lastname': '',
'eng_middlename': '',
'eng_middlename_extendedstatus': '',
'eng_postCode': '',
'eng_postCode_extendedstatus': '',
'otherOwnership': '',
'ownershipType': 'property',
'percent-ownership': '',
'postCode': '[hidden]',
'rightBelongs': '1',
'rights_cityPath': '',
'ua_apartmentsNum_extendedstatus': '',
'ua_city': '',
'ua_company_code': '',
'ua_company_name': '',
'ua_firstname': '',
'ua_houseNum_extendedstatus': '',
'ua_housePartNum_extendedstatus': '',
'ua_lastname': '',
'ua_middlename': '',
'ua_middlename_extendedstatus': '',
'ua_postCode': '',
'ua_postCode_extendedstatus': '',
'ua_street': '[hidden]',
'ua_streetType': '[hidden]',
'ua_street_extendedstatus': '',
'ukr_company_address': '',
'ukr_company_name': '',
'ukr_firstname': '',
'ukr_fullname': '',
'ukr_lastname': '',
'ukr_middlename': '',
'ukr_middlename_extendedstatus': ''}},
'sizeIncome': 44505.0,
'source_citizen': '_',
'source_eng_company_address': '',
'source_eng_company_code': '',
'source_eng_company_name': '',
'source_eng_fullname': '',
'source_ua_company_code': '000000',
'source_ua_company_code_extendedstatus': '0',
'source_ua_company_name': '_',
'source_ua_firstname': '',
'source_ua_lastname': '',
'source_ua_middlename': '',
'source_ua_sameRegLivingAddress': '',
'source_ukr_company_address': '',
'source_ukr_company_name': '',
'source_ukr_fullname': '',
'dnt_sizeIncome_hidden': False,
'dnt_objectType_encoded': 'salarymain',
'dnt_is_foreign': False},
'1493274779231': {'incomeSource': '1',
'iteration': '1493274779231',
'objectType': 'business',
'otherObjectType': '',
'person': '1',
'rights': {'1': {'citizen': '',
'eng_company_address': '',
'eng_company_code': '',
'eng_company_name': '',
'eng_firstname': '',
'eng_fullname': '',
'eng_lastname': '',
'eng_middlename': '',
'eng_middlename_extendedstatus': '',
'eng_postCode': '',
'eng_postCode_extendedstatus': '',
'otherOwnership': '',
'ownershipType': 'property',
'percent-ownership': '',
'postCode': '[hidden]',
'rightBelongs': '1',
'rights_cityPath': '',
'ua_apartmentsNum_extendedstatus': '',
'ua_city': '',
'ua_company_code': '',
'ua_company_name': '',
'ua_firstname': '',
'ua_houseNum_extendedstatus': '',
'ua_housePartNum_extendedstatus': '',
'ua_lastname': '',
'ua_middlename': '',
'ua_middlename_extendedstatus': '',
'ua_postCode': '',
'ua_postCode_extendedstatus': '',
'ua_street': '[hidden]',
'ua_streetType': '[hidden]',
'ua_street_extendedstatus': '',
'ukr_company_address': '',
'ukr_company_name': '',
'ukr_firstname': '',
'ukr_fullname': '',
'ukr_lastname': '',
'ukr_middlename': '',
'ukr_middlename_extendedstatus': ''}},
'sizeIncome': 19100.0,
'source_citizen': '',
'source_eng_company_address': '',
'source_eng_company_code': '',
'source_eng_company_name': '',
'source_eng_fullname': '',
'source_ua_company_code': '',
'source_ua_company_name': '',
'source_ua_firstname': '',
'source_ua_lastname': '',
'source_ua_middlename': '',
'source_ua_sameRegLivingAddress': '',
'source_ukr_company_address': '',
'source_ukr_company_name': '',
'source_ukr_fullname': '',
'dnt_sizeIncome_hidden': False,
'dnt_objectType_encoded': 'business',
'dnt_is_foreign': False},
'1493275433175': {'incomeSource': 'j',
'iteration': '1493275433175',
'objectType': 'pension',
'otherObjectType': '',
'person': '1',
'rights': {'1': {'citizen': '',
'eng_company_address': '',
'eng_company_code': '',
'eng_company_name': '',
'eng_firstname': '',
'eng_fullname': '',
'eng_lastname': '',
'eng_middlename': '',
'eng_middlename_extendedstatus': '',
'eng_postCode': '',
'eng_postCode_extendedstatus': '',
'otherOwnership': '',
'ownershipType': 'property',
'percent-ownership': '',
'postCode': '[hidden]',
'rightBelongs': '1',
'rights_cityPath': '',
'ua_apartmentsNum_extendedstatus': '',
'ua_city': '',
'ua_company_code': '',
'ua_company_name': '',
'ua_firstname': '',
'ua_houseNum_extendedstatus': '',
'ua_housePartNum_extendedstatus': '',
'ua_lastname': '',
'ua_middlename': '',
'ua_middlename_extendedstatus': '',
'ua_postCode': '',
'ua_postCode_extendedstatus': '',
'ua_street': '[hidden]',
'ua_streetType': '[hidden]',
'ua_street_extendedstatus': '',
'ukr_company_address': '',
'ukr_company_name': '',
'ukr_firstname': '',
'ukr_fullname': '',
'ukr_lastname': '',
'ukr_middlename': '',
'ukr_middlename_extendedstatus': ''}},
'sizeIncome': 20075.0,
'source_citizen': '_',
'source_eng_company_address': '',
'source_eng_company_code': '',
'source_eng_company_name': '',
'source_eng_fullname': '',
'source_ua_company_code': '',
'source_ua_company_code_extendedstatus': '2',
'source_ua_company_name': '_',
'source_ua_firstname': '',
'source_ua_lastname': '',
'source_ua_middlename': '',
'source_ua_sameRegLivingAddress': '',
'source_ukr_company_address': '',
'source_ukr_company_name': '',
'source_ukr_fullname': '',
'dnt_sizeIncome_hidden': False,
'dnt_objectType_encoded': 'pension',
'dnt_is_foreign': False}},
'step_3': {'1493273226784': {'city': '[hidden]',
'cityPath': '[hidden]',
'costAssessment': 0,
'costAssessment_extendedstatus': '2',
'costDate': 0,
'costDate_extendedstatus': '2',
'country': '1',
'district': '[hidden]',
'iteration': '1493273226784',
'objectType': '_',
'otherObjectType': '',
'owningDate': 'date',
'person': '1',
'postCode': '[hidden]',
'regNumber_extendedstatus': '1',
'region': '[hidden]',
'rights': {'1': {'citizen': '',
'eng_company_address': '',
'eng_company_code': '',
'eng_company_name': '',
'eng_firstname': '',
'eng_fullname': '',
'eng_lastname': '',
'eng_middlename': '',
'eng_middlename_extendedstatus': '',
'eng_postCode': '',
'eng_postCode_extendedstatus': '',
'otherOwnership': '',
'ownershipType': '_',
'percent-ownership': '50',
'postCode': '[hidden]',
'rightBelongs': '1',
'rights_cityPath': '',
'ua_apartmentsNum_extendedstatus': '',
'ua_city': '',
'ua_company_code': '',
'ua_company_name': '',
'ua_firstname': '',
'ua_houseNum_extendedstatus': '',
'ua_housePartNum_extendedstatus': '',
'ua_lastname': '',
'ua_middlename': '',
'ua_middlename_extendedstatus': '',
'ua_postCode': '',
'ua_postCode_extendedstatus': '',
'ua_street': '[hidden]',
'ua_streetType': '[hidden]',
'ua_street_extendedstatus': '',
'ukr_company_address': '',
'ukr_company_name': '',
'ukr_firstname': '',
'ukr_fullname': '',
'ukr_lastname': '',
'ukr_middlename': '',
'ukr_middlename_extendedstatus': '',
'dnt_ownershipType_encoded': 'ownproperty'}},
'totalArea': 32.3,
'ua_cityType': '_',
'ua_housePartNum_extendedstatus': '1',
'ua_postCode': '_',
'ua_street': '[hidden]',
'ua_streetType': '[hidden]',
'dnt_costDate_hidden': True,
'dnt_costAssessment_hidden': True,
'dnt_totalArea_hidden': False,
'dnt_objectType_encoded': 'apt'}}},
'related_entities': {'people': {'family': []},
'documents': {'corrected': [], 'originals': []},
'companies': {'owned': [], 'related': ['_'], 'all': ['_']}}}
1 Answer 1
Use some generators.
Separate out your logic for iterating through your data. For instance, don't form a "subcontainer" list at all since you're just summing it. Also, use values()
instead of keys()
for your situation.
def step_11(datum: dict)
for v in datum["unified_source"]["step_11"].values():
s_df = json_normalize(v)[
["person", "sizeIncome"]
]
yield s_df.loc[s_df["person"].eq("1")].sum()["sizeIncome"]
...
df['income'] = [sum(step_11(d)) for d in data]
-
1\$\begingroup\$
data
andi
will raise aNameError
unless they are global. Should this befor v in datum[...].values()
? \$\endgroup\$C.Nivs– C.Nivs2019年09月24日 17:20:45 +00:00Commented Sep 24, 2019 at 17:20