Putting the results of a first JSON file into subparts of a second one when the key matches

Question 1

I have a GeoJSON file with results for each province and another one that gives other results for each constituency (an administrative subdivision of a province). I would like to produce a third file that attaches the results from the first file to every constituency in the second file that belongs to the same province, so I wrote the following code. It works, but it is not optimized at all for the actual file, which is far bigger and which I can share with you if you think it would help to optimize the code.

import json
import pandas as pd
def find_segment(province_queried):
 # Looks for the feature of research.geojson whose properties "name" equals
 # province_queried and returns that feature's properties "segments".
 # NOTE(review): the file is opened and parsed again on every call.
 with open('research.geojson', encoding='utf-8-sig') as f:
 dct_research = json.load(f)
 for feature in dct_research['features']:
 # NOTE(review): key is never read in the lines below. The original
 # indentation was lost, so confirm whether the lookup and the comparison
 # sit inside this loop (in that case a feature without "results" can
 # never match).
 for key in feature.get("properties", {}).get("results", {}):
 province = feature.get("properties", {}).get("name")
 segments = feature.get("properties", {}).get("segments")
 if province == province_queried:
 return segments
def main():
 # Reads maroc-swing.json, collects rows holding the party key, province,
 # constituency name and the segments returned by find_segment(), and writes
 # them to constituencies_with_segments.csv through a pandas DataFrame.
 with open('maroc-swing.json') as f:
 dct_constituencies = json.load(f)
 # NOTE(review): i and j are assigned here but never read in this function.
 i = 0
 j = 0
 d = []
 for feature in dct_constituencies['features']:
 # Iterating the "results" mapping yields its keys (the party names).
 for key in feature.get("properties", {}).get("results", {}):
 province = feature.get("properties", {}).get("name_1")
 # NOTE(review): the fallback for a missing name_4 is an empty dict, not a
 # string — confirm this is intended.
 constituency = feature.get("properties", {}).get("name_4", {})
 # NOTE(review): none of province, constituency or segments depends on
 # key; if these lines sit inside the key loop (indentation was lost),
 # find_segment() runs once per party of every feature.
 segments = find_segment(province)
 d.append({"Party Affiliation": key,
 "Province": province,
 "Constituency Name": constituency,
 "segments": segments})
 # columns= fixes the column order of the resulting DataFrame.
 column_names = ["Province", "Constituency Name", "Party Affiliation", "segments"]
 df = pd.DataFrame(d, columns=column_names)
 df.to_csv("constituencies_with_segments.csv")
# Call main() only when this file is run as a script, not when it is imported.
if __name__ == '__main__':
 main()

If you prefer it with drawings, I'm basically transforming the right-hand file:

enter image description here

Into:

enter image description here

It means that all the constituencies of the same province will have the same results, which come from 'research.json'. Right now I'm trying to do it with the key name_2.

Here is constituences.json:

{
 "type": "FeatureCollection",
 "totalFeatures": 1515,
 "features": [
 {
 "type": "Feature",
 "id": "fd597jf1799.1",
 "geometry": {
 "type": "MultiPolygon",
 "coordinates": [
 [
 [
 [
 -7.27163887,
 33.24041367
 ],
 [
 -7.27286911,
 33.24623871
 ],
 [
 -7.26732922,
 33.25904083
 ]
 ]
 ]
 ]
 },
 "geometry_name": "geom",
 "properties": {
 "id_0": 152,
 "iso": "MAR",
 "name_0": "Morocco",
 "id_1": 1,
 "name_1": "Chaouia - Ouardigha",
 "id_2": 1,
 "name_2": "Ben Slimane",
 "id_3": 1,
 "name_3": "Ben Slimane",
 "id_4": 1,
 "name_4": "Ahlaf",
 "varname_4": null,
 "ccn_4": 0,
 "cca_4": null,
 "type_4": "Commune Rural",
 "engtype_4": "Rural Commune",
 "bbox": [
 -7.27286911,
 33.22112656,
 -6.93353081,
 33.38970184
 ],
 "swing_count": 1,
 "polling_station_count": 15,
 "turnout": 0.4780299144225693,
 "results": {
 "PI": 187,
 "PJD": 88,
 "PAM": 59,
 "USFP": 1530,
 "APFGD": 2,
 "PPS": 15,
 "RNI": 708,
 "MP": 56,
 "UC": 3,
 "FFD": 0,
 "MDS": 0,
 "AAR": 0,
 "P Neo-Democrates": 8,
 "PEDD": 0,
 "PRD": 2,
 "PRV": 0,
 "PDI": 0,
 "PGVM": 0,
 "PALAMAL": 0,
 "PCS": 0,
 "PUD": 0,
 "PDN": 1,
 "PLJS": 0,
 "PSD": 0,
 "P Annahda": 0,
 "PA": 0,
 "UMD": 0,
 "USAPMD": 10
 },
 "voter_file": {
 "nbre_sieges": 3,
 "nbre_inscrits": 5953,
 "nbre_votants": 2997,
 "nbre_nuls": 328,
 "nbre_exprimees": 2669
 },
 "swing_ratio": 0.06666666666666667
 }
 },
 {
 "type": "Feature",
 "id": "fd597jf1799.2",
 "geometry": {
 "type": "MultiPolygon",
 "coordinates": [
 [
 [
 [
 -7.00001287,
 33.63414383
 ],
 [
 -7.00081205,
 33.6269989
 ],
 [
 -6.99825382,
 33.60465622
 ]
 ]
 ]
 ]
 },
 "geometry_name": "geom",
 "properties": {
 "id_0": 152,
 "iso": "MAR",
 "name_0": "Morocco",
 "id_1": 1,
 "name_1": "Chaouia - Ouardigha",
 "id_2": 1,
 "name_2": "Ben Slimane",
 "id_3": 1,
 "name_3": "Ben Slimane",
 "id_4": 2,
 "name_4": "Ain Tizgha",
 "varname_4": null,
 "ccn_4": 0,
 "cca_4": null,
 "type_4": "Commune Rural",
 "engtype_4": "Rural Commune",
 "bbox": [
 -7.12737417,
 33.57954407,
 -6.99144888,
 33.78071213
 ],
 "swing_count": 11,
 "polling_station_count": 23,
 "turnout": 0.3912592182242994,
 "results": {
 "PI": 1837,
 "PJD": 366,
 "PAM": 143,
 "USFP": 22,
 "APFGD": 44,
 "PPS": 773,
 "RNI": 109,
 "MP": 111,
 "UC": 9,
 "FFD": 0,
 "MDS": 0,
 "AAR": 0,
 "P Neo-Democrates": 76,
 "PEDD": 27,
 "PRD": 2,
 "PRV": 0,
 "PDI": 0,
 "PGVM": 0,
 "PALAMAL": 0,
 "PCS": 0,
 "PUD": 0,
 "PDN": 1,
 "PLJS": 0,
 "PSD": 0,
 "P Annahda": 0,
 "PA": 0,
 "UMD": 2,
 "USAPMD": 514
 },
 "voter_file": {
 "nbre_sieges": 3,
 "nbre_inscrits": 8262,
 "nbre_votants": 4479,
 "nbre_nuls": 443,
 "nbre_exprimees": 4036
 },
 "swing_ratio": 0.4782608695652174
 }
 }
 ],
 "crs": {
 "type": "name",
 "properties": {
 "name": "urn:ogc:def:crs:EPSG::4326"
 }
 },
 "bbox": [
 -13.2287693,
 27.62881088,
 -0.93655348,
 35.96390533
 ]
}

And here's 'research.json':

{
 "type": "FeatureCollection",
 "features": [
 {
 "geometry": {
 "type": "MultiPolygon",
 "coordinates": [
 [
 [
 [
 -7.18458319,
 33.81124878
 ],
 [
 -7.18458319,
 33.81097412
 ],
 [
 -7.18319511,
 33.81097412
 ]
 ]
 ]
 ]
 },
 "type": "Feature",
 "id": "md898kw3185.1",
 "properties": {
 "name": "Ben Slimane",
 "type": "Province",
 "segments": {
 "UND": {
 "I don't know yet": 16,
 "No": 3,
 "Yes": 5,
 "total": 24,
 "intention_rate": 20.83
 },
 "ABS": {
 "I don't know yet": 1,
 "No": 10,
 "Yes": 1,
 "total": 12,
 "intention_rate": 8.33
 },
 "PJD": {
 "I don't know yet": 1,
 "Yes": 3,
 "total": 4,
 "intention_rate": 75
 },
 "PAM": {
 "I don't know yet": 1,
 "Yes": 1,
 "total": 2,
 "intention_rate": 50
 },
 "OTH": {
 "I don't know yet": 1,
 "No": 4,
 "Yes": 4,
 "total": 9,
 "intention_rate": 44.44
 },
 "RNI": {
 "Yes": 2,
 "total": 2,
 "intention_rate": 100
 },
 "IST": {
 "I don't know yet": 1,
 "Yes": 1,
 "total": 2,
 "intention_rate": 50
 }
 },
 "sample_size": 55
 }
 },
 {
 "geometry": {
 "type": "MultiPolygon",
 "coordinates": [
 [
 [
 [
 -6.3649292,
 33.22292328
 ],
 [
 -6.38369083,
 33.21116257
 ],
 [
 -6.39487886,
 33.19342422
 ]
 ]
 ]
 ]
 },
 "type": "Feature",
 "id": "md898kw3185.2",
 "properties": {
 "name": "Khouribga",
 "type": "Province",
 "segments": {
 "UND": {
 "I don't know yet": 46,
 "No": 12,
 "Yes": 13,
 "total": 71,
 "intention_rate": 18.31
 },
 "ABS": {
 "I don't know yet": 4,
 "No": 79,
 "Yes": 1,
 "total": 84,
 "intention_rate": 1.19
 },
 "PJD": {
 "I don't know yet": 14,
 "No": 1,
 "Yes": 4,
 "total": 19,
 "intention_rate": 21.05
 },
 "PAM": {
 "I don't know yet": 12,
 "No": 1,
 "Yes": 7,
 "total": 20,
 "intention_rate": 35
 },
 "OTH": {
 "I don't know yet": 3,
 "No": 3,
 "Yes": 2,
 "total": 8,
 "intention_rate": 25
 },
 "RNI": {
 "I don't know yet": 3,
 "Yes": 3,
 "total": 6,
 "intention_rate": 50
 },
 "IST": {
 "I don't know yet": 5,
 "Yes": 1,
 "total": 6,
 "intention_rate": 16.67
 }
 },
 "sample_size": 214
 }
 },
 {
 "geometry": {
 "type": "MultiPolygon",
 "coordinates": [
 [
 [
 [
 -3.77662611,
 34.86683655
 ],
 [
 -3.7705431,
 34.86468506
 ],
 [
 -3.75482011,
 34.86924362
 ]
 ]
 ]
 ]
 },
 "type": "Feature",
 "id": "md898kw3185.57",
 "properties": {
 "name": "Taza",
 "type": "Province",
 "segments": {
 "UND": {
 "I don't know yet": 16,
 "No": 28,
 "Yes": 14,
 "total": 58,
 "intention_rate": 24.14
 },
 "ABS": {
 "I don't know yet": 2,
 "No": 29,
 "Yes": 1,
 "total": 32,
 "intention_rate": 3.12
 },
 "PJD": {
 "I don't know yet": 9,
 "No": 4,
 "Yes": 23,
 "total": 36,
 "intention_rate": 63.89
 },
 "PAM": {
 "I don't know yet": 4,
 "No": 1,
 "Yes": 1,
 "total": 6,
 "intention_rate": 16.67
 },
 "OTH": {
 "I don't know yet": 3,
 "No": 3,
 "Yes": 5,
 "total": 11,
 "intention_rate": 45.45
 },
 "RNI": {
 "total": 0,
 "intention_rate": 0
 },
 "IST": {
 "I don't know yet": 2,
 "No": 2,
 "Yes": 5,
 "total": 9,
 "intention_rate": 55.56
 }
 },
 "sample_size": 152
 }
 }
 ]
}

Sadly my attempt takes too much time. Do you know how I can optimize it, or do you have a better idea in mind?

Question 2

find_segment loads the JSON file every time it is called. It would be much faster to load the data once and return a dict mapping province to segments. Also, key doesn't appear to be used in find_segment, so remove that for-loop.

def load_research(filename='research.geojson'):
    """Load a GeoJSON file and return its segments keyed by province name.

    Args:
        filename: Path of the GeoJSON file to read. The file is opened with
            the ``utf-8-sig`` codec.

    Returns:
        A dict mapping each feature's ``properties["name"]`` to that
        feature's ``properties["segments"]``. A feature without a name is
        stored under ``None``; a feature without segments maps to ``None``.
        When several features share a name, the last one wins.

    Raises:
        KeyError: If the document has no top-level ``"features"`` entry.
    """
    with open(filename, encoding='utf-8-sig') as f:
        dct_research = json.load(f)

    data = {}
    for feature in dct_research['features']:
        # A feature may carry "properties": null. In that case
        # .get("properties", {}) returns None (the key exists), so the
        # default never applies; fall back to an empty dict explicitly.
        properties = feature.get("properties") or {}
        data[properties.get("name")] = properties.get("segments")
    return data

In main() it looks like province, constituency, and segments only depend on feature and not on key. So calculate them before the for key ... loop. That should speed things up a lot.

def main():
    """Write one CSV row per (constituency, party) with the province's segments.

    Reads ``maroc-swing.json``, and for every feature emits one row per key of
    its ``results`` mapping. Each row carries the feature's ``name_1``
    (province), ``name_4`` (constituency) and the segments that
    ``load_research()`` holds for that province. The rows are written to
    ``constituencies_with_segments.csv``.
    """
    # load data into a dict keyed by province
    research = load_research()
    with open('maroc-swing.json') as f:
        dct_constituencies = json.load(f)

    d = []
    for feature in dct_constituencies['features']:
        # "properties" may be present but null; treat that like a missing one.
        properties = feature.get("properties") or {}
        province = properties.get("name_1", "")
        constituency = properties.get("name_4", "")
        # A plain dict lookup instead of re-reading the JSON file. .get() is
        # used so that a province absent from the research data yields None
        # for its rows instead of aborting the whole export with KeyError.
        segments = research.get(province)
        for key in properties.get("results") or {}:
            d.append({"Party Affiliation": key,
                      "Province": province,
                      "Constituency Name": constituency,
                      "segments": segments})

    column_names = ["Province", "Constituency Name", "Party Affiliation", "segments"]
    df = pd.DataFrame(d, columns=column_names)
    df.to_csv("constituencies_with_segments.csv")

NB. I haven't tested the code, so there may be some typos etc.

RootTwo RootTwo 10.6k1 gold badge14 silver badges30 bronze badges · Accepted Answer · 2020-05-23 22:56:28Z

find_segment loads the JSON file every time it is called. It would be much faster to load the data once and return a dict mapping province to segments. Also, key doesn't appear to be used in find_segment, so remove that for-loop.

def load_research(filename='research.geojson'):
    """Load a GeoJSON file and return its segments keyed by province name.

    Args:
        filename: Path of the GeoJSON file to read. The file is opened with
            the ``utf-8-sig`` codec.

    Returns:
        A dict mapping each feature's ``properties["name"]`` to that
        feature's ``properties["segments"]``. A feature without a name is
        stored under ``None``; a feature without segments maps to ``None``.
        When several features share a name, the last one wins.

    Raises:
        KeyError: If the document has no top-level ``"features"`` entry.
    """
    with open(filename, encoding='utf-8-sig') as f:
        dct_research = json.load(f)

    data = {}
    for feature in dct_research['features']:
        # A feature may carry "properties": null. In that case
        # .get("properties", {}) returns None (the key exists), so the
        # default never applies; fall back to an empty dict explicitly.
        properties = feature.get("properties") or {}
        data[properties.get("name")] = properties.get("segments")
    return data

In main() it looks like province, constituency, and segments only depend on feature and not on key. So calculate them before the for key ... loop. That should speed things up a lot.

def main():
    """Write one CSV row per (constituency, party) with the province's segments.

    Reads ``maroc-swing.json``, and for every feature emits one row per key of
    its ``results`` mapping. Each row carries the feature's ``name_1``
    (province), ``name_4`` (constituency) and the segments that
    ``load_research()`` holds for that province. The rows are written to
    ``constituencies_with_segments.csv``.
    """
    # load data into a dict keyed by province
    research = load_research()
    with open('maroc-swing.json') as f:
        dct_constituencies = json.load(f)

    d = []
    for feature in dct_constituencies['features']:
        # "properties" may be present but null; treat that like a missing one.
        properties = feature.get("properties") or {}
        province = properties.get("name_1", "")
        constituency = properties.get("name_4", "")
        # A plain dict lookup instead of re-reading the JSON file. .get() is
        # used so that a province absent from the research data yields None
        # for its rows instead of aborting the whole export with KeyError.
        segments = research.get(province)
        for key in properties.get("results") or {}:
            d.append({"Party Affiliation": key,
                      "Province": province,
                      "Constituency Name": constituency,
                      "segments": segments})

    column_names = ["Province", "Constituency Name", "Party Affiliation", "segments"]
    df = pd.DataFrame(d, columns=column_names)
    df.to_csv("constituencies_with_segments.csv")

NB. I haven't tested the code, so there may be some typos etc.

Stack Exchange Network

Putting the results of a first JSON file into subparts of a second one when the key matches

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Putting the results of a first JSON file into subparts of a second one when the key matches

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions