2
\$\begingroup\$

I have a geojson file with results for each province and one that gives other results for each constituency (an administrative part of the province). I would like to make a third one that puts the results of the first one for each 'constituency' level of the second one that has the same province with the first one, so I did the following code. It works but it's not optimized at all for the actual file which is far bigger, and which I can share with you if you think it can help me optimize the code.

import json
import pandas as pd
def find_segment(province_queried):
 with open('research.geojson', encoding='utf-8-sig') as f:
 dct_research = json.load(f)
 for feature in dct_research['features']:
 for key in feature.get("properties", {}).get("results", {}):
 province = feature.get("properties", {}).get("name")
 segments = feature.get("properties", {}).get("segments")
 if province == province_queried:
 return segments
def main():
 with open('maroc-swing.json') as f:
 dct_constituencies = json.load(f)
 i = 0
 j = 0
 d = []
 for feature in dct_constituencies['features']:
 for key in feature.get("properties", {}).get("results", {}):
 province = feature.get("properties", {}).get("name_1")
 constituency = feature.get("properties", {}).get("name_4", {})
 segments = find_segment(province)
 d.append({"Party Affiliation": key,
 "Province": province,
 "Constituency Name": constituency,
 "segments": segments})
 column_names = ["Province", "Constituency Name", "Party Affiliation", "segments"]
 df = pd.DataFrame(d, columns=column_names)
 df.to_csv("constituencies_with_segments.csv")
if __name__ == '__main__':
 main()

If you prefer it with drawings I'm basically transformint the right hand file:

enter image description here

Into:

enter image description here

It means that all the constituents of the same province will have the same results that come from 'research.json'. Right now I'm trying to do it in the key name_2.

Here it is constituences.json:

{
 "type": "FeatureCollection",
 "totalFeatures": 1515,
 "features": [
 {
 "type": "Feature",
 "id": "fd597jf1799.1",
 "geometry": {
 "type": "MultiPolygon",
 "coordinates": [
 [
 [
 [
 -7.27163887,
 33.24041367
 ],
 [
 -7.27286911,
 33.24623871
 ],
 [
 -7.26732922,
 33.25904083
 ]
 ]
 ]
 ]
 },
 "geometry_name": "geom",
 "properties": {
 "id_0": 152,
 "iso": "MAR",
 "name_0": "Morocco",
 "id_1": 1,
 "name_1": "Chaouia - Ouardigha",
 "id_2": 1,
 "name_2": "Ben Slimane",
 "id_3": 1,
 "name_3": "Ben Slimane",
 "id_4": 1,
 "name_4": "Ahlaf",
 "varname_4": null,
 "ccn_4": 0,
 "cca_4": null,
 "type_4": "Commune Rural",
 "engtype_4": "Rural Commune",
 "bbox": [
 -7.27286911,
 33.22112656,
 -6.93353081,
 33.38970184
 ],
 "swing_count": 1,
 "polling_station_count": 15,
 "turnout": 0.4780299144225693,
 "results": {
 "PI": 187,
 "PJD": 88,
 "PAM": 59,
 "USFP": 1530,
 "APFGD": 2,
 "PPS": 15,
 "RNI": 708,
 "MP": 56,
 "UC": 3,
 "FFD": 0,
 "MDS": 0,
 "AAR": 0,
 "P Neo-Democrates": 8,
 "PEDD": 0,
 "PRD": 2,
 "PRV": 0,
 "PDI": 0,
 "PGVM": 0,
 "PALAMAL": 0,
 "PCS": 0,
 "PUD": 0,
 "PDN": 1,
 "PLJS": 0,
 "PSD": 0,
 "P Annahda": 0,
 "PA": 0,
 "UMD": 0,
 "USAPMD": 10
 },
 "voter_file": {
 "nbre_sieges": 3,
 "nbre_inscrits": 5953,
 "nbre_votants": 2997,
 "nbre_nuls": 328,
 "nbre_exprimees": 2669
 },
 "swing_ratio": 0.06666666666666667
 }
 },
 {
 "type": "Feature",
 "id": "fd597jf1799.2",
 "geometry": {
 "type": "MultiPolygon",
 "coordinates": [
 [
 [
 [
 -7.00001287,
 33.63414383
 ],
 [
 -7.00081205,
 33.6269989
 ],
 [
 -6.99825382,
 33.60465622
 ]
 ]
 ]
 ]
 },
 "geometry_name": "geom",
 "properties": {
 "id_0": 152,
 "iso": "MAR",
 "name_0": "Morocco",
 "id_1": 1,
 "name_1": "Chaouia - Ouardigha",
 "id_2": 1,
 "name_2": "Ben Slimane",
 "id_3": 1,
 "name_3": "Ben Slimane",
 "id_4": 2,
 "name_4": "Ain Tizgha",
 "varname_4": null,
 "ccn_4": 0,
 "cca_4": null,
 "type_4": "Commune Rural",
 "engtype_4": "Rural Commune",
 "bbox": [
 -7.12737417,
 33.57954407,
 -6.99144888,
 33.78071213
 ],
 "swing_count": 11,
 "polling_station_count": 23,
 "turnout": 0.3912592182242994,
 "results": {
 "PI": 1837,
 "PJD": 366,
 "PAM": 143,
 "USFP": 22,
 "APFGD": 44,
 "PPS": 773,
 "RNI": 109,
 "MP": 111,
 "UC": 9,
 "FFD": 0,
 "MDS": 0,
 "AAR": 0,
 "P Neo-Democrates": 76,
 "PEDD": 27,
 "PRD": 2,
 "PRV": 0,
 "PDI": 0,
 "PGVM": 0,
 "PALAMAL": 0,
 "PCS": 0,
 "PUD": 0,
 "PDN": 1,
 "PLJS": 0,
 "PSD": 0,
 "P Annahda": 0,
 "PA": 0,
 "UMD": 2,
 "USAPMD": 514
 },
 "voter_file": {
 "nbre_sieges": 3,
 "nbre_inscrits": 8262,
 "nbre_votants": 4479,
 "nbre_nuls": 443,
 "nbre_exprimees": 4036
 },
 "swing_ratio": 0.4782608695652174
 }
 }
 ],
 "crs": {
 "type": "name",
 "properties": {
 "name": "urn:ogc:def:crs:EPSG::4326"
 }
 },
 "bbox": [
 -13.2287693,
 27.62881088,
 -0.93655348,
 35.96390533
 ]
}

And here's 'research.json':

{
 "type": "FeatureCollection",
 "features": [
 {
 "geometry": {
 "type": "MultiPolygon",
 "coordinates": [
 [
 [
 [
 -7.18458319,
 33.81124878
 ],
 [
 -7.18458319,
 33.81097412
 ],
 [
 -7.18319511,
 33.81097412
 ]
 ]
 ]
 ]
 },
 "type": "Feature",
 "id": "md898kw3185.1",
 "properties": {
 "name": "Ben Slimane",
 "type": "Province",
 "segments": {
 "UND": {
 "I don't know yet": 16,
 "No": 3,
 "Yes": 5,
 "total": 24,
 "intention_rate": 20.83
 },
 "ABS": {
 "I don't know yet": 1,
 "No": 10,
 "Yes": 1,
 "total": 12,
 "intention_rate": 8.33
 },
 "PJD": {
 "I don't know yet": 1,
 "Yes": 3,
 "total": 4,
 "intention_rate": 75
 },
 "PAM": {
 "I don't know yet": 1,
 "Yes": 1,
 "total": 2,
 "intention_rate": 50
 },
 "OTH": {
 "I don't know yet": 1,
 "No": 4,
 "Yes": 4,
 "total": 9,
 "intention_rate": 44.44
 },
 "RNI": {
 "Yes": 2,
 "total": 2,
 "intention_rate": 100
 },
 "IST": {
 "I don't know yet": 1,
 "Yes": 1,
 "total": 2,
 "intention_rate": 50
 }
 },
 "sample_size": 55
 }
 },
 {
 "geometry": {
 "type": "MultiPolygon",
 "coordinates": [
 [
 [
 [
 -6.3649292,
 33.22292328
 ],
 [
 -6.38369083,
 33.21116257
 ],
 [
 -6.39487886,
 33.19342422
 ]
 ]
 ]
 ]
 },
 "type": "Feature",
 "id": "md898kw3185.2",
 "properties": {
 "name": "Khouribga",
 "type": "Province",
 "segments": {
 "UND": {
 "I don't know yet": 46,
 "No": 12,
 "Yes": 13,
 "total": 71,
 "intention_rate": 18.31
 },
 "ABS": {
 "I don't know yet": 4,
 "No": 79,
 "Yes": 1,
 "total": 84,
 "intention_rate": 1.19
 },
 "PJD": {
 "I don't know yet": 14,
 "No": 1,
 "Yes": 4,
 "total": 19,
 "intention_rate": 21.05
 },
 "PAM": {
 "I don't know yet": 12,
 "No": 1,
 "Yes": 7,
 "total": 20,
 "intention_rate": 35
 },
 "OTH": {
 "I don't know yet": 3,
 "No": 3,
 "Yes": 2,
 "total": 8,
 "intention_rate": 25
 },
 "RNI": {
 "I don't know yet": 3,
 "Yes": 3,
 "total": 6,
 "intention_rate": 50
 },
 "IST": {
 "I don't know yet": 5,
 "Yes": 1,
 "total": 6,
 "intention_rate": 16.67
 }
 },
 "sample_size": 214
 }
 },
 {
 "geometry": {
 "type": "MultiPolygon",
 "coordinates": [
 [
 [
 [
 -3.77662611,
 34.86683655
 ],
 [
 -3.7705431,
 34.86468506
 ],
 [
 -3.75482011,
 34.86924362
 ]
 ]
 ]
 ]
 },
 "type": "Feature",
 "id": "md898kw3185.57",
 "properties": {
 "name": "Taza",
 "type": "Province",
 "segments": {
 "UND": {
 "I don't know yet": 16,
 "No": 28,
 "Yes": 14,
 "total": 58,
 "intention_rate": 24.14
 },
 "ABS": {
 "I don't know yet": 2,
 "No": 29,
 "Yes": 1,
 "total": 32,
 "intention_rate": 3.12
 },
 "PJD": {
 "I don't know yet": 9,
 "No": 4,
 "Yes": 23,
 "total": 36,
 "intention_rate": 63.89
 },
 "PAM": {
 "I don't know yet": 4,
 "No": 1,
 "Yes": 1,
 "total": 6,
 "intention_rate": 16.67
 },
 "OTH": {
 "I don't know yet": 3,
 "No": 3,
 "Yes": 5,
 "total": 11,
 "intention_rate": 45.45
 },
 "RNI": {
 "total": 0,
 "intention_rate": 0
 },
 "IST": {
 "I don't know yet": 2,
 "No": 2,
 "Yes": 5,
 "total": 9,
 "intention_rate": 55.56
 }
 },
 "sample_size": 152
 }
 }
 ]
}

Sadly my attempt takes too much time. Do you know how I can optimize it, or do you have a better idea in mind?

Jamal
35.2k13 gold badges134 silver badges238 bronze badges
asked May 22, 2020 at 19:42
\$\endgroup\$

1 Answer 1

2
\$\begingroup\$

find_segment loads the json file every time it is called. It would be much faster to load the data once and return a dict mapping province to segments. Also, key doesn't appear to be used, so remove that for-loop

def load_research(filename='research.geojson'):
 """loads json data from filename and return a dict of segments
 keyed by province."""
 with open(filename, encoding='utf-8-sig') as f:
 dct_research = json.load(f)
 data = {}
 for feature in dct_research['features']:
 province = feature.get("properties", {}).get("name")
 segments = feature.get("properties", {}).get("segments")
 data[province] = segments
 return data

In main() it looks like province, constituency, and segments only depend on feature and not on key. So calculate them before the for key ... loop. That should speed things up a lot.

def main():
 # load data into a dict keyed by province
 research = load_research()
 with open('maroc-swing.json') as f:
 dct_constituencies = json.load(f)
 d = []
 for feature in dct_constituencies['features']:
 province = feature.get("properties", {}).get("name_1", "")
 constituency = feature.get("properties", {}).get("name_4", "")
 # this is now a dict lookup rather than loading a json file
 segments = research[province]
 for key in feature.get("properties", {}).get("results", {}):
 d.append({"Party Affiliation": key,
 "Province": province,
 "Constituency Name": constituency,
 "segments": segments})
 column_names = ["Province", "Constituency Name", "Party Affiliation", "segments"]
 df = pd.DataFrame(d, columns=column_names)
 df.to_csv("constituencies_with_segments.csv")

NB. I haven't tested the code, so there may be some typos etc.

answered May 23, 2020 at 22:56
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.