
I am trying to convert a JSON file to CSV format using Python. I parse each line with json.loads() and then use json_normalize() to flatten the objects. The code works fine for a few input rows.

I was wondering if there is a better way of doing this. By better I mean:

Is it efficient in terms of time and space complexity? If this code has to process around 10K records in a file, is this an optimal approach?

This is the input file (one record per row):

{"ID": "02","Date": "2019年08月01日","Total": 400,"QTY": 12,"Item": [{"NM": "0000000001","CD": "item_CD1","SRL": "25","Disc": [{"CD": "discount_CD1","Amount": 2}],"TxLns": {"TX": [{"TXNM": "000001-001","TXCD": "TX_CD1"}]}},{"NM": "0000000002","CD": "item_CD2","SRL": "26","Disc": [{"CD": "discount_CD2","Amount": 4}],"TxLns": {"TX": [{"TXNM": "000002-001","TXCD": "TX_CD2"}]}},{"NM": "0000000003","CD": "item_CD3","SRL": "27"}],"Cust": {"CustID": 10,"Email": "[email protected]"},"Address": [{"FirstName": "firstname","LastName": "lastname","Address": "address"}]}

Code

import json
import pandas as pd
from pandas.io.json import json_normalize

data_final = pd.DataFrame()
with open("sample.json") as f:
    for line in f:
        json_obj = json.loads(line)
        ID = json_obj['ID']
        Item = json_obj['Item']
        dataMain = json_normalize(json_obj)
        dataMain = dataMain.drop(['Item', 'Address'], axis=1)
        # dataMain.to_csv("main.csv", index=False)
        dataItem = json_normalize(json_obj, 'Item', ['ID'])
        dataItem = dataItem.drop(['Disc', 'TxLns.TX'], axis=1)
        # dataItem.to_csv("Item.csv", index=False)
        dataDisc = pd.DataFrame()
        dataTx = pd.DataFrame()
        for rt in Item:
            NM = rt['NM']
            rt['ID'] = ID
            if 'Disc' in rt:
                data = json_normalize(rt, 'Disc', ['NM', 'ID'])
                dataDisc = dataDisc.append(data, sort=False)
            if 'TxLns' in rt:
                tx = rt['TxLns']
                tx['NM'] = NM
                tx['ID'] = ID
                if 'TX' in tx:
                    data = json_normalize(tx, 'TX', ['NM', 'ID'])
                    dataTx = dataTx.append(data, sort=False)
        dataDIS = pd.merge(dataItem, dataDisc, on=['NM', 'ID'], how='left')
        dataTX = pd.merge(dataDIS, dataTx, on=['NM', 'ID'], how='left')
        dataAddress = json_normalize(json_obj, 'Address', ['ID'])
        data_IT = pd.merge(dataMain, dataTX, on=['ID'])
        data_merge = pd.merge(data_IT, dataAddress, on=['ID'])
        data_final = data_final.append(data_merge, sort=False)
data_final = data_final.drop_duplicates(keep='first')
data_final.to_csv("data_merged.csv", index=False)

This is the output:

ID,Date,Total,QTY,Cust.CustID,Cust.Email,NM,CD_x,SRL,CD_y,Amount,TXNM,TXCD,FirstName,LastName,Address
02,2019-08-01,400,12,10,[email protected],0000000001,item_CD1,25,discount_CD1,2.0,000001-001,TX_CD1,firstname,lastname,address
02,2019-08-01,400,12,10,[email protected],0000000002,item_CD2,26,discount_CD2,4.0,000002-001,TX_CD2,firstname,lastname,address
02,2019-08-01,400,12,10,[email protected],0000000003,item_CD3,27,,,,,firstname,lastname,address
  • Are you aware Python has a CSV library? (Commented Aug 12, 2019 at 8:04)
  • I am aware of the csv library, but how will it help? (Commented Aug 12, 2019 at 8:31)
  • So is this a jsonl file? That is, is every line in the file a valid JSON object? (Commented Aug 12, 2019 at 17:16)
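
For what it's worth, the standard-library route hinted at in the first comment would mean flattening each record by hand and writing dicts with csv.DictWriter. A minimal sketch covering only the top-level fields (the nested Item/Disc/TX lists would still need the same fan-out logic as the pandas version):

import csv
import json

# Sketch only: one flat row per input line, top-level and Cust fields only.
with open("sample.json") as src, open("out.csv", "w", newline="") as dst:
    writer = None
    for line in src:
        obj = json.loads(line)
        row = {
            "ID": obj["ID"],
            "Date": obj["Date"],
            "Total": obj["Total"],
            "QTY": obj["QTY"],
            "Cust.CustID": obj["Cust"]["CustID"],
            "Cust.Email": obj["Cust"]["Email"],
        }
        if writer is None:
            # Derive the header from the first flattened record.
            writer = csv.DictWriter(dst, fieldnames=row.keys())
            writer.writeheader()
        writer.writerow(row)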

1 Answer


Is it efficient in terms of time and space complexity?

I don't think so. Your outer for loop should be replaced with a single pd.read_json(lines=True) call. You also have a lot of merge() calls, and I think they should all go away.

Your inner for loop should be replaced with .explode(). In fact, the structure of the data calls for several explodes (four logical, across three calls), because the output is severely denormalized.
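
For illustration, here is a minimal sketch of those two building blocks on a toy two-line input (toy column names, not the question's schema):

import io
import pandas as pd

# Toy JSON-lines input: one object per line, each with a list column.
raw = io.StringIO(
    '{"id": 1, "items": [10, 20]}\n'
    '{"id": 2, "items": [30]}\n'
)

# pd.read_json(lines=True) replaces the manual per-line json.loads() loop.
df = pd.read_json(raw, lines=True)

# .explode() replaces the inner for loop: one output row per list element.
print(df.explode('items', ignore_index=True))
#    id items
# 0   1    10
# 1   1    20
# 2   2    30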

Assuming that you're keeping the denormalized form, the following outputs the same frame as your original method for the same sample input:

import io
import json
import typing

import pandas as pd
from pandas import json_normalize


def process_op(f: typing.TextIO) -> pd.DataFrame:
    # The original approach, with the deprecated .append() swapped for pd.concat().
    data_final = pd.DataFrame()
    for line in f:
        json_obj = json.loads(line)
        ID = json_obj['ID']
        Item = json_obj['Item']
        dataMain = json_normalize(json_obj)
        dataMain = dataMain.drop(['Item', 'Address'], axis=1)
        dataItem = json_normalize(json_obj, 'Item', ['ID'])
        dataItem = dataItem.drop(['Disc', 'TxLns.TX'], axis=1)
        dataDisc = pd.DataFrame()
        dataTx = pd.DataFrame()
        for rt in Item:
            NM = rt['NM']
            rt['ID'] = ID
            if 'Disc' in rt:
                data = json_normalize(rt, 'Disc', ['NM', 'ID'])
                dataDisc = pd.concat((dataDisc, data))
            if 'TxLns' in rt:
                tx = rt['TxLns']
                tx['NM'] = NM
                tx['ID'] = ID
                if 'TX' in tx:
                    data = json_normalize(tx, 'TX', ['NM', 'ID'])
                    dataTx = pd.concat((dataTx, data))
        dataDIS = pd.merge(dataItem, dataDisc, on=['NM', 'ID'], how='left')
        dataTX = pd.merge(dataDIS, dataTx, on=['NM', 'ID'], how='left')
        dataAddress = json_normalize(json_obj, 'Address', ['ID'])
        data_IT = pd.merge(dataMain, dataTX, on=['ID'])
        data_merge = pd.merge(data_IT, dataAddress, on=['ID'])
        data_final = pd.concat((data_final, data_merge))
    data_final = data_final.drop_duplicates(keep='first')
    data_final['Date'] = pd.to_datetime(data_final['Date'])
    return data_final


def process(f: typing.TextIO) -> pd.DataFrame:
    # One read for the whole JSON-lines file instead of a per-line loop.
    df = pd.read_json(
        path_or_buf=f, lines=True, orient='records',
        dtype={'ID': str},
    )

    # Flatten the Cust dict into Cust.CustID / Cust.Email columns.
    cust = pd.json_normalize(df['Cust'])
    df = pd.concat(
        objs=(
            df.drop(columns=['Cust']),
            pd.DataFrame({
                'Cust.CustID': cust['CustID'],
                'Cust.Email': cust['Email'],
            }),
        ), axis='columns',
    )

    # Explode each list column to one row per element, then flatten the dicts.
    df = df.explode('Address', ignore_index=True)
    df = pd.concat(
        objs=(
            df.drop(columns=['Address']),
            pd.json_normalize(df['Address']),
        ), axis='columns',
    )
    df = df.explode('Item', ignore_index=True)
    df = pd.concat(
        objs=(
            df.drop(columns=['Item']),
            pd.json_normalize(df['Item']),
        ), axis='columns',
    )
    df = df.explode(['Disc', 'TxLns.TX'], ignore_index=True)
    disc = pd.json_normalize(df['Disc'])
    tx = pd.json_normalize(df['TxLns.TX'])

    # Reassemble the columns in the same order as the original output.
    return pd.concat(
        objs=(
            df[['ID', 'Date', 'Total', 'QTY', 'Cust.CustID', 'Cust.Email', 'NM']],
            df['CD'].rename('CD_x'),
            df['SRL'],
            disc['CD'].rename('CD_y'),
            disc['Amount'],
            tx[['TXNM', 'TXCD']],
            df[['FirstName', 'LastName', 'Address']],
        ), axis='columns',
    )


def test() -> None:
    '''
    Expanded:
    {
      "ID": "02",
      "Date": "2019-08-01",
      "Total": 400,
      "QTY": 12,
      "Item": [
        {
          "NM": "0000000001",
          "CD": "item_CD1",
          "SRL": "25",
          "Disc": [
            {
              "CD": "discount_CD1",
              "Amount": 2
            }
          ],
          "TxLns": {
            "TX": [
              {
                "TXNM": "000001-001",
                "TXCD": "TX_CD1"
              }
            ]
          }
        },
        {
          "NM": "0000000002",
          "CD": "item_CD2",
          "SRL": "26",
          "Disc": [
            {
              "CD": "discount_CD2",
              "Amount": 4
            }
          ],
          "TxLns": {
            "TX": [
              {
                "TXNM": "000002-001",
                "TXCD": "TX_CD2"
              }
            ]
          }
        },
        {
          "NM": "0000000003",
          "CD": "item_CD3",
          "SRL": "27"
        }
      ],
      "Cust": {
        "CustID": 10,
        "Email": "[email protected]"
      },
      "Address": [
        {
          "FirstName": "firstname",
          "LastName": "lastname",
          "Address": "address"
        }
      ]
    }
    '''
    with io.StringIO(
        '{"ID": "02","Date": "2019-08-01","Total": 400,"QTY": 12,"Item": [{"NM": "0000000001","CD": "item_CD1","SRL": "25","Disc": [{"CD": "discount_CD1","Amount": 2}],"TxLns": {"TX": [{"TXNM": "000001-001","TXCD": "TX_CD1"}]}},{"NM": "0000000002","CD": "item_CD2","SRL": "26","Disc": [{"CD": "discount_CD2","Amount": 4}],"TxLns": {"TX": [{"TXNM": "000002-001","TXCD": "TX_CD2"}]}},{"NM": "0000000003","CD": "item_CD3","SRL": "27"}],"Cust": {"CustID": 10,"Email": "[email protected]"},"Address": [{"FirstName": "firstname","LastName": "lastname","Address": "address"}]}'
    ) as f:
        reference = process_op(f)
        f.seek(0)
        df = process(f)
        pd.testing.assert_frame_equal(reference, df)


if __name__ == '__main__':
    test()

But I don't think that's necessarily the best idea. To move toward a normal form, you should avoid mashing everything into one frame: keep separate frames with non-redundant rows, and relate them using the index of the parent frame and an ID column in the child frame (effectively foreign keys). A sketch of that shape follows.
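
A rough sketch of that normalized shape, assuming one frame per entity is acceptable downstream (process_normalized and the frame names are illustrative, not part of the code above):

import typing

import pandas as pd


def process_normalized(f: typing.TextIO) -> typing.Dict[str, pd.DataFrame]:
    df = pd.read_json(path_or_buf=f, lines=True, orient='records', dtype={'ID': str})

    # Parent frame: one row per record, scalar columns only.
    orders = pd.concat(
        objs=(
            df.drop(columns=['Cust', 'Item', 'Address']),
            pd.json_normalize(df['Cust']).add_prefix('Cust.'),
        ), axis='columns',
    )

    # Child frames: one row per nested element, related back to the parent via 'ID'.
    items = df[['ID', 'Item']].explode('Item', ignore_index=True)
    items = pd.concat(
        objs=(items['ID'], pd.json_normalize(items['Item'])),
        axis='columns',
    )
    # (Disc and TxLns.TX are still list columns here; they could be split into
    # further child frames keyed by ('ID', 'NM') in the same way.)

    addresses = df[['ID', 'Address']].explode('Address', ignore_index=True)
    addresses = pd.concat(
        objs=(addresses['ID'], pd.json_normalize(addresses['Address'])),
        axis='columns',
    )

    return {'orders': orders, 'items': items, 'addresses': addresses}

Each child frame can then be written to its own CSV; rows stay non-redundant instead of repeating every order column once per item, and consumers can join on ID when they need the combined view.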

answered Feb 1 at 21:03
