I am trying to convert a JSON file to CSV format using Python. I am using the json.loads() function and then json_normalize() to flatten the objects. The code works fine for a few input rows.
I was wondering if there is a better way of doing this. By better I mean:
Is it efficient in terms of time and space complexity? If this code has to process around 10K records in a file, is this an optimal solution?
This is the input file's format, one JSON object per line:
{"ID": "02","Date": "2019年08月01日","Total": 400,"QTY": 12,"Item": [{"NM": "0000000001","CD": "item_CD1","SRL": "25","Disc": [{"CD": "discount_CD1","Amount": 2}],"TxLns": {"TX": [{"TXNM": "000001-001","TXCD": "TX_CD1"}]}},{"NM": "0000000002","CD": "item_CD2","SRL": "26","Disc": [{"CD": "discount_CD2","Amount": 4}],"TxLns": {"TX": [{"TXNM": "000002-001","TXCD": "TX_CD2"}]}},{"NM": "0000000003","CD": "item_CD3","SRL": "27"}],"Cust": {"CustID": 10,"Email": "[email protected]"},"Address": [{"FirstName": "firstname","LastName": "lastname","Address": "address"}]}
Code
import json
import pandas as pd
from pandas.io.json import json_normalize

data_final = pd.DataFrame()
with open("sample.json") as f:
    for line in f:
        json_obj = json.loads(line)
        ID = json_obj['ID']
        Item = json_obj['Item']
        dataMain = json_normalize(json_obj)
        dataMain = dataMain.drop(['Item', 'Address'], axis=1)
        #dataMain.to_csv("main.csv",index=False)
        dataItem = json_normalize(json_obj, 'Item', ['ID'])
        dataItem = dataItem.drop(['Disc', 'TxLns.TX'], axis=1)
        #dataItem.to_csv("Item.csv",index=False)
        dataDisc = pd.DataFrame()
        dataTx = pd.DataFrame()
        for rt in Item:
            NM = rt['NM']
            rt['ID'] = ID
            if 'Disc' in rt:
                data = json_normalize(rt, 'Disc', ['NM', 'ID'])
                dataDisc = dataDisc.append(data, sort=False)
            if 'TxLns' in rt:
                tx = rt['TxLns']
                tx['NM'] = NM
                tx['ID'] = ID
                if 'TX' in tx:
                    data = json_normalize(tx, 'TX', ['NM', 'ID'])
                    dataTx = dataTx.append(data, sort=False)
        dataDIS = pd.merge(dataItem, dataDisc, on=['NM', 'ID'], how='left')
        dataTX = pd.merge(dataDIS, dataTx, on=['NM', 'ID'], how='left')
        dataAddress = json_normalize(json_obj, 'Address', ['ID'])
        data_IT = pd.merge(dataMain, dataTX, on=['ID'])
        data_merge = pd.merge(data_IT, dataAddress, on=['ID'])
        data_final = data_final.append(data_merge, sort=False)

data_final = data_final.drop_duplicates(keep='first')
data_final.to_csv("data_merged.csv", index=False)
This is the output:
ID,Date,Total,QTY,Cust.CustID,Cust.Email,NM,CD_x,SRL,CD_y,Amount,TXNM,TXCD,FirstName,LastName,Address
02,2019-08-01,400,12,10,[email protected],0000000001,item_CD1,25,discount_CD1,2.0,000001-001,TX_CD1,firstname,lastname,address
02,2019-08-01,400,12,10,[email protected],0000000002,item_CD2,26,discount_CD2,4.0,000002-001,TX_CD2,firstname,lastname,address
02,2019-08-01,400,12,10,[email protected],0000000003,item_CD3,27,,,,,firstname,lastname,address
1 Answer
Is it efficient in terms of time and space complexity?
I don't think so. Your outer for loop should be replaced with a pd.read_json(lines=True) call. You have a lot of merge() calls, and I think they should all go away.
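For instance, reading the whole file becomes a single call (this is the same read_json call used in process() below; sample.json is your input file name):

import pandas as pd

# lines=True treats each line of the file as one JSON record,
# so the entire file parses into one frame in a single pass;
# dtype keeps the leading zero in ID.
df = pd.read_json('sample.json', lines=True, dtype={'ID': str})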
Your inner for loop should be replaced with an .explode(). In fact, the structure of the data calls for several explodes (four logical ones, across three calls), because the target format is severely denormalized.
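As a minimal illustration of the explode-then-normalize pattern (a toy frame, not your full schema):

import pandas as pd

# A list-valued column explodes into one row per element...
df = pd.DataFrame({'ID': ['02'], 'Item': [[{'NM': '0000000001'}, {'NM': '0000000002'}]]})
df = df.explode('Item', ignore_index=True)
# ...and json_normalize flattens the remaining dicts into columns.
flat = pd.concat((df.drop(columns=['Item']), pd.json_normalize(df['Item'])), axis='columns')
print(flat)
#    ID          NM
# 0  02  0000000001
# 1  02  0000000002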
Assuming that you're keeping the denormalized form, the following outputs the same frame as your original method for the same sample input. process_op is your code wrapped in a function, with the deprecated DataFrame.append replaced by pd.concat; process is the rewrite:
import io
import json
import typing

import pandas as pd
from pandas import json_normalize


def process_op(f: typing.TextIO) -> pd.DataFrame:
    data_final = pd.DataFrame()
    for line in f:
        json_obj = json.loads(line)
        ID = json_obj['ID']
        Item = json_obj['Item']
        dataMain = json_normalize(json_obj)
        dataMain = dataMain.drop(['Item', 'Address'], axis=1)
        dataItem = json_normalize(json_obj, 'Item', ['ID'])
        dataItem = dataItem.drop(['Disc', 'TxLns.TX'], axis=1)
        dataDisc = pd.DataFrame()
        dataTx = pd.DataFrame()
        for rt in Item:
            NM = rt['NM']
            rt['ID'] = ID
            if 'Disc' in rt:
                data = json_normalize(rt, 'Disc', ['NM', 'ID'])
                dataDisc = pd.concat((dataDisc, data))
            if 'TxLns' in rt:
                tx = rt['TxLns']
                tx['NM'] = NM
                tx['ID'] = ID
                if 'TX' in tx:
                    data = json_normalize(tx, 'TX', ['NM', 'ID'])
                    dataTx = pd.concat((dataTx, data))
        dataDIS = pd.merge(dataItem, dataDisc, on=['NM', 'ID'], how='left')
        dataTX = pd.merge(dataDIS, dataTx, on=['NM', 'ID'], how='left')
        dataAddress = json_normalize(json_obj, 'Address', ['ID'])
        data_IT = pd.merge(dataMain, dataTX, on=['ID'])
        data_merge = pd.merge(data_IT, dataAddress, on=['ID'])
        data_final = pd.concat((data_final, data_merge))

    data_final = data_final.drop_duplicates(keep='first')
    # read_json parses the ISO Date column to datetime; match it here
    # so that the two methods produce comparable frames.
    data_final['Date'] = pd.to_datetime(data_final['Date'])
    return data_final
def process(f: typing.TextIO) -> pd.DataFrame:
    # One read_json call replaces the outer loop; lines=True parses one
    # record per line, and dtype keeps the leading zero in ID.
    df = pd.read_json(
        path_or_buf=f, lines=True, orient='records',
        dtype={'ID': str},
    )

    # Flatten the Cust dict into the dotted column names json_normalize produces.
    cust = pd.json_normalize(df['Cust'])
    df = pd.concat(
        objs=(
            df.drop(columns=['Cust']),
            pd.DataFrame({
                'Cust.CustID': cust['CustID'],
                'Cust.Email': cust['Email'],
            }),
        ), axis='columns',
    )

    # One row per address, then flatten each address dict into columns.
    df = df.explode('Address', ignore_index=True)
    df = pd.concat(
        objs=(
            df.drop(columns=['Address']),
            pd.json_normalize(df['Address']),
        ), axis='columns',
    )

    # Likewise one row per item.
    df = df.explode('Item', ignore_index=True)
    df = pd.concat(
        objs=(
            df.drop(columns=['Item']),
            pd.json_normalize(df['Item']),
        ), axis='columns',
    )

    # Discounts and tax lines explode in lockstep; items lacking them keep NaN.
    df = df.explode(['Disc', 'TxLns.TX'], ignore_index=True)
    disc = pd.json_normalize(df['Disc'])
    tx = pd.json_normalize(df['TxLns.TX'])

    # Reassemble the columns in the order, and with the names, of the original output.
    return pd.concat(
        objs=(
            df[['ID', 'Date', 'Total', 'QTY', 'Cust.CustID', 'Cust.Email', 'NM']],
            df['CD'].rename('CD_x'),
            df['SRL'],
            disc['CD'].rename('CD_y'),
            disc['Amount'],
            tx[['TXNM', 'TXCD']],
            df[['FirstName', 'LastName', 'Address']],
        ), axis='columns',
    )
def test() -> None:
    '''
    Expanded:
    {
        "ID": "02",
        "Date": "2019-08-01",
        "Total": 400,
        "QTY": 12,
        "Item": [
            {
                "NM": "0000000001",
                "CD": "item_CD1",
                "SRL": "25",
                "Disc": [
                    {
                        "CD": "discount_CD1",
                        "Amount": 2
                    }
                ],
                "TxLns": {
                    "TX": [
                        {
                            "TXNM": "000001-001",
                            "TXCD": "TX_CD1"
                        }
                    ]
                }
            },
            {
                "NM": "0000000002",
                "CD": "item_CD2",
                "SRL": "26",
                "Disc": [
                    {
                        "CD": "discount_CD2",
                        "Amount": 4
                    }
                ],
                "TxLns": {
                    "TX": [
                        {
                            "TXNM": "000002-001",
                            "TXCD": "TX_CD2"
                        }
                    ]
                }
            },
            {
                "NM": "0000000003",
                "CD": "item_CD3",
                "SRL": "27"
            }
        ],
        "Cust": {
            "CustID": 10,
            "Email": "[email protected]"
        },
        "Address": [
            {
                "FirstName": "firstname",
                "LastName": "lastname",
                "Address": "address"
            }
        ]
    }
    '''
    with io.StringIO(
        '{"ID": "02","Date": "2019-08-01","Total": 400,"QTY": 12,"Item": [{"NM": "0000000001","CD": "item_CD1","SRL": "25","Disc": [{"CD": "discount_CD1","Amount": 2}],"TxLns": {"TX": [{"TXNM": "000001-001","TXCD": "TX_CD1"}]}},{"NM": "0000000002","CD": "item_CD2","SRL": "26","Disc": [{"CD": "discount_CD2","Amount": 4}],"TxLns": {"TX": [{"TXNM": "000002-001","TXCD": "TX_CD2"}]}},{"NM": "0000000003","CD": "item_CD3","SRL": "27"}],"Cust": {"CustID": 10,"Email": "[email protected]"},"Address": [{"FirstName": "firstname","LastName": "lastname","Address": "address"}]}'
    ) as f:
        reference = process_op(f)
        f.seek(0)
        df = process(f)

    pd.testing.assert_frame_equal(reference, df)


if __name__ == '__main__':
    test()
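To reproduce your data_merged.csv, the returned frame can be written out directly:

with open('sample.json') as f:
    process(f).to_csv('data_merged.csv', index=False)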
But I don't think that's necessarily the best idea. To move toward a normal form, you should avoid mashing everything into one frame: keep separate frames with non-redundant rows, and relate them through the index of the parent frame and an ID column in the child frame (effectively foreign keys). A sketch of that shape follows.
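A rough sketch, reusing the same explode pattern (the frame names and the choice of (ID, NM) as the item key are mine, and only the order/item/discount levels are shown):

import pandas as pd

with open('sample.json') as f:
    orders = pd.read_json(f, lines=True, orient='records', dtype={'ID': str})

# Child frame: one row per item, keeping the parent's ID as a foreign key.
items = orders[['ID', 'Item']].explode('Item', ignore_index=True)
items = pd.concat((items['ID'], pd.json_normalize(items['Item'])), axis='columns')

# Grandchild frame: one row per discount; (ID, NM) points back at its item.
discounts = items[['ID', 'NM', 'Disc']].explode('Disc')
discounts = discounts.dropna(subset=['Disc']).reset_index(drop=True)
discounts = pd.concat(
    (discounts[['ID', 'NM']], pd.json_normalize(discounts['Disc'])),
    axis='columns',
)

# Each frame keeps only its own, non-repeating fields.
items = items.drop(columns=['Disc', 'TxLns.TX'])
orders = orders.drop(columns=['Item'])

Each fact is then stored once, and the denormalized CSV, if it's ever needed, is only a couple of merges away.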