I am trying to convert a JSON file to CSV format using Python. I am using the json.loads() function and then json_normalize() to flatten the objects. The code works fine for a few input rows.
I was wondering if there is a better way of doing this. By better I mean:
Is it efficient in terms of time and space complexity? If this code has to process around 10K records in a file, is this an optimal solution?
This is the input file's format, one JSON object per line:
{"ID": "02","Date": "2019年08月01日","Total": 400,"QTY": 12,"Item": [{"NM": "0000000001","CD": "item_CD1","SRL": "25","Disc": [{"CD": "discount_CD1","Amount": 2}],"TxLns": {"TX": [{"TXNM": "000001-001","TXCD": "TX_CD1"}]}},{"NM": "0000000002","CD": "item_CD2","SRL": "26","Disc": [{"CD": "discount_CD2","Amount": 4}],"TxLns": {"TX": [{"TXNM": "000002-001","TXCD": "TX_CD2"}]}},{"NM": "0000000003","CD": "item_CD3","SRL": "27"}],"Cust": {"CustID": 10,"Email": "[email protected]"},"Address": [{"FirstName": "firstname","LastName": "lastname","Address": "address"}]}
Code
import json
import pandas as pd
from pandas.io.json import json_normalize

data_final = pd.DataFrame()
with open("sample.json") as f:
    for line in f:
        json_obj = json.loads(line)
        ID = json_obj['ID']
        Item = json_obj['Item']
        dataMain = json_normalize(json_obj)
        dataMain = dataMain.drop(['Item', 'Address'], axis=1)
        #dataMain.to_csv("main.csv",index=False)
        dataItem = json_normalize(json_obj, 'Item', ['ID'])
        dataItem = dataItem.drop(['Disc', 'TxLns.TX'], axis=1)
        #dataItem.to_csv("Item.csv",index=False)
        dataDisc = pd.DataFrame()
        dataTx = pd.DataFrame()
        for rt in Item:
            NM = rt['NM']
            rt['ID'] = ID
            if 'Disc' in rt:
                data = json_normalize(rt, 'Disc', ['NM', 'ID'])
                dataDisc = dataDisc.append(data, sort=False)
            if 'TxLns' in rt:
                tx = rt['TxLns']
                tx['NM'] = NM
                tx['ID'] = ID
                if 'TX' in tx:
                    data = json_normalize(tx, 'TX', ['NM', 'ID'])
                    dataTx = dataTx.append(data, sort=False)
        dataDIS = pd.merge(dataItem, dataDisc, on=['NM', 'ID'], how='left')
        dataTX = pd.merge(dataDIS, dataTx, on=['NM', 'ID'], how='left')
        dataAddress = json_normalize(json_obj, 'Address', ['ID'])
        data_IT = pd.merge(dataMain, dataTX, on=['ID'])
        data_merge = pd.merge(data_IT, dataAddress, on=['ID'])
        data_final = data_final.append(data_merge, sort=False)

data_final = data_final.drop_duplicates(keep='first')
data_final.to_csv("data_merged.csv", index=False)
This is the output:
ID,Date,Total,QTY,Cust.CustID,Cust.Email,NM,CD_x,SRL,CD_y,Amount,TXNM,TXCD,FirstName,LastName,Address
02,2019-08-01,400,12,10,[email protected],0000000001,item_CD1,25,discount_CD1,2.0,000001-001,TX_CD1,firstname,lastname,address
02,2019-08-01,400,12,10,[email protected],0000000002,item_CD2,26,discount_CD2,4.0,000002-001,TX_CD2,firstname,lastname,address
02,2019-08-01,400,12,10,[email protected],0000000003,item_CD3,27,,,,,firstname,lastname,address
1 Answer
Is it efficient in terms of time and space complexity?
I don't think so. Your outer for loop should be replaced with a pd.read_json(lines=True) call. You have a lot of merge() calls, and I think they should all go away.
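For instance, reading the whole file becomes a single call (this is the same read_json call used in process() below; sample.json is your input file name):

import pandas as pd

# lines=True treats each line of the file as one JSON record,
# so the entire file parses into one frame in a single pass;
# dtype keeps the leading zero in ID.
df = pd.read_json('sample.json', lines=True, dtype={'ID': str})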
Your inner for loop should be replaced with an .explode(). In fact, the structure of the data calls for several explodes (four logical ones, across three calls), because the target format is severely denormalized.
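As a minimal illustration of the explode-then-normalize pattern (a toy frame, not your full schema):

import pandas as pd

# A list-valued column explodes into one row per element...
df = pd.DataFrame({'ID': ['02'], 'Item': [[{'NM': '0000000001'}, {'NM': '0000000002'}]]})
df = df.explode('Item', ignore_index=True)
# ...and json_normalize flattens the remaining dicts into columns.
flat = pd.concat((df.drop(columns=['Item']), pd.json_normalize(df['Item'])), axis='columns')
print(flat)
#    ID          NM
# 0  02  0000000001
# 1  02  0000000002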
Assuming that you're keeping the denormalized form, the following outputs the same frame as your original method for the same sample input. process_op is your code wrapped in a function, with the deprecated DataFrame.append replaced by pd.concat; process is the rewrite:
import io
import json
import typing

import pandas as pd
from pandas import json_normalize


def process_op(f: typing.TextIO) -> pd.DataFrame:
    data_final = pd.DataFrame()
    for line in f:
        json_obj = json.loads(line)
        ID = json_obj['ID']
        Item = json_obj['Item']
        dataMain = json_normalize(json_obj)
        dataMain = dataMain.drop(['Item', 'Address'], axis=1)
        dataItem = json_normalize(json_obj, 'Item', ['ID'])
        dataItem = dataItem.drop(['Disc', 'TxLns.TX'], axis=1)
        dataDisc = pd.DataFrame()
        dataTx = pd.DataFrame()
        for rt in Item:
            NM = rt['NM']
            rt['ID'] = ID
            if 'Disc' in rt:
                data = json_normalize(rt, 'Disc', ['NM', 'ID'])
                dataDisc = pd.concat((dataDisc, data))
            if 'TxLns' in rt:
                tx = rt['TxLns']
                tx['NM'] = NM
                tx['ID'] = ID
                if 'TX' in tx:
                    data = json_normalize(tx, 'TX', ['NM', 'ID'])
                    dataTx = pd.concat((dataTx, data))
        dataDIS = pd.merge(dataItem, dataDisc, on=['NM', 'ID'], how='left')
        dataTX = pd.merge(dataDIS, dataTx, on=['NM', 'ID'], how='left')
        dataAddress = json_normalize(json_obj, 'Address', ['ID'])
        data_IT = pd.merge(dataMain, dataTX, on=['ID'])
        data_merge = pd.merge(data_IT, dataAddress, on=['ID'])
        data_final = pd.concat((data_final, data_merge))

    data_final = data_final.drop_duplicates(keep='first')
    # read_json parses the ISO Date column to datetime; match it here
    # so that the two methods produce comparable frames.
    data_final['Date'] = pd.to_datetime(data_final['Date'])
    return data_final
def process(f: typing.TextIO) -> pd.DataFrame:
    # One read_json call replaces the outer loop; lines=True parses one
    # record per line, and dtype keeps the leading zero in ID.
    df = pd.read_json(
        path_or_buf=f, lines=True, orient='records',
        dtype={'ID': str},
    )

    # Flatten the Cust dict into the dotted column names json_normalize produces.
    cust = pd.json_normalize(df['Cust'])
    df = pd.concat(
        objs=(
            df.drop(columns=['Cust']),
            pd.DataFrame({
                'Cust.CustID': cust['CustID'],
                'Cust.Email': cust['Email'],
            }),
        ), axis='columns',
    )

    # One row per address, then flatten each address dict into columns.
    df = df.explode('Address', ignore_index=True)
    df = pd.concat(
        objs=(
            df.drop(columns=['Address']),
            pd.json_normalize(df['Address']),
        ), axis='columns',
    )

    # Likewise one row per item.
    df = df.explode('Item', ignore_index=True)
    df = pd.concat(
        objs=(
            df.drop(columns=['Item']),
            pd.json_normalize(df['Item']),
        ), axis='columns',
    )

    # Discounts and tax lines explode in lockstep; items lacking them keep NaN.
    df = df.explode(['Disc', 'TxLns.TX'], ignore_index=True)
    disc = pd.json_normalize(df['Disc'])
    tx = pd.json_normalize(df['TxLns.TX'])

    # Reassemble the columns in the order, and with the names, of the original output.
    return pd.concat(
        objs=(
            df[['ID', 'Date', 'Total', 'QTY', 'Cust.CustID', 'Cust.Email', 'NM']],
            df['CD'].rename('CD_x'),
            df['SRL'],
            disc['CD'].rename('CD_y'),
            disc['Amount'],
            tx[['TXNM', 'TXCD']],
            df[['FirstName', 'LastName', 'Address']],
        ), axis='columns',
    )
def test() -> None:
    '''
    Expanded:
    {
        "ID": "02",
        "Date": "2019-08-01",
        "Total": 400,
        "QTY": 12,
        "Item": [
            {
                "NM": "0000000001",
                "CD": "item_CD1",
                "SRL": "25",
                "Disc": [
                    {
                        "CD": "discount_CD1",
                        "Amount": 2
                    }
                ],
                "TxLns": {
                    "TX": [
                        {
                            "TXNM": "000001-001",
                            "TXCD": "TX_CD1"
                        }
                    ]
                }
            },
            {
                "NM": "0000000002",
                "CD": "item_CD2",
                "SRL": "26",
                "Disc": [
                    {
                        "CD": "discount_CD2",
                        "Amount": 4
                    }
                ],
                "TxLns": {
                    "TX": [
                        {
                            "TXNM": "000002-001",
                            "TXCD": "TX_CD2"
                        }
                    ]
                }
            },
            {
                "NM": "0000000003",
                "CD": "item_CD3",
                "SRL": "27"
            }
        ],
        "Cust": {
            "CustID": 10,
            "Email": "[email protected]"
        },
        "Address": [
            {
                "FirstName": "firstname",
                "LastName": "lastname",
                "Address": "address"
            }
        ]
    }
    '''
    with io.StringIO(
        '{"ID": "02","Date": "2019-08-01","Total": 400,"QTY": 12,"Item": [{"NM": "0000000001","CD": "item_CD1","SRL": "25","Disc": [{"CD": "discount_CD1","Amount": 2}],"TxLns": {"TX": [{"TXNM": "000001-001","TXCD": "TX_CD1"}]}},{"NM": "0000000002","CD": "item_CD2","SRL": "26","Disc": [{"CD": "discount_CD2","Amount": 4}],"TxLns": {"TX": [{"TXNM": "000002-001","TXCD": "TX_CD2"}]}},{"NM": "0000000003","CD": "item_CD3","SRL": "27"}],"Cust": {"CustID": 10,"Email": "[email protected]"},"Address": [{"FirstName": "firstname","LastName": "lastname","Address": "address"}]}'
    ) as f:
        reference = process_op(f)
        f.seek(0)
        df = process(f)

    pd.testing.assert_frame_equal(reference, df)


if __name__ == '__main__':
    test()
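To reproduce your data_merged.csv, the returned frame can be written out directly:

with open('sample.json') as f:
    process(f).to_csv('data_merged.csv', index=False)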
But I don't think that's necessarily the best idea. To move toward a normal form, you should avoid mashing everything into one frame: keep separate frames with non-redundant rows, and relate them through the index of the parent frame and an ID column in the child frame (effectively foreign keys). A sketch of that shape follows.
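A rough sketch, reusing the same explode pattern (the frame names and the choice of (ID, NM) as the item key are mine, and only the order/item/discount levels are shown):

import pandas as pd

with open('sample.json') as f:
    orders = pd.read_json(f, lines=True, orient='records', dtype={'ID': str})

# Child frame: one row per item, keeping the parent's ID as a foreign key.
items = orders[['ID', 'Item']].explode('Item', ignore_index=True)
items = pd.concat((items['ID'], pd.json_normalize(items['Item'])), axis='columns')

# Grandchild frame: one row per discount; (ID, NM) points back at its item.
discounts = items[['ID', 'NM', 'Disc']].explode('Disc')
discounts = discounts.dropna(subset=['Disc']).reset_index(drop=True)
discounts = pd.concat(
    (discounts[['ID', 'NM']], pd.json_normalize(discounts['Disc'])),
    axis='columns',
)

# Each frame keeps only its own, non-repeating fields.
items = items.drop(columns=['Disc', 'TxLns.TX'])
orders = orders.drop(columns=['Item'])

Each fact is then stored once, and the denormalized CSV, if it's ever needed, is only a couple of merges away.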