I have written a code that will parse JSON file from a cuckoo report. The issue is am not a good coder. I have got the program to give me the desired output.
The real challenge that I am facing is I have written multiple file open and close. How can I reduce it to a minimum? Because the JSON file are huge and the parsing time is approx. 10 sec. What am I doing wrong?
Here is my code:
from pprint import pprint
import pprint
import os, json
import pandas as pd
import glob
for filename in glob.glob('*.json'):
print filename
with open(filename) as json_file:
data = json.load(json_file)
to_dict = data
for item in to_dict['behavior']['summary']['file_created']:
pid1 = to_dict['behavior']['summary']['file_created']
for item in to_dict['behavior']['summary']['regkey_written']:
pid2 = to_dict['behavior']['summary']['regkey_written']
for item in to_dict['behavior']['summary']['dll_loaded']:
pid3 = to_dict['behavior']['summary']['dll_loaded']
for item in to_dict['behavior']['summary']['file_opened']:
pid4 = to_dict['behavior']['summary']['file_opened']
for item in to_dict['behavior']['summary']['regkey_opened']:
pid5 = to_dict['behavior']['summary']['regkey_opened']
for item in to_dict['behavior']['summary']['file_written']:
pid6 = to_dict['behavior']['summary']['file_written']
#
for item in to_dict['behavior']['summary']['file_exists']:
pid7 = to_dict['behavior']['summary']['file_exists']
for item in to_dict['behavior']['processes']:
pid8 = to_dict['behavior']['processes']
write_in_file = open('resultFile.txt','w')
pprint.pprint(pid1,write_in_file)
pprint.pprint(pid2,write_in_file)
pprint.pprint(pid3,write_in_file)
pprint.pprint(pid4,write_in_file)
pprint.pprint(pid5,write_in_file)
pprint.pprint(pid6,write_in_file)
pprint.pprint(pid7,write_in_file)
pprint.pprint(pid8,write_in_file)
#pprint.pprint(pid9,write_in_file)
#pprint.pprint(pid10,write_in_file)
write_in_file.close()
################################################### Code to Reduce the Noise ##########################################
with open('resultFile.txt', 'r') as file :
filedata = file.read()
## Replace the target string
filedata = filedata.replace('[', ' ')
filedata = filedata.replace('{',' ')
filedata = filedata.replace('}', ' ')
filedata = filedata.replace('\\\\', ' ')
filedata = filedata.replace('[', ' ')
filedata = filedata.replace(']', ' ')
filedata = filedata.replace(',', ' ')
write_in_file.close()
with open('resultFile.txt', 'w') as file:
file.write(filedata)
write_in_file.close()
################################################### Code to count the Frequency ######################################
with open('resultFile.txt', 'r') as file :
filedata = file.read()
text=filedata.split()
mydict={}
for word in text:
if word not in mydict.keys():
mydict[word]=1
else:
count=mydict[word]
mydict[word]=count+1
write_in_file = open('resultFile.txt','w')
pprint.pprint (mydict,write_in_file) #<-------In case you want to see which line is being counted how many times -just uncomment this line.# This is for debugging purpose
#pprint.pprint (mydict.values(),write_in_file) #<-------This is what we want.
write_in_file.close()
################################################### Code to Take care of the Numbering System ######################################
infile=open('resultFile.txt', 'r')
lines=infile.readlines()
infile.close()
outtext = ['%d %s' % (i, line) for i, line in enumerate(lines)]
outfile = open("resultFile.txt","w")
outfile.writelines(str("".join(outtext)))
outfile.close()
#
with open('resultFile.txt', 'r') as file :
filedata = file.read()
filedata = filedata.replace(',', ' ')
filedata = filedata.replace('[', ' ')
filedata = filedata.replace(']', ' ')
write_in_file.close()
#filedata = filedata.replace('', '')
with open('resultFile.txt', 'w') as file:
file.write(filedata)
write_in_file.close()
So is there any other way I could reduce any operations?
My final output that I want to look like is as follows:
0 1
1 12
2 3
3 98
4 2
. .
. .
. .
where the serial numbers are the unique numbers of entities and the corresponding number is the frequency (the number of times that entity appears)
1 Answer 1
Some notes:
import pandas as pd
is never used. So it may be deleted.
data = json.load(json_file)
to_dict = data
The data variable is used only in the subsequent statement to_dict = data
. Why not use directly
to_dict = json.load(json_file)
for item in to_dict['behavior']['summary']['file_created']:
pid1 = to_dict['behavior']['summary']['file_created']
(and subsequent): Do you really assign only the last item to pid1
(as you repeatedly rewrite pid1
in the for
loop)?
This is horrible:
for item in to_dict['behavior']['summary']['file_created']:
pid1 = to_dict['behavior']['summary']['file_created']
for item in to_dict['behavior']['summary']['regkey_written']:
pid2 = to_dict['behavior']['summary']['regkey_written']
for item in to_dict['behavior']['summary']['dll_loaded']:
pid3 = to_dict['behavior']['summary']['dll_loaded']
for item in to_dict['behavior']['summary']['file_opened']:
pid4 = to_dict['behavior']['summary']['file_opened']
for item in to_dict['behavior']['summary']['regkey_opened']:
pid5 = to_dict['behavior']['summary']['regkey_opened']
for item in to_dict['behavior']['summary']['file_written']:
pid6 = to_dict['behavior']['summary']['file_written']
#
for item in to_dict['behavior']['summary']['file_exists']:
pid7 = to_dict['behavior']['summary']['file_exists']
for item in to_dict['behavior']['processes']:
pid8 = to_dict['behavior']['processes']
DRY (Don't Repeat Yourself). Refactor it. Use e. g. a dictionary instead of 8
variables pid1
, ..., pid8
. Use for
loop instead of 8
times the same thing. As a minimal change you may write it as
pids = {}
actions = ['file_created', 'regkey_written', 'dll_loaded', (and so on)]
for action in actions:
for item in to_dict['behavior']['summary'][action]:
pids[action] = to_dict['behavior']['summary'][action]
Consequently, you may use pids
directory for looping - instead of your current
write_in_file = open('resultFile.txt','w')
pprint.pprint(pid1,write_in_file)
pprint.pprint(pid2,write_in_file)
pprint.pprint(pid3,write_in_file)
pprint.pprint(pid4,write_in_file)
pprint.pprint(pid5,write_in_file)
pprint.pprint(pid6,write_in_file)
pprint.pprint(pid7,write_in_file)
pprint.pprint(pid8,write_in_file)
#pprint.pprint(pid9,write_in_file)
#pprint.pprint(pid10,write_in_file)
write_in_file.close()
you may then write
with open('resultFile.txt','w') as write_in_file:
for action in actions:
pprint.pprint(pids[action], write_in_file)
filedata = filedata.replace(',', ' ')
filedata = filedata.replace('[', ' ')
filedata = filedata.replace(']', ' ')
may become
filedata = filedata.replace(',', ' ').replace('[', ' ').replace(']', ' ')
(as chaining string methods is recommended) or
for c in ',[]':
filedata = filedata.replace(c, ' ')
for
loop? It looks like you're taking only the last file to fillto_dict
. \$\endgroup\$