I have a set of JSON files stored in a folder; the folder structure is as follows:
Main_folder
----F1
--------I1
------------X.json
--------I2
------------X.json
...
In all, the main folder contains 36 subfolders, each of which contains 10 subfolders, each holding a JSON file.
Each JSON file is a dictionary of lists: `X = {'a': [73 items, each a dictionary representing a 2x3 matrix], 'b': [73 items, each a dictionary]}`.
I want to read these JSON files, extract the 2x3 matrices, and store them in `.npy` files.
My current code does that, but it ends up saving the same `.npy` file multiple times across folders.
For example, `X_0.npy` should be stored once in `Main_folder/F1/I1`, but that same file is also being saved in other folders.
import os, json, codecs
import numpy as np
from pathlib import Path

def read_pose_from_json(json_file):
    # read json file and obtain 73x6 pose array
    with open(json_file) as f1:
        json1_str = f1.read()
        json1_data = json.loads(json1_str)
    data = json1_data.copy()
    l1 = data['world_coordinate']
    pose_list = []
    for i in range(len(l1)):
        X = list(l1[i].values())[0]
        X = np.array(X).ravel()
        pose_list.append(X)
    return np.array(pose_list)

path_to_json = 'blender_files/'
paths = []
paths1 = []
# For all json files, read them, obtain the pose array (73x6) and store it
# in a npy file so that it can later be accessed
for (dirpath, dirnames, filenames) in os.walk(path_to_json):
    paths += [os.path.join(dirpath, file) for file in filenames if file.startswith('bones_') and file.endswith('.json')]
    for i in range(len(paths)):
        pose_array = read_pose_from_json(paths[i])
        new_pose_file = Path(paths[i]).stem + ".npy"
        new_pose_file = os.path.join(dirpath, new_pose_file)
        with open(new_pose_file, 'wb') as f:
            np.save(f, pose_array)
What steps can I take to optimize this code and ensure that redundant copies of the same file are not created?
I have enclosed a screenshot which shows what the code is currently doing: redundant npy files being saved.
In this image, inside Model23/image7, there are multiple npy files being saved.
I was able to resolve the issue: I moved the loop `for i in range(len(paths)):` outside the loop that recursively walks all folders and sub-folders. So for now the correct code is:

use os.walk() to list all JSON files and store them in the list paths
then iterate through the paths list, obtain the matrices and save them

If anyone else can suggest a better solution, that would be appreciated.
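The two-phase structure described above can be sketched as a runnable snippet (condensed from the question's code; `convert_all` and its default `root` argument are names introduced here for illustration):

```python
import json
import os
import numpy as np
from pathlib import Path

def read_pose_from_json(json_file):
    # read a json file and obtain the Nx6 pose array (each 2x3 matrix ravelled)
    with open(json_file) as f:
        data = json.load(f)
    return np.array([np.ravel(list(d.values())[0])
                     for d in data['world_coordinate']])

def convert_all(root='blender_files/'):
    # Phase 1: walk the tree once and collect every bones_*.json path.
    paths = []
    for dirpath, dirnames, filenames in os.walk(root):
        paths += [os.path.join(dirpath, name) for name in filenames
                  if name.startswith('bones_') and name.endswith('.json')]
    # Phase 2: save each array next to its own JSON file, deriving the
    # output path from the JSON path itself rather than a stale dirpath.
    for p in paths:
        np.save(Path(p).with_suffix('.npy'), read_pose_from_json(p))
```

Because the output path is derived from each JSON file's own location, every `.npy` file lands exactly once, next to its source file.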
1 Answer
If this is true:

the main folder contains 36 subfolders which in turn contain 10 subfolders

and we assume your `F1`/`I1` naming scheme is accurate and the contradictory `Model`/`image` naming scheme is not, then my recommendation is:
- Don't make separate output files; make one six-dimensional array output to one file. The result turns out to be about 12 MB based on test data that have the same shape as yours.
- Don't `ravel`; preserve the original dimensions.
- Don't `f1.read()` and pass that to `json.loads`; just use `json.load`.
- Don't `copy()`.
- Don't cast `values()` to a `list` and then index it. If you know the key, use the key. If you (strangely) don't know the key, then use `next(iter())`.
- Don't make and dynamically grow a `pose_list`. Instead, pre-allocate a NumPy array and assign to that using slices.
- `paths1` is unused; delete it.
- Move your code out of the global namespace.
- Don't `os.walk` and don't `glob`; if you know the indices, just use them directly to construct paths.
- Use PEP 484 type hints.
Suggested
import json
from random import random

import numpy as np
from pathlib import Path


def read_pose_from_json(json_file: Path) -> np.ndarray:
    """read json file and obtain 73x2x3 pose array"""
    with json_file.open() as f:
        data = json.load(f)['world_coordinate']

    poses = np.empty((len(data), 2, 3))
    for i, entry in enumerate(data):
        poses[i, :, :] = next(iter(entry.values()))
    return poses


def main() -> None:
    n_f = 36
    n_i = 10
    n_images = 10
    all_poses = np.empty((
        n_f, n_i, n_images, 73, 2, 3,
    ))
    for f_index in range(n_f):
        for i_index in range(n_i):
            subdir = Path(f'blender_files/F{f_index+1}/I{i_index+1}')
            for image_index in range(n_images):
                path = subdir / f'bones_{image_index}.json'
                all_poses[
                    f_index, i_index, image_index, :, :, :,
                ] = read_pose_from_json(path)

    np.save('all_poses', all_poses)


def make_testing_dir() -> None:
    for f_index in range(1, 37):
        for i_index in range(1, 11):
            subdir = Path(f'blender_files/F{f_index}/I{i_index}')
            subdir.mkdir(parents=True, exist_ok=True)
            for image_index in range(10):
                data = {
                    'world_coordinate': [  # l1
                        {  # l1[i]
                            'spooky_unknown_key': [  # l1[i].values()[0]
                                [random() for x in range(3)]
                                for y in range(2)
                            ],
                            'zzz-ignored-key': None,  # ...
                        }
                        for pose_index in range(73)
                    ]
                }
                filename = subdir / f'bones_{image_index}.json'
                with filename.open('w') as f:
                    json.dump(data, f)


if __name__ == '__main__':
    # make_testing_dir()
    main()
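As a usage sketch of the single-file output (the zeros array below merely stands in for the data `main()` would save; in practice it would come from `np.load('all_poses.npy')`):

```python
import numpy as np

# Stand-in array with the same layout main() writes:
# (n_f, n_i, n_images, 73, 2, 3)
all_poses = np.zeros((36, 10, 10, 73, 2, 3))

pose = all_poses[2, 4, 7]  # F3/I5, image 7 -> one 73x2x3 pose array
print(pose.shape)          # (73, 2, 3)
print(all_poses.nbytes)    # 12614400 bytes, i.e. the ~12 MB noted above
```

Loading one file and indexing by `(F, I, image)` replaces opening 3600 separate `.npy` files.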
Thanks for the suggestions, I will use them in my future code. – Sparsh Garg, Jan 21, 2022 at 4:44