I have a MultiIndex
DataFrame
# Example frame: rows are a (parameter, level) MultiIndex, columns are hourly
# timestamps; built transposed from a dict of per-(parameter, level) arrays.
df = pd.DataFrame({
    ('paramA','levelA'):np.random.randint(100, size=(5)),
    ('paramA','levelB'):np.random.randint(100, size=(5)),
    ('paramB','levelA'):np.random.randint(100, size=(5)),
    ('paramB','levelB'):np.random.randint(100, size=(5))
    },
).T
df.index.set_names(['parameter','level'], inplace=True)
# Five hourly timestamps: 2021-10-22T00 through 2021-10-22T04.
df.columns = np.arange('2021-10-22T00', '2021-10-22T05', dtype='datetime64[h]')
df.columns.set_names('validTime', inplace=True)
# Units repeat within each parameter: 'a' for paramA rows, 'b' for paramB rows.
df['units']=['a','a','b','b']
which looks something like this
validTime 2021-10-22 00:00:00 ... units
parameter level ...
paramA levelA 32 ... a
levelB 50 ... a
paramB levelA 56 ... b
levelB 28 ... b
[4 rows x 6 columns]
and a method to generate....
List[Dict[str, str|List[Dict[str, str|List[Dict[str, str|int]]]]]]
I start by splitting the units
and format the columns
# Split the data columns from the trailing 'units' column.
df_values:pd.DataFrame = df.iloc[:,:-1]
# NOTE(review): rename() returns a new frame and the result is discarded here,
# so this line has no effect. Also '%H:%m:%SZ' uses %m (month) where %M
# (minute) was presumably intended.
df_values.rename(columns=df_values.columns.to_series().dt.strftime('%Y-%m-%d %H:%m:%SZ'))
units:pd.Series = df.iloc[:,-1]
then perform groupby(x).apply(...).values.tolist()
to generate the object
# Build the nested structure: one dict per parameter, each containing one dict
# per level, each containing one dict per timestamp.
df_values.groupby('parameter').apply(
    lambda parameter: {
        # Recover the grouped key from the group's own index (first row).
        'parameter': parameter.index.get_level_values('parameter')[0],
        # units is indexed by (parameter, level); [0] takes the first
        # (repeated) unit value within the parameter group.
        'unit': units[parameter.index.get_level_values('parameter')[0]][0],
        'levels': parameter.groupby('level').apply(
            lambda level: {
                'level': level.index.get_level_values('level')[0],
                'values': level.apply(
                    lambda value: {
                        'validTime': str(value.name),
                        # NOTE(review): value.values[0] is a NumPy scalar,
                        # which the stdlib json module cannot serialise
                        # without an int() cast.
                        'value': value.values[0]
                    }).values.tolist()
            }).values.tolist()
    }).values.tolist()
2 Answers 2
Don't use np.random.randint
; it belongs to NumPy's legacy RandomState API — prefer a default_rng Generator instead.
When initialising units
- and in some other places - prefer immutable tuples rather than lists.
Problem one with your data is that units
is denormalised and repeats itself within the param
index level. This needs to be pulled away into its own series indexed only by param
.
Problem two with your data is that validTime
pretends to be columns but functionally is a misrepresented index. This can be fixed with stack
.
When you're manipulating sub-sub-dictionaries and the like, all hope of vectorisation is given up and so apply
doesn't buy you much. Also note that your lambda
s compile to one anonymous function each. Since this is already happening, you might as well replace them with one named function each that is an explicit generator with argument names and parameter and return types defined.
Your current method that relies on apply
suffers from losing the grouped index value and having to recall it again with get_level_values
. This can be avoided by simple iteration over the group object.
It's also worth mentioning that since you intend for this to be an API response, I presume that you need to JSON-serialise this and your current code is broken for that case since the in-built json
module doesn't know how to serialise Numpy integers. In your current values
indexing operation you would need an int
cast; with the method I show that will not be necessary.
Suggested
import json
from typing import Iterator
import numpy as np
import pandas as pd
from numpy.random import default_rng
rng = default_rng(seed=0)
def example_data() -> pd.DataFrame:
    """Build the question's sample frame, using the module-level seeded RNG."""
    def rand() -> np.ndarray:
        # Five random integers in [0, 100), matching the original data shape.
        return rng.integers(low=0, high=100, size=5)

    frame = pd.DataFrame({
        ('paramA', 'levelA'): rand(),
        ('paramA', 'levelB'): rand(),
        ('paramB', 'levelA'): rand(),
        ('paramB', 'levelB'): rand(),
    }).T
    frame.index.set_names(('parameter', 'level'), inplace=True)
    # Five hourly timestamps: 2021-10-22T00 through 2021-10-22T04.
    frame.columns = np.arange('2021-10-22T00', '2021-10-22T05', dtype='datetime64[h]')
    frame.columns.set_names('validTime', inplace=True)
    frame['units'] = ('a', 'a', 'b', 'b')
    return frame
def process_op(df: pd.DataFrame) -> list:
    """
    The original nested-lambda approach, kept verbatim for comparison with
    process_new(): groupby/apply at each nesting depth, recovering the group
    key via get_level_values each time.
    """
    # Split the data columns from the trailing 'units' column.
    df_values: pd.DataFrame = df.iloc[:, :-1]
    # NOTE(review): rename() returns a new frame and the result is discarded,
    # so this line has no effect; '%H:%m:%SZ' also uses %m (month) where %M
    # (minute) was presumably intended.
    df_values.rename(columns=df_values.columns.to_series().dt.strftime('%Y-%m-%d %H:%m:%SZ'))
    units: pd.Series = df.iloc[:, -1]
    return df_values.groupby('parameter').apply(
        lambda parameter: {
            # Recover the grouped key from the group's own index.
            'parameter': parameter.index.get_level_values('parameter')[0],
            # [0] takes the first (repeated) unit within the parameter group.
            'unit': units[parameter.index.get_level_values('parameter')[0]][0],
            'levels': parameter.groupby('level').apply(
                lambda level: {
                    'level': level.index.get_level_values('level')[0],
                    'values': level.apply(
                        lambda value: {
                            'validTime': str(value.name),
                            # int() so stdlib json can serialise the scalar.
                            'value': int(value.values[0]),
                        }
                    ).values.tolist()
                }
            ).values.tolist(),
        }
    ).values.tolist()
def process_new(df: pd.DataFrame) -> tuple:
    """
    Convert the wide (parameter, level) x validTime frame into a nested
    parameter -> levels -> values structure of built-in types, suitable for
    serialisation with the stdlib ``json`` module.

    The frame must have a two-level (parameter, level) row index, timestamp
    columns, and a trailing 'units' column whose value repeats within each
    parameter group.
    """
    def iter_param() -> Iterator[dict]:
        # One dict per "parameter" index value; simple iteration over the
        # group object keeps the key without get_level_values gymnastics.
        for param_value, param_group in df.groupby(level=0):
            yield {
                'parameter': param_value,
                'unit': units[param_value],
                'levels': tuple(iter_level(param_group)),
            }

    def iter_level(param_group: pd.Series) -> Iterator[dict]:
        # One dict per "level" index value within a parameter group.
        for level_value, level_group in param_group.groupby(level=1):
            yield {
                'level': level_value,
                'values': tuple(iter_time(level_group)),
            }

    def iter_time(level_group: pd.Series) -> Iterator[dict]:
        # Series.iteritems() was removed in pandas 2.0; .items() is the
        # forward-compatible spelling with identical behaviour.
        for (param, level, time), value in level_group.items():
            yield {
                'validTime': str(time),
                # Cast the NumPy scalar to a built-in int so the result is
                # JSON-serialisable without a custom encoder.
                'value': int(value),
            }

    # Group by the "parameter" index level, ignore the "level" index level,
    # and take the first (repeated) unit value of each group.
    units = df.groupby(level=0).units.first()
    # validTime is functionally an index but misrepresented as columns; fix that.
    df = df.drop(columns=['units']).stack()
    return tuple(iter_param())
def test() -> None:
    """Smoke test: build the example frame, transform it, print it as JSON."""
    frame = example_data()
    response = process_new(frame)
    print(json.dumps(response, indent=4))


if __name__ == '__main__':
    test()
Output
[
    {
        "parameter": "paramA",
        "unit": "a",
        "levels": [
            {
                "level": "levelA",
                "values": [
                    {
                        "validTime": "2021-10-22 00:00:00",
                        "value": 85
                    },
                    {
                        "validTime": "2021-10-22 01:00:00",
                        "value": 63
                    },
                    {
                        "validTime": "2021-10-22 02:00:00",
                        "value": 51
                    },
                    {
                        "validTime": "2021-10-22 03:00:00",
                        "value": 26
                    },
                    {
                        "validTime": "2021-10-22 04:00:00",
                        "value": 30
                    }
                ]
            },
            {
                "level": "levelB",
                "values": [
                    {
                        "validTime": "2021-10-22 00:00:00",
                        "value": 4
                    },
                    {
                        "validTime": "2021-10-22 01:00:00",
                        "value": 7
                    },
                    {
                        "validTime": "2021-10-22 02:00:00",
                        "value": 1
                    },
                    {
                        "validTime": "2021-10-22 03:00:00",
                        "value": 17
                    },
                    {
                        "validTime": "2021-10-22 04:00:00",
                        "value": 81
                    }
                ]
            }
        ]
    },
    {
        "parameter": "paramB",
        "unit": "b",
        "levels": [
            {
                "level": "levelA",
                "values": [
                    {
                        "validTime": "2021-10-22 00:00:00",
                        "value": 64
                    },
                    {
                        "validTime": "2021-10-22 01:00:00",
                        "value": 91
                    },
                    {
                        "validTime": "2021-10-22 02:00:00",
                        "value": 50
                    },
                    {
                        "validTime": "2021-10-22 03:00:00",
                        "value": 60
                    },
                    {
                        "validTime": "2021-10-22 04:00:00",
                        "value": 97
                    }
                ]
            },
            {
                "level": "levelB",
                "values": [
                    {
                        "validTime": "2021-10-22 00:00:00",
                        "value": 72
                    },
                    {
                        "validTime": "2021-10-22 01:00:00",
                        "value": 63
                    },
                    {
                        "validTime": "2021-10-22 02:00:00",
                        "value": 54
                    },
                    {
                        "validTime": "2021-10-22 03:00:00",
                        "value": 55
                    },
                    {
                        "validTime": "2021-10-22 04:00:00",
                        "value": 93
                    }
                ]
            }
        ]
    }
]
As already pointed out in the comments, these nested lambdas give a hard time to the reader to know what is going on and when. Consider splitting them in tiny helper functions.
You also extract the units
columns so it doesn't bother you further down your apply
s, consider reshaping and reindexing your data so you can groupby(['parameters', 'units'])
juuuuust in case there is a mismatch somewhere.
Your parameter.groupby('level')
, combined with your [0]
indexing is just a fancy apply(..., axis=1)
as your consider each level unique in their respective parameter.
You also don't need to use values.tolist()
each time, as the Series
returned by apply
allow you to call to_list()
directly for the same effect.
Proposed improvements:
import pandas as pd
import numpy as np
def build_parameter_response(parameter):
    """Build one {parameter, unit, levels} dict for a (parameter, units) group."""
    name, _level, unit = parameter.index[0]
    level_dicts = parameter.apply(build_level_response, axis=1)
    return {'parameter': name, 'unit': unit, 'levels': level_dicts.to_list()}
def build_level_response(level):
    """Build one {level, values} dict from a single row of the grouped frame."""
    # The row's name is the full index tuple (parameter, level, units).
    _param, name, _unit = level.name
    value_dicts = level.reset_index().apply(build_time_response, axis=1)
    return {'level': name, 'values': value_dicts.to_list()}
def build_time_response(times):
    """Turn one (timestamp, value) pair into a {validTime, value} dict."""
    when, val = times
    return {'validTime': str(when), 'value': val}
def build_response(dataframe):
    """Group the frame by (parameter, units) and build the nested response list."""
    # Move 'units' into the index so it travels with each group's key.
    indexed = dataframe.set_index('units', append=True)
    grouped = indexed.groupby(['parameter', 'units'])
    return grouped.apply(build_parameter_response).to_list()
if __name__ == '__main__':
    # Reproduce the question's example frame (np.random.randint is unseeded
    # here, so the values differ between runs).
    df = pd.DataFrame({
        ('paramA','levelA'):np.random.randint(100, size=(5)),
        ('paramA','levelB'):np.random.randint(100, size=(5)),
        ('paramB','levelA'):np.random.randint(100, size=(5)),
        ('paramB','levelB'):np.random.randint(100, size=(5))
        },
    ).T
    df.index.set_names(['parameter','level'], inplace=True)
    df.columns = np.arange('2021-10-22T00', '2021-10-22T05', dtype='datetime64[h]')
    df.columns.set_names('validTime', inplace=True)
    df['units']=['a','a','b','b']
    # pprint gives a readable view of the nested structure for manual checks.
    from pprint import pprint as print
    print(build_response(df))
You can also leverage the to_dict()
methods of Series
to confuse a bit less and use a list-comprehension to build the final form of your datastructure:
import pandas as pd
import numpy as np
def build_parameter_response(parameter):
    """Build the response entry for one (parameter, units) group."""
    first_key = parameter.index[0]
    param, unit = first_key[0], first_key[2]
    return {
        'parameter': param,
        'unit': unit,
        'levels': parameter.apply(build_level_response, axis=1).to_list(),
    }
def build_level_response(level):
    """
    Build one {level, values} dict from a single row of the grouped frame.

    ``level`` is a row produced by ``DataFrame.apply(..., axis=1)`` after
    ``set_index('units', append=True)``, so ``level.name`` is the full index
    tuple ``(parameter, level, units)``.
    """
    # BUG FIX: the index tuple order is (parameter, level, units), so the
    # level label is the *middle* element; `_, _, name` took the unit instead
    # (compare the first version of this function, which unpacks correctly).
    _param, name, _unit = level.name
    values = [
        {'validTime': str(time), 'value': value}
        for time, value in level.to_dict().items()
    ]
    return {'level': name, 'values': values}
def build_response(dataframe):
    """Produce the full nested response list from the wide frame."""
    # Append 'units' to the index so it is part of each group's key.
    with_units = dataframe.set_index('units', append=True)
    per_parameter = with_units.groupby(['parameter', 'units']).apply(build_parameter_response)
    return per_parameter.to_list()
if __name__ == '__main__':
    # Reproduce the question's example frame (np.random.randint is unseeded
    # here, so the values differ between runs).
    df = pd.DataFrame({
        ('paramA','levelA'):np.random.randint(100, size=(5)),
        ('paramA','levelB'):np.random.randint(100, size=(5)),
        ('paramB','levelA'):np.random.randint(100, size=(5)),
        ('paramB','levelB'):np.random.randint(100, size=(5))
        },
    ).T
    df.index.set_names(['parameter','level'], inplace=True)
    df.columns = np.arange('2021-10-22T00', '2021-10-22T05', dtype='datetime64[h]')
    df.columns.set_names('validTime', inplace=True)
    df['units']=['a','a','b','b']
    # pprint gives a readable view of the nested structure for manual checks.
    from pprint import pprint as print
    print(build_response(df))
-
\$\begingroup\$
pprint
is not strictly appropriate here, as OP indicates that they're returning this as an API response - so a json.dumps
call that uses explicit indentation will be more representative. \$\endgroup\$Reinderien– Reinderien 2022-02-01 17:16:35 +00:00Commented Feb 1, 2022 at 17:16 -
\$\begingroup\$ @Reinderien Yes and no — neither approach truly reflects the calling code from the OP. The
if __name__ == '__main__'
part is mostly for testing purposes and to ensure that the code under review and the proposed improvements both produce the same results. But to be fair, this is mostly artifacts from my own tests ;) \$\endgroup\$301_Moved_Permanently– 301_Moved_Permanently 2022-02-01 19:19:00 +00:00Commented Feb 1, 2022 at 19:19
.values.tolist()
are a red flag, combined with the fact that just looking at it I haven't got a clue what it does! \$\endgroup\$