See Serializing (nested) data structures in a human-readable format for more details.
This is the latest version, I have fixed all bugs and now this piece of code works completely as intended.
And I have added many more test cases. So far all tests are successful. I think I can't find an edge case that will cause the code to return wrong results now.
My code is now greatly improved than the original, but I am not fully satisfied with the current code yet. As this is code review and I have already received answers I can't update the code posted in the original question so I have to post a new question.
Latest Version
from typing import Union
def represent(obj: Union[dict, frozenset, list, set, tuple], indent: int=4) -> str:
singles = (frozenset, list, set, tuple)
supported = (dict, *singles)
if not isinstance(obj, supported):
raise TypeError('argument `obj` should be an instance of a built-in container data type')
if not isinstance(indent, int):
raise TypeError('argument `indent` should be an `int`')
if indent <= 0:
raise ValueError('argument `indent` should be greater than 0')
if indent % 2:
raise ValueError('argument `indent` should be a multiple of 2')
increment = indent
def worker(obj, indent, add_level=True, not_value=True):
ls = list()
indent_level = ' '*(indent-increment)
open_indent = indent_level * add_level
if not isinstance(obj, supported):
return indent_level * not_value + repr(obj)
enclosures = {
dict: ('{', '}'),
frozenset: ('frozenset({', '})'),
list: ('[', ']'),
set: ('{', '}'),
tuple: ('(', ')')
}
singletons = {
dict: 'dict()',
frozenset: 'frozenset({})',
list: '[]',
set: 'set()',
tuple: '()'
}
for cls in supported:
if isinstance(obj, cls):
if not obj:
return open_indent + singletons[cls]
start, end = enclosures[cls]
obj_cls = cls
break
start = open_indent + start
if obj_cls in singles:
single_tuple = (obj_cls == tuple and len(obj) == 1)
for i in obj:
item = worker(i, indent+increment)
if single_tuple:
item += ','
ls.append(item)
else:
for k, v in obj.items():
item = f"{' '*indent}{repr(k)}: {worker(v, indent+increment, False, False)}"
ls.append(item)
return "{0}\n{1}\n{2}{3}".format(start, ',\n'.join(ls), indent_level, end)
return worker(obj, indent)
Test cases
var = {1: {1: {1: {1: {1: {1: 0}, 2: 0}, 2: {1: 0}, 3: 0},
2: {1: {1: 0}, 2: 0},
3: {1: 0}},
2: {1: {1: {1: 0}, 2: 0}, 2: {1: 0}, 3: 0},
3: {1: {1: 0}, 2: 0}},
2: {1: {1: {1: {1: 0}, 2: 0}, 2: {1: 0}, 3: 0},
2: {1: {1: 0}, 2: 0},
3: {1: 0}},
3: {1: {1: {1: 0}, 2: 0}, 2: {1: 0}, 3: 0}}
repred = represent(var)
print(repred)
print(f'{(eval(repred) == var)=}')
var1 = {'a': {'a': {'a': [0, 0, 0], 'b': [0, 0, 1], 'c': [0, 0, 2]},
'b': {'a': [0, 1, 0], 'b': [0, 1, 1], 'c': [0, 1, 2]},
'c': {'a': [0, 2, 0], 'b': [0, 2, 1], 'c': [0, 2, 2]}},
'b': {'a': {'a': [1, 0, 0], 'b': [1, 0, 1], 'c': [1, 0, 2]},
'b': {'a': [1, 1, 0], 'b': [1, 1, 1], 'c': [1, 1, 2]},
'c': {'a': [1, 2, 0], 'b': [1, 2, 1], 'c': [1, 2, 2]}},
'c': {'a': {'a': [2, 0, 0], 'b': [2, 0, 1], 'c': [2, 0, 2]},
'b': {'a': [2, 1, 0], 'b': [2, 1, 1], 'c': [2, 1, 2]},
'c': {'a': [2, 2, 0], 'b': [2, 2, 1], 'c': [2, 2, 2]}}}
print(represent(var1))
var2 = [[0, 1, 2], [3, 4, 5], 6, [7, 8, 9]]
print(represent(var2))
print(represent(set()))
print(represent(dict()))
print(represent((1,)))
print(represent(([1,2,3,[4,5,6,[7,8,9]]],)))
print(represent([(1,),(2,),(3,),[(4,),(5,),(6,),[(7,),(8,),(9,)]]]))
print(represent([{(1,),(2,),(3,)},{(4,),(5,),(6,)},{(7,),(8,),(9,)}]))
print(represent({0: {0: set()}}))
Correct output
{
1: {
1: {
1: {
1: {
1: {
1: 0
},
2: 0
},
2: {
1: 0
},
3: 0
},
2: {
1: {
1: 0
},
2: 0
},
3: {
1: 0
}
},
2: {
1: {
1: {
1: 0
},
2: 0
},
2: {
1: 0
},
3: 0
},
3: {
1: {
1: 0
},
2: 0
}
},
2: {
1: {
1: {
1: {
1: 0
},
2: 0
},
2: {
1: 0
},
3: 0
},
2: {
1: {
1: 0
},
2: 0
},
3: {
1: 0
}
},
3: {
1: {
1: {
1: 0
},
2: 0
},
2: {
1: 0
},
3: 0
}
}
(eval(repred) == var)=True
{
'a': {
'a': {
'a': [
0,
0,
0
],
'b': [
0,
0,
1
],
'c': [
0,
0,
2
]
},
'b': {
'a': [
0,
1,
0
],
'b': [
0,
1,
1
],
'c': [
0,
1,
2
]
},
'c': {
'a': [
0,
2,
0
],
'b': [
0,
2,
1
],
'c': [
0,
2,
2
]
}
},
'b': {
'a': {
'a': [
1,
0,
0
],
'b': [
1,
0,
1
],
'c': [
1,
0,
2
]
},
'b': {
'a': [
1,
1,
0
],
'b': [
1,
1,
1
],
'c': [
1,
1,
2
]
},
'c': {
'a': [
1,
2,
0
],
'b': [
1,
2,
1
],
'c': [
1,
2,
2
]
}
},
'c': {
'a': {
'a': [
2,
0,
0
],
'b': [
2,
0,
1
],
'c': [
2,
0,
2
]
},
'b': {
'a': [
2,
1,
0
],
'b': [
2,
1,
1
],
'c': [
2,
1,
2
]
},
'c': {
'a': [
2,
2,
0
],
'b': [
2,
2,
1
],
'c': [
2,
2,
2
]
}
}
}
[
[
0,
1,
2
],
[
3,
4,
5
],
6,
[
7,
8,
9
]
]
set()
dict()
(
1,
)
(
[
1,
2,
3,
[
4,
5,
6,
[
7,
8,
9
]
]
],
)
[
(
1,
),
(
2,
),
(
3,
),
[
(
4,
),
(
5,
),
(
6,
),
[
(
7,
),
(
8,
),
(
9,
)
]
]
]
[
{
(
1,
),
(
2,
),
(
3,
)
},
{
(
6,
),
(
4,
),
(
5,
)
},
{
(
7,
),
(
8,
),
(
9,
)
}
]
{
0: {
0: set()
}
}
Please help me make it better!
Edit
Well, I think I have made all the improvements I can make to the script now, I think I have done all possible improvements I can to the script now, so I will not edit this question again. What other improvements can be possibly made?
2 Answers 2
White space
53 lines without a blank line is excessive. Add vertical white-space to break the function into logical chunks. Validation, initialization, worker
. Same thing inside the worker
.
Type-hints
Depending on the version of Python you're targeting, you can remove Unipn[...]
, and just use the "or" operator:
def represent(obj: dict | frozenset | list | set | tuple, indent: int=4) -> str:
The worker
could use type-hints too:
def worker(obj, indent: int, add_level: bool = True, not_value: bool = True) -> str:
Multiplication by boolean?
indent_level * add_level
and indent_level * not_value
? Those are booleans, not integers. Yes, they can be interpreted as 0 and 1, but yuck!
add_level and not_value?
What is not_value
? That name is quite obfuscating. Moreover, you only ever call worker with the two parameters both defaulted to True
, or both explicitly set to False
. Why two parameters then?
local(local(local(local(...))))
def represent(obj: Union[dict, frozenset, list, set, tuple], indent: int=4) -> str:
...
supported = (dict, *singles)
...
def worker(obj, indent, add_level=True, not_value=True):
...
if not isinstance(obj, supported):
...
enclosures = { ... }
singletons = { ... }
You define supported
in represented
scope, and use it inside worker
scope. All is good.
But you redefine enclosures
and singletons
each and every time you recurse into a deeper level of worker
. This creates and recreates these variables on the stack at each level. They are constants, and could easily be defined at worker
scope, saving both execution time and memory.
As @riskypenguin said, you could move these out of represent()
to a module-level constant, so they are created once instead of each time represent()
is called.
f-strings
f"...{repr(k)}..."
could be written with the !r
format code: f"...{k!r}..."
Indenting may be accomplished using field width specifiers. As in f"{'':{indent}}..."
instead of creating and recreating strings of spaces all the time.
String concatenation
The code is constantly creating new strings and combining them using string concatenation (+
) or .join()
calls. This probably has exponential time and space complexity as the nesting level increases.
Instead of concatenating strings, you should accumulate the output into one StringIO
buffer, which is designed for constantly appending strings into a increasing blob of text.
print_stdout
If the programmer overhead of using string_buf.write(...)
or print(..., file=string_buf)
is too high, you could just print()
each string line to stdout. Call that version of the function print_represent(...)
.
Then you could write a wrapper function which returns the result as string using contextlib.redirect_stdout
:
def represent(obj: Union[dict, frozenset, list, set, tuple], indent: int=4) -> str:
with contextlib.redirect_stdout(io.StringIO()) as string_buf:
print_represent(obj, indent)
return string_buf.getvalue()
This function is now considerably longer and harder to understand than before. I find nested functions (especially of this complexity) really harm readability, since you need to keep track of even more layers of context to understand the full functionality. I strongly suggest splitting this function into top-level functions to increase readability and seperate logic.
singles
, supported
, enclosures
, singletons
(not an accurate name by the way, singleton pattern on wikipedia) are basically collections of different attributes that could and should be contained within a single object. I suggest using a collections.namedtuple
, but there are other approaches as well.
from typing import Union
from collections import namedtuple
SupportedType = namedtuple("SupportedType", ["base_class", "enclosures", "empty_repr"])
def represent(obj: Union[dict, frozenset, list, set, tuple], indent: int = 4) -> str:
supported_types = (
SupportedType(base_class=dict, enclosures=('{', '}'), empty_repr='dict()'),
SupportedType(base_class=list, enclosures=('[', ']'), empty_repr='[]'),
SupportedType(base_class=set, enclosures=('{', '}'), empty_repr='set()'),
SupportedType(base_class=tuple, enclosures=('{', '}'), empty_repr='()'),
SupportedType(base_class=frozenset, enclosures=('frozenset({', '})'), empty_repr='frozenset()'),
)
for supported_type in supported_types:
if isinstance(obj, supported_type.base_class):
supported_base_class = supported_type.base_class
start, end = supported_type.enclosures
empty_repr = supported_type.empty_repr
break
else:
raise TypeError('argument `obj` should be an instance of a built-in container data type')
...
This way you have all the relevant information regarding your supported types in proper context and can get rid of the multiple isinstance
checks. You could extend this to also make each instance of SupportedType contain a reference to the correct represent_object_of_type
(or similiar) function. This lets you get rid of un-elegant (hardcoded) checks like isinstance(obj, dict)
.
supported_types
should also move to a module-level constant (if that makes sense for your project).
Explore related questions
See similar questions with these tags.