Description
I had to create a dictionary structure where one value can be referenced by several keys.
For example the keys a
, b
, c
reference the value 1 and d
, e
reference the value 2. If two groups of keys intersect then I want to merge them, for example the group a
, b
and b
, c
should result to a
, b
, c
.
I also need an efficient solution:
- linear time for creation (init) - \$O(n)\$.
- constant time for access (update and get) - \$O(1)\$
- no need for inserting and delete since I have the final list of keys at the init
I'm come from Java world and I'm not totally confortable with the Python philosophy so I think some review could be useful.
Main code
from collections.abc import Mapping
from typing import Iterator, Set, FrozenSet, AbstractSet, ValuesView, Tuple
class MultiKeysDict(Mapping):
"""
Dictionary to manage multiple key to one value.
The keys groupes should be set at the initialization and can't change later.
If two keys groupes share a key so they will reference the same value.
"""
class Value:
"""
Store the value data
"""
def __init__(self, default_value):
self.data = default_value
def __init__(self, grp_keys: Set[FrozenSet], default_value = None):
"""
Create a dictionary based keys groups.
Every group will share the same value. Intersected group will share the same value.
:param grp_keys: The groups of keys.
:param default_value: The default value.
"""
self._storage = dict()
self._values = set()
# Build all keys
reverse_dict = dict() # For groupe merging
for keys in grp_keys:
refs = set()
for key in keys:
if key in self._storage:
refs.add(self._storage[key])
# New group of keys
if not refs:
new_value = MultiKeysDict.Value(default_value)
self._storage.update(dict.fromkeys(keys, new_value))
reverse_dict[new_value] = set(keys)
self._values.add(new_value)
# Extend an existing group of keys
elif len(refs) == 1:
old_value = refs.pop()
self._storage.update(dict.fromkeys(keys, old_value))
reverse_dict[old_value].update(keys)
# Merge several group of keys (troubles start here)
else:
old_value = refs.pop()
keys_to_merge = [key for ref in refs for key in reverse_dict[ref]]
self._storage.update(dict.fromkeys(keys_to_merge, old_value))
reverse_dict[old_value].update(keys_to_merge)
# Remove merged references
for ref in refs:
del reverse_dict[ref]
self._values.discard(ref)
def __getitem__(self, k):
return self._storage[k].data
def __setitem__(self, key, value):
self._storage[key].data = value
def __len__(self) -> int:
return len(self._values)
def __iter__(self) -> Iterator:
return (value.data for value in self._values)
def keys(self) -> AbstractSet:
return self._storage.keys()
def values(self) -> ValuesView:
return [value.data for value in self._values] # Should be ValuesView type?
def items(self) -> AbstractSet[Tuple]:
return {(key, self.__getitem__(key)) for key in self.keys()}
Some tests
import unittest
from tools.multikey_dict import MultiKeysDict
class TestMultiKeyDict(unittest.TestCase):
def test_basic_init(self):
# Expected: [a, b] [c, d] [e, f, g]
d = MultiKeysDict({
frozenset({'a', 'b'}),
frozenset({'c', 'd'}),
frozenset({'e', 'f', 'g'})
})
d['a'] = 42
d['c'] = 'lolilol'
d['f'] = [1, 2, 3]
# [a, b]
self.assertEqual(d['a'], d['b'])
self.assertEqual(d['b'], 42)
# [c, d]
self.assertEqual(d['c'], d['d'])
self.assertEqual(d['d'], 'lolilol')
# [e, f, g]
self.assertIs(d['e'], d['f'])
self.assertIs(d['e'], d['g'])
self.assertEqual(d['f'], [1, 2, 3])
def test_intersection_init(self):
# Expected: [a, b, c] [d, e]
d = MultiKeysDict({
frozenset({'a', 'b'}),
frozenset({'b', 'c'}),
frozenset({'d', 'e'})
})
d['a'] = 1
d['d'] = 2
# [a, b, c]
self.assertEqual(d['b'], 1)
self.assertEqual(d['c'], 1)
self.assertIs(d['a'], d['c'])
# [d, e]
self.assertIs(d['d'], d['e'])
def test_merge_init(self):
# Expected: [a, b, c, d] [e, f]
d = MultiKeysDict({
frozenset({'a', 'b'}),
frozenset({'c', 'd'}),
frozenset({'b', 'c'}),
frozenset({'e', 'f'})
})
d['a'] = 1
# [a, b, c, d]
self.assertEqual(d['a'], 1)
self.assertEqual(d['b'], 1)
self.assertEqual(d['c'], 1)
self.assertEqual(d['d'], 1)
The code seems to work as expected but I'm not sure about my implementation, especially the values()
and items()
part.
2 Answers 2
collections.abc.Mapping
is meant to be immutable. You wantMutableMapping
.- The result from
list(d)
is unpythonic, it's standard to return the same asMapping.keys
. - You default all values to
None
, this smells really fishy to me. This means on an empty dictionary it says it's full, it also meansd[key]
magically returnsNone
. Andkey in d
is alwaysTrue
. - Personally I'd create two dictionaries, the first would translate from known keys to the
frozenset
. The second would be the the actual dictionary with the keys as thefrozenset
. It's a bit strange to me that you'd pass poorly constructed sets to
MultiKeysDict
, but it's possible to have it merge the keys provided. However this runs in \$O(n^2)\$ time. I provided this as aclassmethod
.If you prefer it to run on all creations then you can just change the call slightly and call it from
__init__
.
import collections
class MultiKeysDict(collections.abc.MutableMapping):
def __init__(self, translations):
self._data = {}
self._translations = {
k: set_
for set_ in translations
for k in set_
}
@classmethod
def from_overlapping(cls, sets):
handled = set()
for set_ in sets:
to_merge = {s for s in handled if s & set_}
for s in to_merge:
handled.remove(s)
set_ |= s
handled.add(set_)
return cls(handled)
def _translate(self, key):
if key not in self._data:
key = self._translations[key]
return key
def __getitem__(self, key):
return self._data[self._translate(key)]
def __setitem__(self, key, value):
self._data[self._translate(key)] = value
def __delitem__(self, key):
del self._data[self._translate(key)]
def __iter__(self):
return iter(self._data)
def __len__(self):
return len(self._data)
-
\$\begingroup\$ I will take a look about
MutableMapping
andlist(d)
. About theNone
issue I'm not sure to get it. Thein
condition will returnTrue
if the key is set andFalse
if not, which seems to be the standard behaviours fordict
, no ? \$\endgroup\$Opsse– Opsse2019年06月06日 08:01:17 +00:00Commented Jun 6, 2019 at 8:01 -
\$\begingroup\$ About the implementation you suggest, I think this doesn't fit the keys intersection case that I explained in the description (or I misunderstood something). This raise a
KeyError
in my second unit test forself.assertEqual(d['c'], 1)
. \$\endgroup\$Opsse– Opsse2019年06月06日 08:39:47 +00:00Commented Jun 6, 2019 at 8:39 -
\$\begingroup\$ @Opsse
d = {}; 'a' in d is False; d['a']
results in an index error. Yours results in'a' in d is True; d['a'] is None
. This is non-standard and frankly bizarre behavior. As for the second aspect yes, I'll fix that in a bit. \$\endgroup\$2019年06月06日 10:33:25 +00:00Commented Jun 6, 2019 at 10:33 -
\$\begingroup\$ I agree that it makes more sense to move the key merging logic out of the init, I will do the same. I understand your point about the
None
default, but in my use case I need to set a default value that I will update later. So maybe I should make the default value mandatory at the init. \$\endgroup\$Opsse– Opsse2019年06月06日 13:03:01 +00:00Commented Jun 6, 2019 at 13:03 -
\$\begingroup\$ @Opsse You are free to ignore my advice. I've said what is standard and normal usage, if you want to divert from that, then that's up to you. \$\endgroup\$2019年06月06日 13:10:32 +00:00Commented Jun 6, 2019 at 13:10
At first glance, your code looks quite good. I would like to share some of my thoughts with you nevertheless.
- Some variable names could be improved to make the code more readable. For instance
grp_keys
, seems to hint towards keys to groups. To me it would be more intuitive if the name was something likekey_groups
, which does sound more what you actually want from the user. One can likely argue about this. The nameself._storage
is also quite generic. - There should be no whitespace around the
=
when used for keyword arguments, i.e.default_value=None
instead ofdefault_value = None
(relevant PEP8 section) - Some of the comments could use a second look. E.g.
groupes
should likely begroups
and there are a few sentences that don't make much sense, e.g.Every group will share the same value. Intersected group will share the same value.
should likely beEvery key of a group will share the same value. Intersecting groups will also share the same value.
- Using
Set[FrozenSet]
to initialize your class might be overly restrictive. Your code should work fine with other sequence types and likely even iterables.
It took me quite some time to understand what's going on in __init__
. While thinking about an alternative solution I arrived at something similar to @Peilonrayz' answer. (削除) so I won't duplicate that. Using this approach invalidates the last point mentioned above and you should stick with your current approach. (削除ここまで) My approach can be found below. I'm not sure if it meets your complexity requirements, but it did pass your tests.
A word of warning: As @Peilonrayz rightfully pointed out in a comment, the presented implementation will fail for cases like MultiKeysDict([frozenset('ab'), frozenset('cd'), frozenset('bc')])
. The changes need to fix that would lead to what he presented in his answer.
class MultiKeysDict(Mapping):
"""
Dictionary to manage multiple key to one value.
The keys groups has to be set at initialization and can't change later.
If two keys groups share a key they will reference the same value.
"""
class Value:
"""
Store the value data
"""
def __init__(self, default_value):
self.data = default_value
def __repr__(self):
return f"Value({self.data!r})"
def __init__(self, key_groups: Set[FrozenSet], default_value=None):
"""Create a dictionary based on key groups.
Every key in a group will share the same value.
Intersecting groups will also share the same value.
:param key_groups: The groups of keys.
:param default_value: The default value.
"""
self._proxy = dict()
self._data = dict()
current_group_id = 0
for keys in key_groups:
known_keys = keys.intersection(self._proxy.keys())
if known_keys: # merge
key = next(iter(known_keys))
self._proxy.update(dict.fromkeys(keys, self._proxy[key]))
else:
self._proxy.update(dict.fromkeys(keys, current_group_id))
self._data[current_group_id] = MultiKeysDict.Value(default_value)
current_group_id += 1
def __getitem__(self, key):
return self._data[self._proxy[key]].data
def __setitem__(self, key, value):
self._data[self._proxy[key]].data = value
def __len__(self):
return len(self._data)
def __iter__(self):
return iter(self._data)
-
\$\begingroup\$ I agree with most of your points, I will fix it. I can only argue about
Set[FrozenSet]
, I don't think I can make it less restrictive sinceSet[Set]
is not possible (onlyFrozenSet
is hashable). And other collection type would be too generic because I don't want duplicated value. \$\endgroup\$Opsse– Opsse2019年06月06日 08:57:00 +00:00Commented Jun 6, 2019 at 8:57 -
\$\begingroup\$ About Peilonrayz answer I think it doesn't work. I explained why in comment. \$\endgroup\$Opsse– Opsse2019年06月06日 08:58:08 +00:00Commented Jun 6, 2019 at 8:58
-
\$\begingroup\$ Maybe I will have another look at my approach once I get back home and add it to the answer. \$\endgroup\$AlexV– AlexV2019年06月06日 09:31:53 +00:00Commented Jun 6, 2019 at 9:31
-
\$\begingroup\$ @Opsse: I added my approach to the answer. \$\endgroup\$AlexV– AlexV2019年06月06日 19:12:52 +00:00Commented Jun 6, 2019 at 19:12
-
\$\begingroup\$ FYI
MultiKeysDict([frozenset('ab'), frozenset('cd'), frozenset('bc')])._proxy == {'a': 0, 'b': 0, 'c': 0, 'd': 1}
. d should also be 0. (Note the input is a list to force this output) \$\endgroup\$2019年06月06日 20:03:40 +00:00Commented Jun 6, 2019 at 20:03
__init__
and it looks like it runs in \$O(n^2)\$ time. This is asrefs
worst case can begrp_keys
, meaning that you have both an outer and inner loop looping overgrp_keys
. \$\endgroup\$