# itertuples

When traversing over the rows of a `DataFrame`, using `itertuples` is generally faster than `iterrows`. The latter constructs a new `Series` for each row, while the former only builds a `namedtuple`, which is much cheaper.
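As a minimal illustration (using the same column names as the dummy data below), both loops read the same values, but only the `iterrows` version pays for a `Series` per row:

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [["x"], ["y", "z"]]})

    # iterrows: each `row` is a freshly built Series
    for idx, row in df.iterrows():
        print(idx, row["A"], row["B"])

    # itertuples: each `row` is a namedtuple; the index is available as row.Index
    for row in df.itertuples():
        print(row.Index, row.A, row.B)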
# defaultdict

Appending items to a list on a key is typically done with a `collections.defaultdict(list)`:
    from collections import defaultdict

    result = defaultdict(list)
    for row in X.itertuples():
        idx = row.Index
        for item in row.B:
            result[item].append(idx)

The method you use to combine the values in the dictionary is meant to include empty items as `None`. Here you add them and then filter them out, so using a `defaultdict` simplifies this a lot.
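For reference, a minimal standalone snippet (not part of the original code) showing what `defaultdict(list)` does for missing keys:

    from collections import defaultdict

    d = defaultdict(list)
    d["a"].append(1)   # a missing key automatically starts as an empty list
    d["a"].append(2)
    d["b"].append(3)
    print(dict(d))     # {'a': [1, 2], 'b': [3]} -- no None placeholders to filter out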
# naming

Try to follow PEP 8 (a short example follows this list):

- `snake_case` for variables, etc.
- spaces around operators
- ...
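For instance, with hypothetical names, the first two points would turn code like this:

    maxLength=10
    dummyData=make_dummydata(100,maxLength)

into:

    max_length = 10
    dummy_data = make_dummydata(100, max_length)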
# timings

## dummy data
    import random
    import string

    import numpy as np
    import pandas as pd

    def make_dummydata(rows, max_length, seed=0):
        letters = string.ascii_letters
        np.random.seed(seed)
        random.seed(seed)
        col1 = np.random.randint(0, 10, size=rows)
        items_per_row = np.random.randint(0, max_length, size=rows) + 1
        col2 = [random.choices(letters, k=amount) for amount in items_per_row]
        return pd.DataFrame({"A": col1, "B": col2})
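Calling it gives a frame with a numeric column `"A"` and a column `"B"` of lists of letters (the exact values depend on the fixed seeds; only the shape is shown here):

    df = make_dummydata(rows=5, max_length=3)
    print(df.shape)             # (5, 2)
    print(df.columns.tolist())  # ['A', 'B']
    # "A": random ints in [0, 10); "B": a list of 1..max_length random letters per row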
## benchmark method

    import timeit

    def benchmark(cases, functions):
        for rows, max_length in cases:
            df = make_dummydata(rows, max_length)
            for name, function in functions.items():
                result = timeit.timeit(
                    stmt="function(df)",
                    globals={"df": df, "function": function},
                    number=1,
                )
                yield rows, max_length, name, result
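The benchmark assumes every approach is wrapped in a function that takes the `DataFrame` and returns the mapping. As a sketch, the `defaultdict`-based one (the name `find_maarten` is taken from the `functions` dict below, the body from the snippet above) could look like:

    from collections import defaultdict

    def find_maarten(df):
        result = defaultdict(list)
        for row in df.itertuples():
            for item in row.B:
                result[item].append(row.Index)
        return result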
## results

    cases = [(10, 2), (100, 10), (1000, 40), (10000, 200)]
    functions = {
        "OP": find_op,
        "maarten": find_maarten,
        "jezrael": find_jezrael,
    }
    list(benchmark(cases, functions))
    [(10, 2, 'OP', 0.001344002000003286),
     (10, 2, 'maarten', 0.0003913850000003549),
     (10, 2, 'jezrael', 0.005293956000002709),
     (100, 10, 'OP', 0.027166392000005146),
     (100, 10, 'maarten', 0.0004795910000012782),
     (100, 10, 'jezrael', 0.013824836999994261),
     (1000, 40, 'OP', 0.3434149869999956),
     (1000, 40, 'maarten', 0.0032574399999987236),
     (1000, 40, 'jezrael', 0.018533767000000978),
     (10000, 200, 'OP', 33.48681208600001),
     (10000, 200, 'maarten', 0.10972772499999905),
     (10000, 200, 'jezrael', 0.7631061700000004)]
The method using `defaultdict` is a lot faster. The gap with @jezrael's method is decreasing, so perhaps for your amount of data that changes; a follow-up run on a larger case (350_000 rows, up to 1000 items per row) took about 22 seconds with the `defaultdict` approach versus about 516 seconds with @jezrael's. This also only says something about the run-time, not the memory usage, where my method does not create a large intermediary `DataFrame`.
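If memory is the actual bottleneck, a rough way to compare it (a sketch, not part of the benchmark above; `tracemalloc` only sees allocations made through Python's allocator) is to record the peak traced memory while each function runs:

    import tracemalloc

    def peak_memory_mib(function, df):
        # peak memory (in MiB) traced while running function(df)
        tracemalloc.start()
        function(df)
        _, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        return peak / 2**20

    df = make_dummydata(10000, 200)
    print({name: peak_memory_mib(function, df) for name, function in functions.items()})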