I use the following code to identify values of interest in a dataframe and then plot a time window before and after each of those values appears. It works very well, but I would like to know if there is a more concise / more Pythonic way to accomplish this. Thanks in advance!
Before I go: I tried to use only Seaborn in the plotting section, but creating the subplots and filling them was easier, considering I don't want to share axes.
# Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
# Generate Data
# Daily series of random integers; the few negative draws are the
# "interesting" values the rest of the script builds windows around.
rng = np.random.default_rng(12345)
df = pd.DataFrame(
    index=pd.date_range(start="1/1/2019", periods=1035, freq='D'),
    data={'value': rng.integers(-100, 30000, 1035)}
).reset_index()

# Creating a boolean for interesting values
df['select'] = df['value'] < 0

# Finding days with interesting values and creating the periods.
# Each period holds 14 calendar days *including* the interesting day, so a
# day ends up flagged when it is at most 13 days away from such a value.
lt_dates = df.loc[df['select'], 'index'].to_list()
lt_days_after = [
    pd.DataFrame(
        index=pd.date_range(start=day, periods=14, freq='D')
    ).reset_index()
    for day in lt_dates
]
lt_days_before = [
    pd.DataFrame(
        index=pd.date_range(end=day, periods=14, freq='D')
    ).reset_index()
    for day in lt_dates
]

# Concatenating the periods
windows = lt_days_after + lt_days_before
if windows:
    df_mask = (
        pd.concat(objs=windows)
        .sort_values('index')
        .drop_duplicates(ignore_index=True)
    )
else:
    # pd.concat refuses an empty list; with no interesting value there is
    # simply nothing to flag, so use an empty mask instead of crashing.
    df_mask = pd.DataFrame({'index': pd.DatetimeIndex([])})

# Flagging days
df['grouped'] = df['index'].isin(df_mask['index'])

# Creating the groups: the running count of non-flagged days is constant
# inside a contiguous run of flagged days, so it labels each run uniquely.
df['slice'] = (~df['grouped']).cumsum()
groups = df.loc[df['grouped'], 'slice'].unique()
groups_dict = {y: x for x, y in enumerate(groups)}

# Filtering non interesting values.
# Take an explicit copy and assign the renumbered column back: an in-place
# replace on a column pulled out of a filtered frame is a chained update
# that may never reach the frame (and does not under Copy-on-Write).
df = df.loc[df['grouped']].copy()
df['slice'] = df['slice'].map(groups_dict)
# Plotting
# One subplot row per group of flagged days.
rows = len(groups_dict)


# Function to fill the subplots
def subplot_df(_ax, x):
    """Draw the 'value' series of one group on the given axes.

    Reads the module-level ``df``, keeps the rows whose 'slice' equals
    ``x`` and plots 'value' against 'index' as a line, then blanks the
    x-axis label.

    Parameters
    ----------
    _ax : matplotlib Axes the line is drawn on.
    x : group number to select from the 'slice' column.
    """
    # NOTE(review): seaborn 0.12+ deprecates `ci` in favour of
    # `errorbar=None`; switch once older seaborn no longer needs support.
    sns.lineplot(
        x="index",
        y='value',
        ci=None,
        data=df[df['slice'] == x],
        ax=_ax
    )
    _ax.set_xlabel('')


# Filling a single or multiple subplots.
# plt.subplots rejects nrows=0, so only build a figure when there is
# something to draw.
if rows:
    f, ax = plt.subplots(
        nrows=rows,
        figsize=(12, 2*rows),
        sharex=False,
        sharey=False
    )
    # With a single row plt.subplots hands back a bare Axes rather than an
    # array; np.atleast_1d lets one loop serve both shapes.
    for x, group_ax in enumerate(np.atleast_1d(ax)):
        subplot_df(group_ax, x)
1 Answer 1
Avoid passing strings for parameters like `start` when they come from application constants and not the user; use `date` instances instead.
You rely on `reset_index` too much - this can go away and be replaced with direct construction of the dataframe. One consequence of your approach is that there's a column called `index`, which is not a good idea because attribute access for the column is broken (i.e. you can't write `df.index` to get it, since that name already refers to the dataframe's own row index).
Your use of `isin`, the separate treatment of `after` and `before`, and your `enumerate` and `replace` can go away. Consider instead a broadcast comparison of time deltas.
Suggested
from datetime import date
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Synthetic daily series: uniformly drawn integers, of which the rare
# negative ones are the values of interest.
rng = np.random.default_rng(12345)
df = pd.DataFrame({
    'when': pd.date_range(start=date(2019, 1, 1), periods=1035, freq='D'),
    'value': rng.integers(low=-100, high=30_000, size=1035),
})

# Boolean marker for the interesting values
selected = df['value'] < 0

# Broadcast every day (rows) against every interesting day (columns) and
# keep the pairs that lie strictly less than 14 days apart.
all_selected = (
    np.abs(
        df['when'].values[:, None]
        - df.loc[selected, 'when'].values[None, :]
    )
    < pd.to_timedelta(14, unit='D')
)
# A day belongs to a window when it is close to at least one such day.
df['grouped'] = all_selected.any(axis=1)

# Prepend a False so a window starting on the very first day still shows
# up as a False -> True transition.
grouped = np.concatenate(([False], df['grouped'].to_numpy()))
changes = grouped[1:] != grouped[:-1]
# Every window contributes an "enter" and a "leave" transition, so halving
# the running transition count numbers the windows 0, 1, 2, ...
df['slice'] = np.cumsum(changes) // 2

# Keep only the days that fall inside some window
df = df[df['grouped']]
# Plotting
# Each window normally contributes two transitions to `changes` (enter and
# leave), but a window that runs through the last row has no "leave", so
# the total can be odd; round the half up so that window is not dropped.
rows = (int(changes.sum()) + 1) // 2


# Function to fill the subplots
def subplot_df(ax: plt.Axes, x: int) -> None:
    """Draw the 'value' series of one window on the given axes.

    Reads the module-level ``df``, keeps the rows whose 'slice' equals
    ``x`` and plots 'value' against 'when' as a line, then blanks the
    x-axis label.

    Parameters
    ----------
    ax : axes the line is drawn on.
    x : window number to select from the 'slice' column.
    """
    # NOTE(review): seaborn 0.12+ deprecates `ci` in favour of
    # `errorbar=None`; switch once older seaborn no longer needs support.
    sns.lineplot(
        x='when',
        y='value',
        ci=None,
        data=df[df['slice'] == x],
        ax=ax,
    )
    ax.set_xlabel('')


# Filling a single or multiple subplots.
# plt.subplots rejects nrows=0, so only build a figure when there is
# something to draw.
if rows:
    f, ax = plt.subplots(
        nrows=rows,
        figsize=(12, 2*rows),
        sharex=False,
        sharey=False,
    )
    # With a single row plt.subplots hands back a bare Axes rather than an
    # array; np.atleast_1d lets one loop serve both shapes.
    for x, group_ax in enumerate(np.atleast_1d(ax)):
        subplot_df(group_ax, x)

plt.show()
-
\$\begingroup\$ Can you explain two points to me? 1. I tried to read the np.bitwise_or.reduce documentation, but I am struggling to understand what this function does; I don't know whether my language or my computing skills are the limitation. 2. Following on from the first question, I don't understand, mathematically speaking, why the // (floor) division is necessary. Thanks for your support! \$\endgroup\$GregOliveira– GregOliveira2022年08月08日 13:11:42 +00:00Commented Aug 8, 2022 at 13:11
-
1\$\begingroup\$ See numpy.org/doc/stable/reference/generated/… \$\endgroup\$Reinderien– Reinderien2022年08月08日 14:15:36 +00:00Commented Aug 8, 2022 at 14:15
-
1\$\begingroup\$ Given a 4*n boolean array, that applies 'or' over the smaller axis to produce an array of length `n` \$\endgroup\$Reinderien– Reinderien2022年08月08日 14:17:02 +00:00Commented Aug 8, 2022 at 14:17 -
\$\begingroup\$ Floor division is necessary in that case because you don't want group indices of 0.5 and so on: those need to be integers \$\endgroup\$Reinderien– Reinderien2022年08月08日 14:17:59 +00:00Commented Aug 8, 2022 at 14:17