I am trying to program the calculation of the relative strength index (RSI) indicator described on a stockcharts.com page.
The same page gives the Excel formulas used for the calculations.
My code below is accurate, but when I have many rows its execution is quite slow, so I believe it could be optimized.
import pandas as pd
import numpy as np
pd.set_option('display.width', 300)
close = [44.3389, 44.0902, 44.1497, 43.6124, 44.3278, 44.8264, 45.0955, 45.4245, 45.8433, 46.0826, 45.8931, 46.0328, 45.6140, 46.2820, 46.2820, 46.0028, 46.0328, 46.4116, 46.2222, 45.6439, 46.2122, 46.2521, 45.7137, 46.4515, 45.7835, 45.3548, 44.0288, 44.1783, 44.2181, 44.5672, 43.4205, 42.6628, 43.1314]
date = pd.date_range('20170101',periods=len(close))
df = pd.DataFrame(close, columns=['close'], index=date)
period = 14
df['delta'] = df['close'].diff()
df['up'] = df['dn'] = df['delta']
df.loc[df['up'][df['up'] < 0].index, 'up'] = 0
df.loc[df['dn'][df['dn'] > 0].index, 'dn'] = 0
df['dn'] = abs(df['dn'])
df['avg_gain'] = df['avg_loss'] = 0
df['rs'] = df['rsi'] = np.nan
df.loc[df.index[period], 'avg_gain'] = np.mean(df.iloc[1:period + 1]['up'])
df.loc[df.index[period], 'avg_loss'] = np.mean(df.iloc[1:period + 1]['dn'])
df.loc[df.index[period], 'rs'] = df.loc[df.index[period], 'avg_gain'] / df.loc[df.index[period], 'avg_loss']
df.loc[df.index[period], 'rsi'] = 100.0 - (100.0 / (1.0 + df.loc[df.index[period], 'rs']))
# can this part be optimized?
avg_gain = df[0:period + 1]['avg_gain'].values
avg_loss = df[0:period + 1]['avg_loss'].values
for idx in range(period + 1, len(df.index)):
avg_gain = np.append(avg_gain, ((avg_gain[idx - 1] * (period - 1)) + df.up.values[idx]) / period)
avg_loss = np.append(avg_loss, ((avg_loss[idx - 1] * (period - 1)) + df.dn.values[idx]) / period)
df['avg_gain'] = avg_gain
df['avg_loss'] = avg_loss
# end of can this part be optimized?
df['rs'] = df['avg_gain'] / df['avg_loss']
df['rsi'] = 100.0 - (100.0 / (1.0 + df['rs']))
print(df)
The `for` loop is the biggest performance problem.
Another question I have: how do I avoid a potential division by zero at this line?
df['rs'] = df['avg_gain'] / df['avg_loss']
1 Answer
Recurrence
Most importantly, your `for` loop follows the form of a first-order non-homogeneous recurrence relation:
$$a_{n+1} = f_n a_n + g_n$$
$$ a_n = \left( \prod_{k=0}^{n-1} f_k \right) \left( A_0 + \sum_{m=0}^{n-1} \frac {g_m} {\prod_{k=0}^m f_k} \right) $$
where \$a_n\$ is `avg_gain_loss`, \$f_n\$ is `(period - 1)/period`, \$g_n\$ is the next row's `up_down` divided by `period`, and \$A_0\$ is `up_down[1: period+1].mean()`.
This successfully breaks the dependence of each `avg_gain` row on the value of the previous row and allows for vectorisation. Because \$f_n\$ is constant, this reduces to the following, with period \$p\$ and \$u_m\$ the `up_down` value \$m\$ rows after the seed row at index `period`:
$$ a_n = \left( \frac {p-1} p \right) ^n \left( A_0 + \frac 1 p \sum_{m=1} ^n u_m \left( \frac p {p-1} \right) ^m \right) $$
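As a quick sanity check (not part of the original derivation), the closed form can be verified against the explicit loop for an arbitrary constant \$f_n\$; a minimal sketch:
import numpy as np
# Verify a_{n+1} = f*a_n + g_n against the closed form above for constant f.
rng = np.random.default_rng(0)
f = 13/14                        # f_n = (period - 1)/period with period = 14
g = rng.normal(size=10)          # arbitrary g_0 .. g_9
a = [rng.normal()]               # A_0
for gn in g:
    a.append(f*a[-1] + gn)       # explicit recurrence, like the original loop
n = np.arange(len(a))
# closed form: a_n = f**n * (A_0 + sum_{m=0}^{n-1} g_m / f**(m+1))
inner = np.concatenate(([0.0], np.cumsum(g/f**(np.arange(g.size) + 1))))
assert np.allclose(a, f**n * (a[0] + inner))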
Pandas
Don't set `up` and `dn` through conditional indexing, and don't call `abs()`; use `clip()`.
When writing date literals, prefer `pd.Timestamp()` over a string.
When assigning multiple columns, call `df.assign()`. Or better yet: don't use Pandas until the end, when you construct the frame; prior to that, use pure Numpy.
To prevent the `rs` series from dividing by zero, only perform the division over the rows of the dataframe from `period` onward.
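A minimal sketch of the clip-based split and the restricted division (illustrative values only; the smoothed columns are made up):
import numpy as np
period = 3
delta = np.array([np.nan, 0.5, -0.2, 0.0, 0.4, -0.1])
up = delta.clip(min=0)      # gains:  [nan, 0.5, 0. , 0. , 0.4, 0. ]
dn = -delta.clip(max=0)     # losses: [nan, 0. , 0.2, 0. , 0. , 0.1]
avg_gain = np.array([0.0, 0.0, 0.0, 0.23, 0.25, 0.21])  # made-up smoothed values
avg_loss = np.array([0.0, 0.0, 0.0, 0.07, 0.05, 0.07])
rs = np.full_like(delta, np.nan)
rs[period:] = avg_gain[period:]/avg_loss[period:]   # never touches the 0/0 rows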
Delete all of these:
df['avg_gain'] = df['avg_loss'] = 0
df['rs'] = df['rsi'] = np.nan
df.loc[df.index[period], 'rs'] = df.loc[df.index[period], 'avg_gain'] / df.loc[df.index[period], 'avg_loss']
df.loc[df.index[period], 'rsi'] = 100.0 - (100.0 / (1.0 + df.loc[df.index[period], 'rs']))
They can be given "real" values later once they're ready.
Tests
Add a regression test. For this first approach, I dumped the entire dataframe from the original implementation to a CSV and then loaded it back as a reference.
import pandas as pd
import numpy as np
close = np.array((
44.3389, 44.0902, 44.1497, 43.6124, 44.3278, 44.8264, 45.0955, 45.4245, 45.8433, 46.0826,
45.8931, 46.0328, 45.6140, 46.2820, 46.2820, 46.0028, 46.0328, 46.4116, 46.2222, 45.6439,
46.2122, 46.2521, 45.7137, 46.4515, 45.7835, 45.3548, 44.0288, 44.1783, 44.2181, 44.5672,
43.4205, 42.6628, 43.1314,
))
df = pd.DataFrame(
index=pd.date_range(
name='datetime', periods=len(close),
start=pd.Timestamp(year=2017, month=1, day=1),
),
data={'close': close},
)
delta = df['close'].diff()
df = df.assign(delta=delta, up=delta.clip(lower=0), dn=-delta.clip(upper=0))
n = close.size
period = 14
up_down = df[['up', 'dn']].values
ii = np.arange(len(df) - period)
a0 = up_down[1: period + 1].mean(axis=0)
avg_gain_loss = np.zeros((n, 2))
# first-order non-homogeneous recurrence relation
scaled = up_down[period:]/((period - 1)/period)**ii[:, np.newaxis]
scaled[0] = 0  # row `period` is seeded by a0 alone; the sum starts at m = 1
inner_sum = scaled.cumsum(axis=0)
avg_gain_loss[period:] = (inner_sum/period + a0) * (
((period - 1)/period)**ii
)[:, np.newaxis]
avg_gain, avg_loss = avg_gain_loss.T
rs = np.full(shape=n, fill_value=np.nan)
rs[period:] = avg_gain[period:]/avg_loss[period:] # avoid divide-by-zero
df = df.assign(
avg_gain=avg_gain, avg_loss=avg_loss,
rs=rs, rsi=100*(1 - 1/(1 + rs)),
)
ref = pd.read_csv('reference.csv', index_col=0)
assert np.allclose(ref.values, df.values, equal_nan=True)
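The dump-and-reload pattern itself, reduced to a self-contained sketch (toy values; the filename matches the test above):
import numpy as np
import pandas as pd
# Minimal sketch of the dump-and-reload regression pattern (toy frame).
df = pd.DataFrame({'close': [44.34, 44.09], 'rsi': [np.nan, 55.0]},
                  index=pd.date_range('2017-01-01', periods=2, name='datetime'))
df.to_csv('reference.csv')                       # one-time dump of the trusted output
ref = pd.read_csv('reference.csv', index_col=0)  # later runs reload it...
assert np.allclose(ref.values, df.values, equal_nan=True)  # ...and compare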
Performance
Even without the recurrence solution above, you should almost never iteratively `append()`-build an array. Pre-allocate it and assign by index.
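For example, a minimal sketch of the difference (not RSI-specific):
import numpy as np
# np.append copies the whole array on every iteration: O(n^2) in total.
slow = np.array([])
for x in range(5):
    slow = np.append(slow, x*0.5)
# Pre-allocating once and assigning by index is O(n).
fast = np.empty(5)
for i in range(5):
    fast[i] = i*0.5
assert np.array_equal(slow, fast)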
Notice that the expressions for `avg_gain` and `avg_loss` are nearly identical. The expression should only be written once and executed on an n×2 (or 2×n) array.
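A minimal sketch of that idea, separate from the full `new_method` below (names assumed):
import numpy as np
delta = np.array([np.nan, 0.5, -0.2, 0.0, 0.4, -0.1])
# Stack gains and losses as the two rows of one (2, n) array...
up_down = np.vstack([delta.clip(min=0), -delta.clip(max=0)])
# ...so each later expression is written once and applied to both rows at once.
smoothed = up_down*(13/14)          # stand-in for the real smoothing step
avg_gain, avg_loss = smoothed       # unpack back into two 1-D series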
This demonstration is faster at scale by a factor of up to roughly ×390 (see the benchmark output below), depending on input size. Due to the daily datetime index, the longest possible input is about 80,000 rows before the date range overflows pandas' timestamp limit.
import time
import typing
import warnings
import pandas as pd
import numpy as np
def new_method(close: np.ndarray) -> pd.DataFrame:
n = close.size
delta = np.diff(close, prepend=np.nan)
up_down = np.empty(shape=(2, n), dtype=close.dtype)
delta.clip(min=0, out=up_down[0])
up_down[1] = -delta.clip(max=0)
up, dn = up_down
period = 14
ii = np.arange(n - period)
avg_gain_loss = np.zeros_like(up_down)
# first-order non-homogeneous recurrence relation
exp = ((period - 1)/period)**ii
a0 = up_down[:, 1: period + 1].mean(axis=1, keepdims=True)
    scaled = up_down[:, period:]/exp
    scaled[:, 0] = 0  # row `period` is seeded by a0 alone; the sum starts at m = 1
    inner_sum = scaled.cumsum(axis=1)
avg_gain_loss[:, period:] = (inner_sum/period + a0)*exp
avg_gain, avg_loss = avg_gain_loss
rs = np.full_like(avg_gain, fill_value=np.nan)
rs[period:] = avg_gain[period:]/avg_loss[period:] # avoid divide-by-zero
return pd.DataFrame(
index=pd.date_range(
name='datetime', periods=n,
start=pd.Timestamp(year=2017, month=1, day=1),
),
data={
'close': close, 'delta': delta, 'up': up, 'dn': dn,
'avg_gain': avg_gain, 'avg_loss': avg_loss,
'rs': rs, 'rsi': 100 - 100/(1 + rs),
},
)
def old_method(close: np.ndarray) -> pd.DataFrame:
date = pd.date_range('20170101', periods=len(close))
df = pd.DataFrame(close, columns=['close'], index=date)
period = 14
df['delta'] = df['close'].diff()
df['up'] = df['dn'] = df['delta']
df.loc[df['up'][df['up'] < 0].index, 'up'] = 0
df.loc[df['dn'][df['dn'] > 0].index, 'dn'] = 0
df['dn'] = abs(df['dn'])
df['avg_gain'] = df['avg_loss'] = 0
df['rs'] = df['rsi'] = np.nan
df.loc[df.index[period], 'avg_gain'] = np.mean(df.iloc[1:period + 1]['up'])
df.loc[df.index[period], 'avg_loss'] = np.mean(df.iloc[1:period + 1]['dn'])
df.loc[df.index[period], 'rs'] = df.loc[df.index[period], 'avg_gain'] / df.loc[df.index[period], 'avg_loss']
df.loc[df.index[period], 'rsi'] = 100.0 - (100.0 / (1.0 + df.loc[df.index[period], 'rs']))
avg_gain = df[0:period + 1]['avg_gain'].values
avg_loss = df[0:period + 1]['avg_loss'].values
for idx in range(period + 1, len(df.index)):
avg_gain = np.append(avg_gain, ((avg_gain[idx - 1] * (period - 1)) + df.up.values[idx]) / period)
avg_loss = np.append(avg_loss, ((avg_loss[idx - 1] * (period - 1)) + df.dn.values[idx]) / period)
df['avg_gain'] = avg_gain
df['avg_loss'] = avg_loss
df['rs'] = df['avg_gain'] / df['avg_loss']
df['rsi'] = 100.0 - (100.0 / (1.0 + df['rs']))
return df
def close_ref() -> np.ndarray:
return np.array((
44.3389, 44.0902, 44.1497, 43.6124, 44.3278, 44.8264, 45.0955, 45.4245, 45.8433, 46.0826,
45.8931, 46.0328, 45.6140, 46.2820, 46.2820, 46.0028, 46.0328, 46.4116, 46.2222, 45.6439,
46.2122, 46.2521, 45.7137, 46.4515, 45.7835, 45.3548, 44.0288, 44.1783, 44.2181, 44.5672,
43.4205, 42.6628, 43.1314,
))
def regression_test():
close = close_ref()
with warnings.catch_warnings():
warnings.simplefilter('ignore')
old = old_method(close)
new = new_method(close)
assert np.allclose(old.values, new.values, equal_nan=True, rtol=0, atol=1e-12)
def benchmark():
rand = np.random.default_rng(seed=0)
close = close_ref()
close = rand.normal(loc=close.mean(), scale=close.std(), size=80_000)
def run(method: typing.Callable[[np.ndarray], pd.DataFrame]) -> float:
with warnings.catch_warnings():
warnings.simplefilter('ignore')
start = time.perf_counter()
method(close)
dur = time.perf_counter() - start
print(f'{method.__name__}: {dur:.4f}')
return dur
old = run(old_method)
new = run(new_method)
print(f'Speedup: x{old/new:.0f}')
if __name__ == '__main__':
regression_test()
benchmark()
old_method: 4.1401
new_method: 0.0106
Speedup: x389