I was recently given a task to implement a data-checker script which given an input file containing (date, last_price) values should check for 3 different kind of errors - missing values, stale values and outliers and returns list of errors.
I wrote the code below, and the feedback I got was that it was not "pythonic" enough. Can someone please let me know how I can make this more pythonic? What parts look good and what don't? It will be extremely helpful as I write more python code in future.
#!/usr/bin/python3.6
import sys
import csv
import pprint
import statistics
from datetime import date
class DataChecker:
class DatePrice:
def __init__(self, input_date, price):
try:
day, month, year = input_date.split("/")
self._price_date = date(int(year), int(month), int(day))
except ValueError:
# Don't tolerate invalid date
raise
try:
self._price = float(price)
except (TypeError, ValueError):
self._price = 0
@property
def date(self):
return self._price_date.strftime("%d/%m/%Y")
@property
def date_obj(self):
return self._price_date
@property
def price(self):
return self._price
def __repr__(self):
return f"{self.date}, {self.price}"
def __init__(self, input_date_price_values):
self._date_price_values = []
for date, price in input_date_price_values:
try:
self._date_price_values.append(self.DatePrice(date, price))
except ValueError:
pass
self._date_price_values.sort(key=lambda x: x.date_obj)
self._stale_price_dict = {}
self._outlier_low, self._outlier_high = self._calculate_outlier_thresholds_using_iqr(
self._date_price_values
)
def check_for_errors(self):
"""
returns -> List[tuple(date, float, error)]
errors = 'missing value', 'stale value' or 'outlier'
Uses 3 different error checkers to check for errors in data
1. Checks for missing values in data -> categorises missing values as any value == 0, or empty string or nulls
2. Checks for stale values in data -> categorises stale values as any value that remains unchanged
for 5 business days. For stale values it returns the last date on which it was repeated
3. Checks for outlier values in data -> Uses Interquartile range (IQR) and a low threshold of
first-quartile-value - 1.2 x IQR and high-threshold of third-quartile-value + 1.2 x IQR.
Any values outside this range are deemed as outliers
"""
errors = []
for datePrice in self._date_price_values:
if self._is_value_missing(datePrice.price):
self._add_to_errors(datePrice, "missing value", errors)
elif self._is_value_stale(datePrice.price):
self._add_to_errors(datePrice, "stale value", errors)
elif self._is_value_outlier(datePrice.price):
self._add_to_errors(datePrice, "outlier", errors)
else:
continue
return errors
def _add_to_errors(self, datePrice, error_string, errors):
error_tuple = (datePrice.date, datePrice.price, error_string)
errors.append(error_tuple)
def _is_value_missing(self, price):
if price is None or price == 0:
return True
return False
def _is_value_stale(self, price):
if price in self._stale_price_dict:
self._stale_price_dict[price] += 1
if self._stale_price_dict[price] >= 5: # 5 business days in week
return True
else:
self._stale_price_dict.clear()
self._stale_price_dict[price] = 1
return False
def _is_value_outlier(self, price):
if price < self._outlier_low or price > self._outlier_high:
return True
return False
def _calculate_outlier_thresholds_using_iqr(self, data_price_values):
price_values = sorted([dataPrice.price for dataPrice in data_price_values])
median_index = len(price_values) // 2
first_quartile = statistics.median(price_values[:median_index])
third_quartile = statistics.median(price_values[median_index + 1 :])
iqr = third_quartile - first_quartile
low_iqr = first_quartile - 1.2 * iqr
high_iqr = third_quartile + 1.2 * iqr
return low_iqr, high_iqr
def _calculate_outlier_thresholds_using_mean_deviation(self, data_price_values):
price_values = sorted([dataPrice.price for dataPrice in data_price_values])
mean_value = statistics.mean(price_values)
std_dev = statistics.stdev(price_values)
low_iqr = mean_value - 2 * std_dev
high_iqr = mean_value + 2 * std_dev
return low_iqr, high_iqr
def check_file_data(file_path):
with open(file_path) as data_file:
raw_data = csv.DictReader(data_file)
input_data = []
for row in raw_data:
input_data.append((row["Date"], row["Last Price"]))
data_checker = DataChecker(input_data)
errors = data_checker.check_for_errors()
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(errors)
print(f"Total Errors Found: {len(errors)}")
return errors
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Please provide filepath")
sys.exit()
file_path = sys.argv[1]
check_file_data(file_path)
To test, put the above code in a file called "data_checker.py" and test code below in a file called "test_data_checker.py" in the same directory and run:
python3.6 -m pytest -v test_data_checker.py
import pytest
from data_checker import DataChecker
test_data = [
(
pytest.param(
[("01/02/2010", "10"), ("02/02/2010", "10.09"), ("03/02/2010", "10.12")],
[],
id="no-errors-in-data",
)
),
(
pytest.param(
[("01/02/2010", "0.0"), ("02/02/2010", ""), ("03/02/2010", "10.12")],
[("01/02/2010", 0, "missing value"), ("02/02/2010", 0, "missing value")],
id="2-zero-values",
)
),
(
pytest.param(
[
("01/02/2010", "2"),
("02/02/2010", "1.12"),
("03/02/2010", "1.12"),
("04/02/2010", "1.12"),
("05/02/2010", "1.12"),
("06/02/2010", "1.11"),
],
[],
id="4-repeated-values-no-stale",
)
),
(
pytest.param(
[
("01/02/2010", "1.10"),
("02/02/2010", "1.12"),
("03/02/2010", "1.12"),
("04/02/2010", "1.12"),
("05/02/2010", "1.12"),
("06/02/2010", "1.12"),
("07/02/2010", "1.11"),
],
[("06/02/2010", 1.12, "stale value")],
id="1-stale-value",
)
),
(
pytest.param(
[
("01/02/2010", "0"),
("02/02/2010", "1.12"),
("03/02/2010", "1.12"),
("04/02/2010", "1.12"),
("05/02/2010", "1.12"),
("06/02/2010", "1.12"),
("07/02/2010", "1.11"),
],
[("01/02/2010", 0, "missing value"), ("06/02/2010", 1.12, "stale value")],
id="1-missing-1-stale-value",
)
),
(
pytest.param(
[
("01/02/2010", "1.11"),
("02/02/2010", "5"),
("03/02/2010", "1.12"),
("04/02/2010", "1.11"),
("05/02/2010", "1.12"),
("06/02/2010", "1.12"),
("07/02/2010", "1.11"),
],
[("02/02/2010", 5, "outlier")],
id="1-outlier-value",
)
),
(
pytest.param(
[
("01/02/2010", "0"),
("02/02/2010", "5"),
("03/02/2010", "1.12"),
("04/02/2010", "1.11"),
("05/02/2010", "1.12"),
("06/02/2010", "1.12"),
("07/02/2010", "1.12"),
("08/02/2010", "1.12"),
("09/02/2010", "1.12"),
],
[
("01/02/2010", 0, "missing value"),
("02/02/2010", 5, "outlier"),
("09/02/2010", 1.12, "stale value"),
],
id="missing-stale-outlier-value",
)
),
]
@pytest.mark.parametrize("input_data, expected_value", test_data)
def test_check_for_error(input_data, expected_value):
data_checker = DataChecker(input_data)
errors = data_checker.check_for_errors()
assert errors == expected_value
1 Answer 1
Naming
The class name DataChecker
is too generic. You should add a docstring
to describe what kind of data is being checked. It seems like price data of
some kind, but it would be useful to know what kind of prices as well.
Simpler
This function:
def _is_value_missing(self, price):
if price is None or price == 0:
return True
return False
can be simplified with a single return
:
def _is_value_missing(self, price):
return price is None or price == 0:
The same goes for function _is_value_outlier
.
Tools
ruff finds:
data_checker.py: F402 Import `date` from line 10 shadowed by loop variable
|
| def __init__(self, input_date_price_values):
| self._date_price_values = []
| for date, price in input_date_price_values:
| ^^^^ F402
| try:
| self._date_price_values.append(self.DatePrice(date, price))
|
Misc.
The check_for_errors
function has a very useful docstring.
This is a useful comment:
if self._stale_price_dict[price] >= 5: # 5 business days in week