3
\$\begingroup\$

I was recently given a task to implement a data-checker script which given an input file containing (date, last_price) values should check for 3 different kind of errors - missing values, stale values and outliers and returns list of errors.

I wrote the code below, and the feedback I got was that it was not "pythonic" enough. Can someone please let me know how I can make this more pythonic? What parts look good and what don't? It will be extremely helpful as I write more python code in future.

#!/usr/bin/python3.6
import sys
import csv
import pprint
import statistics
from datetime import date
class DataChecker:
 class DatePrice:
 def __init__(self, input_date, price):
 try:
 day, month, year = input_date.split("/")
 self._price_date = date(int(year), int(month), int(day))
 except ValueError:
 # Don't tolerate invalid date
 raise
 try:
 self._price = float(price)
 except (TypeError, ValueError):
 self._price = 0
 @property
 def date(self):
 return self._price_date.strftime("%d/%m/%Y")
 @property
 def date_obj(self):
 return self._price_date
 @property
 def price(self):
 return self._price
 def __repr__(self):
 return f"{self.date}, {self.price}"
 def __init__(self, input_date_price_values):
 self._date_price_values = []
 for date, price in input_date_price_values:
 try:
 self._date_price_values.append(self.DatePrice(date, price))
 except ValueError:
 pass
 self._date_price_values.sort(key=lambda x: x.date_obj)
 self._stale_price_dict = {}
 self._outlier_low, self._outlier_high = self._calculate_outlier_thresholds_using_iqr(
 self._date_price_values
 )
 def check_for_errors(self):
 """
 returns -> List[tuple(date, float, error)]
 errors = 'missing value', 'stale value' or 'outlier'
 Uses 3 different error checkers to check for errors in data
 1. Checks for missing values in data -> categorises missing values as any value == 0, or empty string or nulls
 2. Checks for stale values in data -> categorises stale values as any value that remains unchanged 
 for 5 business days. For stale values it returns the last date on which it was repeated
 3. Checks for outlier values in data -> Uses Interquartile range (IQR) and a low threshold of 
 first-quartile-value - 1.2 x IQR and high-threshold of third-quartile-value + 1.2 x IQR. 
 Any values outside this range are deemed as outliers
 """
 errors = []
 for datePrice in self._date_price_values:
 if self._is_value_missing(datePrice.price):
 self._add_to_errors(datePrice, "missing value", errors)
 elif self._is_value_stale(datePrice.price):
 self._add_to_errors(datePrice, "stale value", errors)
 elif self._is_value_outlier(datePrice.price):
 self._add_to_errors(datePrice, "outlier", errors)
 else:
 continue
 return errors
 def _add_to_errors(self, datePrice, error_string, errors):
 error_tuple = (datePrice.date, datePrice.price, error_string)
 errors.append(error_tuple)
 def _is_value_missing(self, price):
 if price is None or price == 0:
 return True
 return False
 def _is_value_stale(self, price):
 if price in self._stale_price_dict:
 self._stale_price_dict[price] += 1
 if self._stale_price_dict[price] >= 5: # 5 business days in week
 return True
 else:
 self._stale_price_dict.clear()
 self._stale_price_dict[price] = 1
 return False
 def _is_value_outlier(self, price):
 if price < self._outlier_low or price > self._outlier_high:
 return True
 return False
 def _calculate_outlier_thresholds_using_iqr(self, data_price_values):
 price_values = sorted([dataPrice.price for dataPrice in data_price_values])
 median_index = len(price_values) // 2
 first_quartile = statistics.median(price_values[:median_index])
 third_quartile = statistics.median(price_values[median_index + 1 :])
 iqr = third_quartile - first_quartile
 low_iqr = first_quartile - 1.2 * iqr
 high_iqr = third_quartile + 1.2 * iqr
 return low_iqr, high_iqr
 def _calculate_outlier_thresholds_using_mean_deviation(self, data_price_values):
 price_values = sorted([dataPrice.price for dataPrice in data_price_values])
 mean_value = statistics.mean(price_values)
 std_dev = statistics.stdev(price_values)
 low_iqr = mean_value - 2 * std_dev
 high_iqr = mean_value + 2 * std_dev
 return low_iqr, high_iqr
def check_file_data(file_path):
 with open(file_path) as data_file:
 raw_data = csv.DictReader(data_file)
 input_data = []
 for row in raw_data:
 input_data.append((row["Date"], row["Last Price"]))
 data_checker = DataChecker(input_data)
 errors = data_checker.check_for_errors()
 pp = pprint.PrettyPrinter(indent=4)
 pp.pprint(errors)
 print(f"Total Errors Found: {len(errors)}")
 return errors
if __name__ == "__main__":
 if len(sys.argv) < 2:
 print("Please provide filepath")
 sys.exit()
 file_path = sys.argv[1]
 check_file_data(file_path)

To test, put the above code in a file called "data_checker.py" and test code below in a file called "test_data_checker.py" in the same directory and run:

python3.6 -m pytest -v test_data_checker.py

import pytest
from data_checker import DataChecker
test_data = [
 (
 pytest.param(
 [("01/02/2010", "10"), ("02/02/2010", "10.09"), ("03/02/2010", "10.12")],
 [],
 id="no-errors-in-data",
 )
 ),
 (
 pytest.param(
 [("01/02/2010", "0.0"), ("02/02/2010", ""), ("03/02/2010", "10.12")],
 [("01/02/2010", 0, "missing value"), ("02/02/2010", 0, "missing value")],
 id="2-zero-values",
 )
 ),
 (
 pytest.param(
 [
 ("01/02/2010", "2"),
 ("02/02/2010", "1.12"),
 ("03/02/2010", "1.12"),
 ("04/02/2010", "1.12"),
 ("05/02/2010", "1.12"),
 ("06/02/2010", "1.11"),
 ],
 [],
 id="4-repeated-values-no-stale",
 )
 ),
 (
 pytest.param(
 [
 ("01/02/2010", "1.10"),
 ("02/02/2010", "1.12"),
 ("03/02/2010", "1.12"),
 ("04/02/2010", "1.12"),
 ("05/02/2010", "1.12"),
 ("06/02/2010", "1.12"),
 ("07/02/2010", "1.11"),
 ],
 [("06/02/2010", 1.12, "stale value")],
 id="1-stale-value",
 )
 ),
 (
 pytest.param(
 [
 ("01/02/2010", "0"),
 ("02/02/2010", "1.12"),
 ("03/02/2010", "1.12"),
 ("04/02/2010", "1.12"),
 ("05/02/2010", "1.12"),
 ("06/02/2010", "1.12"),
 ("07/02/2010", "1.11"),
 ],
 [("01/02/2010", 0, "missing value"), ("06/02/2010", 1.12, "stale value")],
 id="1-missing-1-stale-value",
 )
 ),
 (
 pytest.param(
 [
 ("01/02/2010", "1.11"),
 ("02/02/2010", "5"),
 ("03/02/2010", "1.12"),
 ("04/02/2010", "1.11"),
 ("05/02/2010", "1.12"),
 ("06/02/2010", "1.12"),
 ("07/02/2010", "1.11"),
 ],
 [("02/02/2010", 5, "outlier")],
 id="1-outlier-value",
 )
 ),
 (
 pytest.param(
 [
 ("01/02/2010", "0"),
 ("02/02/2010", "5"),
 ("03/02/2010", "1.12"),
 ("04/02/2010", "1.11"),
 ("05/02/2010", "1.12"),
 ("06/02/2010", "1.12"),
 ("07/02/2010", "1.12"),
 ("08/02/2010", "1.12"),
 ("09/02/2010", "1.12"),
 ],
 [
 ("01/02/2010", 0, "missing value"),
 ("02/02/2010", 5, "outlier"),
 ("09/02/2010", 1.12, "stale value"),
 ],
 id="missing-stale-outlier-value",
 )
 ),
]
@pytest.mark.parametrize("input_data, expected_value", test_data)
def test_check_for_error(input_data, expected_value):
 data_checker = DataChecker(input_data)
 errors = data_checker.check_for_errors()
 assert errors == expected_value
toolic
14.6k5 gold badges29 silver badges204 bronze badges
asked Jun 19, 2019 at 6:08
\$\endgroup\$
0

1 Answer 1

3
\$\begingroup\$

Naming

The class name DataChecker is too generic. You should add a docstring to describe what kind of data is being checked. It seems like price data of some kind, but it would be useful to know what kind of prices as well.

Simpler

This function:

def _is_value_missing(self, price):
 if price is None or price == 0:
 return True
 return False

can be simplified with a single return:

def _is_value_missing(self, price):
 return price is None or price == 0:

The same goes for function _is_value_outlier.

Tools

ruff finds:

data_checker.py: F402 Import `date` from line 10 shadowed by loop variable
 |
 | def __init__(self, input_date_price_values):
 | self._date_price_values = []
 | for date, price in input_date_price_values:
 | ^^^^ F402
 | try:
 | self._date_price_values.append(self.DatePrice(date, price))
 |

Misc.

The check_for_errors function has a very useful docstring.

This is a useful comment:

if self._stale_price_dict[price] >= 5: # 5 business days in week
answered Jan 22 at 11:37
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.