I'm writing a small ETL pipeline that loads data from CSVs, cleans each table a bit, and then loads everything into a PostgreSQL database. I was planning to use pandas for its built-in capabilities, but I'm wondering whether to subclass DataFrame or just do everything functionally.
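The final load step will be something simple along these lines (the connection string and table names are placeholders, not the real ones):

import pandas
import sqlalchemy


def load_table(df: pandas.DataFrame, table_name: str) -> None:
    # Placeholder connection string; the real one will come from config
    engine = sqlalchemy.create_engine("postgresql://user:password@localhost/etl")
    df.to_sql(table_name, engine, if_exists="append", index=False)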
The subclassed DataFrame code is pasted below. For maintainability by non-developers, I have a small YAML file with information about each table and column type.
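Roughly, the config is shaped like this (the table and column names here are made up for illustration; the real file just has more entries):

import yaml

# Illustrative config.yaml contents -- the keys match what the class reads below
EXAMPLE_CONFIG = """
patients:
    table: etl_patients        # target table in PostgreSQL
    file: patients.csv         # source CSV file
    notes: false               # whether the file has free-text note fields
    columns:                   # dtypes to feed into read_csv
        patient_id: object
        birth_date: object
"""

CFG = yaml.safe_load(EXAMPLE_CONFIG)
# CFG['patients']['table'] -> 'etl_patients', and so on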
import pandas
import numpy
import yaml
from os import path

CFG = yaml.load(open('config.yaml', 'r'))


class ETLDataTable(pandas.DataFrame):

    _metadata = ['table_name', 'file_name', 'columns', 'notes']

    @property
    def _constructor(self):
        return ETLDataTable

    def __init__(self, table_name):
        # Name of the database table
        self.table_name = CFG[table_name]['table']
        # Name of the CSV file
        self.file_name = CFG[table_name]['file']
        # Whether file has note fields
        self.notes = CFG[table_name]['notes']
        # Data Types to feed into read_csv
        try:
            self.columns = CFG[table_name]['columns']
        except:
            pass
        _ = path.join(path.abspath(path.pardir), self.file_name)
        super().__init__(pandas.read_csv(_))

    def load_df(self, root_path, **kwargs):
        """Read the csv associated with the table name,
        then import as a pandas DataFrame
        """
        _ = path.join(path.abspath(path.pardir), self.file_name)
        pandas.read_csv(csv_path,
                        na_values=['00000000', ' ', ''],
                        encoding="latin1",
                        dtype="object",
                        **kwargs)
Going forward I was planning to add some methods that every table needs: fixing bad dates, stripping empty strings, etc. Is this approach going to be more trouble than it's worth?
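For concreteness, the kind of cleaning method I have in mind would look roughly like this (the name and implementation are just a placeholder):

    # inside ETLDataTable
    def strip_empty_strings(self):
        # Replace empty / whitespace-only strings with NaN in object columns
        obj_cols = self.select_dtypes(include="object").columns
        self[obj_cols] = self[obj_cols].apply(
            lambda col: col.str.strip().replace("", numpy.nan)
        )
        return self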
1 Answer
For the parsing and analysis of my tests, I did something like this:
import pandas as pd
from pathlib import Path


class MyTest:
    def __init__(self, settings: dict, root_dir: Path):
        self.table_name = settings['table']
        self.file_name = settings['file']
        ...
        self.columns = settings.get('columns', None)

        filename = root_dir / self.file_name
        data = read_data(filename, columns=self.columns)
        self._data = fix_data(data)

    def summary_x(self):
        ...
        return None

    def get_data_between(self, date1, date2):
        # optionally parsing the dates
        return self._data[self._data['date'].between(date1, date2)]
    ...


def read_data(filename, **kwargs) -> pd.DataFrame:
    return pd.read_csv(
        filename,
        na_values=['00000000', ' ', ''],
        encoding="latin1",
        dtype="object",
        **kwargs,
    )


def fix_data(data: pd.DataFrame, date_cols=None) -> pd.DataFrame:
    if date_cols:
        date_cols = (date_cols,) if isinstance(date_cols, str) else date_cols
        for col in date_cols:
            data[col] = fix_dates(data[col])
    ...
    return data


def fix_dates(data: pd.Series) -> pd.Series:
    # optionally a column
    pass


def remove_empty_strings(data):
    pass
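If it helps, the two stubs could be filled in along these lines; this assumes the bad dates arrive as 8-digit YYYYMMDD strings (which the '00000000' in na_values hints at, but adjust to your data):

def fix_dates(data: pd.Series) -> pd.Series:
    # Assumes YYYYMMDD strings; anything unparseable becomes NaT
    return pd.to_datetime(data, format="%Y%m%d", errors="coerce")


def remove_empty_strings(data: pd.DataFrame) -> pd.DataFrame:
    # Turn empty / whitespace-only strings into proper missing values
    return data.replace(r"^\s*$", pd.NA, regex=True)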
Once I was at the stage where I had multiple types of tests, I made a generic type and subclassed that. But I see little value in subclassing pandas.DataFrame, because then you also have to take care not to accidentally overwrite any of its methods and attributes. Just have a data property.
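A minimal sketch of what that looks like for your case (class and attribute names are just illustrative, reusing read_data and fix_data from above):

class ETLDataTable:
    """Composition: hold the DataFrame on an attribute instead of subclassing it."""

    def __init__(self, settings: dict, root_dir: Path):
        self.table_name = settings['table']
        self.file_name = settings['file']
        self._data = fix_data(read_data(root_dir / self.file_name))

    @property
    def data(self) -> pd.DataFrame:
        # Full pandas API available here, with no risk of clobbering
        # DataFrame attributes such as .columns
        return self._data

Callers that need pandas functionality reach for table.data, while everything table-specific (date fixing, empty-string stripping, the eventual load into PostgreSQL) lives on your own class.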