I'm writing a small ETL pipeline that loads data from CSVs, cleans each table a bit, and then loads everything into a PostgreSQL database. I was planning to use pandas for its built-in capabilities, but I'm wondering whether to subclass DataFrame or just do everything functionally.
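The final load step will be something simple along these lines (the connection string and table names are placeholders, not the real ones):

import pandas
import sqlalchemy


def load_table(df: pandas.DataFrame, table_name: str) -> None:
    # Placeholder connection string; the real one will come from config
    engine = sqlalchemy.create_engine("postgresql://user:password@localhost/etl")
    df.to_sql(table_name, engine, if_exists="append", index=False)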
The subclassed DataFrame code is pasted below. For maintainability by non-developers, I have a small YAML file with information about each table and column type.
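Roughly, the config is shaped like this (the table and column names here are made up for illustration; the real file just has more entries):

import yaml

# Illustrative config.yaml contents -- the keys match what the class reads below
EXAMPLE_CONFIG = """
patients:
    table: etl_patients        # target table in PostgreSQL
    file: patients.csv         # source CSV file
    notes: false               # whether the file has free-text note fields
    columns:                   # dtypes to feed into read_csv
        patient_id: object
        birth_date: object
"""

CFG = yaml.safe_load(EXAMPLE_CONFIG)
# CFG['patients']['table'] -> 'etl_patients', and so on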
import pandas
import numpy
import yaml
from os import path

CFG = yaml.load(open('config.yaml', 'r'))


class ETLDataTable(pandas.DataFrame):

    _metadata = ['table_name', 'file_name', 'columns', 'notes']

    @property
    def _constructor(self):
        return ETLDataTable

    def __init__(self, table_name):
        # Name of the database table
        self.table_name = CFG[table_name]['table']
        # Name of the CSV file
        self.file_name = CFG[table_name]['file']
        # Whether file has note fields
        self.notes = CFG[table_name]['notes']
        # Data Types to feed into read_csv
        try:
            self.columns = CFG[table_name]['columns']
        except:
            pass
        _ = path.join(path.abspath(path.pardir), self.file_name)
        super().__init__(pandas.read_csv(_))

    def load_df(self, root_path, **kwargs):
        """Read the csv associated with the table name,
        then import as a pandas DataFrame
        """
        _ = path.join(path.abspath(path.pardir), self.file_name)
        pandas.read_csv(csv_path,
                        na_values=['00000000', ' ', ''],
                        encoding="latin1",
                        dtype="object",
                        **kwargs)
Going forward I was planning to add some methods that every table needs: fixing bad dates, stripping empty strings, etc. Is this approach going to be more trouble than it's worth?
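For concreteness, the kind of cleaning method I have in mind would look roughly like this (the name and implementation are just a placeholder):

    # inside ETLDataTable
    def strip_empty_strings(self):
        # Replace empty / whitespace-only strings with NaN in object columns
        obj_cols = self.select_dtypes(include="object").columns
        self[obj_cols] = self[obj_cols].apply(
            lambda col: col.str.strip().replace("", numpy.nan)
        )
        return self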
1 Answer
For the parsing and analysis of my tests, I did something like this:
import pandas as pd
from pathlib import Path


class MyTest:
    def __init__(self, settings: dict, root_dir: Path):
        self.table_name = settings['table']
        self.file_name = settings['file']
        ...
        self.columns = settings.get('columns', None)

        filename = root_dir / self.file_name
        data = read_data(filename, columns=self.columns)
        self._data = fix_data(data)

    def summary_x(self):
        ...
        return None

    def get_data_between(self, date1, date2):
        # optionally parsing the dates
        return self._data[self._data['date'].between(date1, date2)]
    ...


def read_data(filename, **kwargs) -> pd.DataFrame:
    return pd.read_csv(
        filename,
        na_values=['00000000', ' ', ''],
        encoding="latin1",
        dtype="object",
        **kwargs,
    )


def fix_data(data: pd.DataFrame, date_cols=None) -> pd.DataFrame:
    if date_cols:
        date_cols = (date_cols,) if isinstance(date_cols, str) else date_cols
        for col in date_cols:
            data[col] = fix_dates(data[col])
    ...
    return data


def fix_dates(data: pd.Series) -> pd.Series:
    # optionally a column
    pass


def remove_empty_strings(data):
    pass
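If it helps, the two stubs could be filled in along these lines; this assumes the bad dates arrive as 8-digit YYYYMMDD strings (which the '00000000' in na_values hints at, but adjust to your data):

def fix_dates(data: pd.Series) -> pd.Series:
    # Assumes YYYYMMDD strings; anything unparseable becomes NaT
    return pd.to_datetime(data, format="%Y%m%d", errors="coerce")


def remove_empty_strings(data: pd.DataFrame) -> pd.DataFrame:
    # Turn empty / whitespace-only strings into proper missing values
    return data.replace(r"^\s*$", pd.NA, regex=True)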
Once I was at the stage where I had multiple types of tests, I made a generic type and subclassed that. But I see little value in subclassing pandas.DataFrame, because then you also have to take care not to accidentally overwrite any of its methods and attributes. Just have a data property.
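A minimal sketch of what that looks like for your case (class and attribute names are just illustrative, reusing read_data and fix_data from above):

class ETLDataTable:
    """Composition: hold the DataFrame on an attribute instead of subclassing it."""

    def __init__(self, settings: dict, root_dir: Path):
        self.table_name = settings['table']
        self.file_name = settings['file']
        self._data = fix_data(read_data(root_dir / self.file_name))

    @property
    def data(self) -> pd.DataFrame:
        # Full pandas API available here, with no risk of clobbering
        # DataFrame attributes such as .columns
        return self._data

Callers that need pandas functionality reach for table.data, while everything table-specific (date fixing, empty-string stripping, the eventual load into PostgreSQL) lives on your own class.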