This is a Django tag with 2 classes, but can be viewed as a module:
What does the module do? it loads a database, performs some transformations on the data, returns the output as a dictionary of json strings for the frontend of django to represent in slickgrid tables
It's all working fine, but my class is too abstract and ended using a lot of static methods because there is actually no state that I need.
This first class shouldn't be important, don't think there is something here to improve. I basically have 4 types in 3 categories and want to go from 4 pandas dataframes to 12 json strings that are passed to django frontend:
class RenderTag:
@staticmethod
def get_context_data():
annotations = Annotations()
df_type_1_category_1, df_type_1_category_2, df_type_1_category_3 = annotations.filter_categories(annotations.df_type_1)
df_type_2_category_1, df_type_2_category_2, df_type_2_category_3 = annotations.filter_categories(annotations.df_type_2)
df_type_3_category_1, df_type_3_category_2, df_type_3_category_3 = annotations.filter_categories(annotations.df_type_3)
df_type_4_category_1, df_type_4_category_2, df_type_4_category_3 = annotations.filter_categories(annotations.df_type_4)
# json data for js tables
json_for_category_1_1 = df_type_1_category_1.apply(lambda x: x.to_json(), axis=1)
json_for_category_1_2 = df_type_2_category_1.apply(lambda x: x.to_json(), axis=1)
json_for_category_1_3 = df_type_3_category_1.apply(lambda x: x.to_json(), axis=1)
json_for_category_1_4 = df_type_4_category_1.apply(lambda x: x.to_json(), axis=1)
json_for_category_2_1 = df_type_1_category_2.apply(lambda x: x.to_json(), axis=1)
json_for_category_2_2 = df_type_2_category_2.apply(lambda x: x.to_json(), axis=1)
json_for_category_2_3 = df_type_3_category_2.apply(lambda x: x.to_json(), axis=1)
json_for_category_2_4 = df_type_4_category_2.apply(lambda x: x.to_json(), axis=1)
json_for_category_3_1 = df_type_1_category_3.apply(lambda x: x.to_json(), axis=1)
json_for_category_3_2 = df_type_2_category_3.apply(lambda x: x.to_json(), axis=1)
json_for_category_3_3 = df_type_3_category_3.apply(lambda x: x.to_json(), axis=1)
json_for_category_3_4 = df_type_4_category_3.apply(lambda x: x.to_json(), axis=1)
context = {
"json_1_1": json_for_category_1_1.to_json(orient='split'),
"json_1_2": json_for_category_1_2.to_json(orient='split'),
"json_1_3": json_for_category_1_3.to_json(orient='split'),
"json_1_4": json_for_category_1_4.to_json(orient='split'),
"json_2_1": json_for_category_2_1.to_json(orient='split'),
"json_2_2": json_for_category_2_2.to_json(orient='split'),
"json_2_3": json_for_category_2_3.to_json(orient='split'),
"json_2_4": json_for_category_2_4.to_json(orient='split'),
"json_3_1": json_for_category_3_1.to_json(orient='split'),
"json_3_2": json_for_category_3_2.to_json(orient='split'),
"json_3_3": json_for_category_3_3.to_json(orient='split'),
"json_3_4": json_for_category_3_4.to_json(orient='split'),
}
return context
This class I think needs a lot of improvement:
class Annotations:
def __init__(self):
# loading data
self.df_type_2 = helpers.load_database("type_2").round(2)
self.df_type_3 = helpers.load_database("type_3").round(2)
self.df_type_1 = helpers.load_database("type_1").round(2)
# main transformations
# type_2 and 4
self.df_type_2, self.df_type_4 = self.split_2_into_2_and_4(self.df_type_2)
self.df_type_4 = self.do_transformations_for_4(self.df_type_4)
self.df_type_2 = self.do_transformations_for_2(self.df_type_2)
# type_1
self.df_type_1 = self.do_transformations_for_1(self.df_type_1)
# type_3
self.df_type_3 = self.do_transformations_for_3(self.df_type_3)
# and I have 4 methods that call a lot of static functions
def do_transformations_for_1(self, df):
"""
This is the main function that edits the data for type 1
We take the main df and then we run a series of manipulations
Args:
df(pd.DataFrame): the df that we want to process
Returns:
df(pd.DataFrame): the end dataframe that will be transferred to the js file
"""
df["id"] = df.index
df = df.pipe(self.do_something)\
.pipe(self.do_something_1)\
.pipe(self.do_something_2)\
.pipe(self.do_something_3)\
.pipe(self.do_something_4)\
.pipe(self.do_something_5)\
.pipe(self.add_colors_log2foldchange)\
.pipe(self.fill_na_with_empty_strings)\
.pipe(helpers.sort_df_by_columns, self.columns_to_sort_snv)
return df
def do_transformations_for_2(self, df):
"""
This is a function that runs only for type 2
We take the main df and then we run a series of manipulations
Args:
df(pd.DataFrame): the df that we want to process
Returns:
df(pd.DataFrame): the end dataframe that will be transferred to the js file
"""
df = df.pipe(self.do_something) \
.pipe(self.add_colors_log2foldchange) \
.pipe(self.do_something_7)\
.pipe(helpers.sort_df_by_columns, self.columns_to_sort_type_4)\
return df
def do_transformations_for_3(self, df):
"""
This is a function that runs only for type 3. We take the main df and then we run a series of manipulations
Args:
df(pd.DataFrame): the df that we want to process
Returns:
df(pd.DataFrame): the end dataframe that will be transferred to the js file
"""
df = df.pipe(self.do_something, False) \
.pipe(self.do_something_9) \
.pipe(self.add_colors_log2foldchange) \
.pipe(helpers.sort_df_by_columns, self.columns_to_sort_type_3)
return df
def do_transformations_for_4(self, df):
"""
This is a function that runs only for the type_4
We take the main df and then we run a series of manipulations
Args:
df(pd.DataFrame): the df that we want to process
Returns:
df(pd.DataFrame): the end dataframe that will be transferred to the js file
"""
df = df.pipe(self.do_something, True) \
.pipe(self.do_something_9) \
.pipe(self.add_colors_log2foldchange) \
.pipe(helpers.sort_df_by_columns, self.columns_to_sort_type_2)
return df
# many static methods that are only used once or twice, deleted many of them
@staticmethod
def unicode_lists_to_string(df, columns):
for column in columns:
df[column] = df[column].str.strip("[]").str.replace("u'|'",'').str.replace(",",";")
return df
@staticmethod
def transform_type_4_position(df: pd.DataFrame):
"""
Remove copy number from position in type_4 table and also add chr in front
Args:
df(pd.DataFrame):
Returns:
pd.DataFrame: with position modified
"""
df["vid_position"] = "chr" + df["vid"].str.split(":").str[:3].str.join(":")
return df
@staticmethod
def filter_categories(df):
"""
Split the df by categories because we want them in separate tables
Args:
df: main df
Returns:
Tuple[pd.DataFrame]: a tuple of 3 dataframes from categories 1,2,3
"""
df_1 = df[df["category"] == 1]
df_2 = df[df["category"] == 2]
df_3 = df[df["category"] == 3]
return df_1, df_2, df_3
@staticmethod
def add_colors_log2foldchange(df: pd.DataFrame):
"""
We want to add background colors to log2foldchange values, fron blue to red
Args:
df(pd.DataFrame): df with log2foldchange values
Returns:
df(pd.DataFrame): df with a new hex_color column
"""
df_new = helpers.add_colors_to_df(df, helpers.get_colors(), "log2_fold_change")
df_new["hex_color"] = df_new["hex_color"].str.replace("#", "")
return df_new
@staticmethod
def edit_support_alt_ref(df: pd.DataFrame) -> pd.DataFrame:
"""
Args:
df(pd.DataFrame):
Returns:
pd.DataFrame:
"""
def strip_germline_from_alt_ref(row):
if pd.notna(row):
if "]," in row:
row = row.split("],")
row = row[1]
row = row.replace("[", "").replace("]", "").replace(",", "|").replace(" ", "")
return row
df["paired_end_reads"] = df["paired_end_reads"].apply(strip_germline_from_alt_ref)
df["split_end_reads"] = df["split_end_reads"].apply(strip_germline_from_alt_ref)
return df
As you can see I do modifications for all 4 types in 4 methods.
I think I can go along with not using a class here, but sort of want to use a class here to be inline with the whole project... I use static methods because they are obviously easier to unit test. Pass a df and return a df, easy unit test.
1 Answer 1
This first class shouldn't be important, don't think there is something here to improve.
You're wrong :)
There's a sea of repeated code here. You need to seriously DRY it up. I don't have enough of your system to test this myself, so you need to; but you should do something like
class RenderTag:
@staticmethod
def get_context_data():
annotations = Annotations()
def to_json(x):
return x.to_json()
context = {}
for i in range(1, 5):
df_type = getattr(annotations, f'df_type_{i}')
categories = annotations.filter_categories(df_type)
for j, category in enumerate(categories, 1):
js = category.apply(to_json, axis=1).to_json(orient='split')
context[f'json{j}_{i}'] = js
return context
That assumes that Annotations
cannot change. You can make it even simpler if Annotations.df_type
is stored as a 3-tuple, instead of three separate attributes. Elsewhere in your code, you really should carry this philosophy forward - instead of hard-coding three or four variables with numbers in the name, just maintain one tuple (if immutable) or list (if mutable).