33"""
44import os
55import re
6+ import shutil
67import tempfile
78import warnings
89import collections
@@ -84,6 +85,7 @@ def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):
8485 Default assumes all dataframes from the extracttable response, `et_resp`.
8586 If both `et_resp` and `dataframes` are provided, the latter is considered for the processing
8687 """
88+ self .et_resp = et_resp
8789 if et_resp :
8890 self .dataframes = ConvertTo (server_response = et_resp ).output
8991
@@ -134,6 +136,7 @@ def split_merged_rows(self) -> List[pd.DataFrame]:
134136 reformat .append (row )
135137
136138 self .dataframes [df_idx ] = pd .DataFrame (reformat )
139+ self .et_resp ['Tables' ][df_idx ]['TableJson' ] = self .dataframes [df_idx ].to_dict (orient = 'index' )
137140
138141 return self .dataframes
139142
@@ -147,12 +150,11 @@ def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool
147150 """
148151 # TODO: Should we consider delimiter_pattern for the split?
149152 for df_idx , df in enumerate (self .dataframes ):
150- if not columns_idx :
151- columns_idx = df . columns
153+ cols_idx = df . columns if not columns_idx else columns_idx . copy ()
154+ cols_idx = [ str ( x ) for x in cols_idx ]
152155
153- columns_idx = [str (x ) for x in columns_idx ]
154156 reformat = []
155- for col_idx in columns_idx :
157+ for col_idx in cols_idx :
156158 tmp = df [col_idx ].str .split (expand = True )
157159
158160 if not any ([not any (tmp .isna ().any ()), force_split ]) or tmp .shape [- 1 ] == 1 :
@@ -163,6 +165,7 @@ def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool
163165 reformat .extend ([tmp [each ].tolist () for each in tmp .columns ])
164166
165167 self .dataframes [df_idx ] = pd .DataFrame (reformat ).T
168+ self .et_resp ['Tables' ][df_idx ]['TableJson' ] = self .dataframes [df_idx ].to_dict (orient = 'index' )
166169
167170 return self .dataframes
168171
@@ -185,11 +188,10 @@ def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: s
185188 decimal_position = int (decimal_position )
186189
187190 for df_idx , df in enumerate (self .dataframes ):
188- if not columns_idx :
189- columns_idx = df .columns
190- columns_idx = [str (x ) for x in columns_idx ]
191+ cols_idx = df .columns if not columns_idx else columns_idx .copy ()
192+ cols_idx = [str (x ) for x in cols_idx ]
191193
192- for col_idx in columns_idx :
194+ for col_idx in cols_idx :
193195 digits = df [col_idx ].str .count (pat = r'\d' ).sum ()
194196 chars = df [col_idx ].str .count (pat = r'[\w]' ).sum ()
195197
@@ -220,6 +222,8 @@ def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: s
220222 df [col_idx ][i ] = df [col_idx ][i ][:- (decimal_position + 1 )] + decimal_separator + df [col_idx ][i ][- decimal_position :]
221223
222224 self .dataframes [df_idx ] = df
225+ self .et_resp ['Tables' ][df_idx ]['TableJson' ] = self .dataframes [df_idx ].to_dict (orient = 'index' )
226+ 223227 return self .dataframes
224228
225229 def fix_date_format (self , columns_idx : List [int ] = None , delimiter : str = "/" ):
@@ -233,11 +237,10 @@ def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
233237 """
234238 date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
235239 for df_idx , df in enumerate (self .dataframes ):
236- if not columns_idx :
237- columns_idx = df .columns
238- columns_idx = [str (x ) for x in columns_idx ]
240+ cols_idx = df .columns if not columns_idx else columns_idx .copy ()
241+ cols_idx = [str (x ) for x in cols_idx ]
239242
240- for col_idx in columns_idx :
243+ for col_idx in cols_idx :
241244 dates = df [col_idx ].str .count (pat = date_regex ).sum ()
242245
243246 if not (dates >= len (df ) * 0.75 ):
@@ -249,6 +252,7 @@ def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
249252 df [col_idx ].replace (regex = {date_regex : r'1円%s4円%s6円' % (delimiter , delimiter )}, inplace = True )
250253
251254 self .dataframes [df_idx ] = df
255+ self .et_resp ['Tables' ][df_idx ]['TableJson' ] = self .dataframes [df_idx ].to_dict (orient = 'index' )
252256
253257 return self .dataframes
254258
@@ -263,14 +267,49 @@ def fix_characters(self, columns_idx: List[int] = None, replace_ref: dict = {}):
263267 :return: corrected list of dataframes
264268 """
265269 for df_idx , df in enumerate (self .dataframes ):
266- if not columns_idx :
267- columns_idx = df .columns
268- columns_idx = [str (x ) for x in columns_idx ]
270+ cols_idx = df .columns if not columns_idx else columns_idx .copy ()
271+ cols_idx = [str (x ) for x in cols_idx ]
269272
270- for col_idx in columns_idx :
273+ for col_idx in cols_idx :
271274 for find_ch , repl_ch in replace_ref .items ():
272275 df [col_idx ] = df [col_idx ].str .replace (str (find_ch ), str (repl_ch ))
273276
274277 self .dataframes [df_idx ] = df
275- 278+ self . et_resp [ 'Tables' ][ df_idx ][ 'TableJson' ] = self . dataframes [ df_idx ]. to_dict ( orient = 'index' )
276279 return self .dataframes
def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv", indexing: bool = False):
    """
    Save the objects of session data to user preferred location or a default folder.

    The tables are first written by ``ConvertTo`` to its own output location; when a
    usable ``output_folder`` is given, the files are then moved there with a
    ``corrected_`` prefix added to each file name.

    :param output_folder: user preferred output location; when empty, or when it cannot
        be created, the folder ``ConvertTo`` wrote to is used instead
    :param output_format: needed only for tables CSV or XLSX; any other value falls back
        to "csv" with a warning
    :param indexing: row & column index consideration in the output
    :return: location of the output
    """
    input_fname = "corrected_"

    output_format = output_format.lower()
    if output_format not in ("csv", "xlsx"):
        output_format = "csv"
        warnings.warn("Invalid 'output_format' given. Defaulted to 'csv'")

    # NOTE(review): assumes `.output` is a list of file paths here - confirm against ConvertTo
    table_outputs_path = ConvertTo(server_response=self.et_resp, output_format=output_format, indexing=indexing).output

    # Folder the converter wrote into; guard against an empty result so that
    # indexing the first path cannot raise IndexError when there are no tables.
    default_folder = os.path.dirname(table_outputs_path[0]) if table_outputs_path else tempfile.gettempdir()

    if not output_folder:
        return default_folder

    if not os.path.exists(output_folder):
        try:
            # makedirs also creates missing parent folders, which a plain mkdir cannot do
            os.makedirs(output_folder, exist_ok=True)
        except (OSError, ValueError) as e:
            # Switch to the fallback BEFORE warning, so the message reports the
            # folder actually used rather than the one that failed.
            output_folder = default_folder
            warnings.warn(f"[Warn]: {str(e)}")
            warnings.warn(f"Failed to created output_folder not exists. Saving the outputs to {output_folder}")

    if output_folder != default_folder:
        for each_tbl_path in table_outputs_path:
            shutil.move(each_tbl_path,
                        os.path.join(output_folder, input_fname + os.path.basename(each_tbl_path)))

    return output_folder
315+
0 commit comments