Commit e7ad566

authored

Save Correction Output (#50)

* Save the corrected output to a folder

1 parent a8d6b26 commit e7ad566Copy full SHA for e7ad566

File tree

3 files changed

+69

-24

lines changed

ExtractTable

3 files changed

+69

-24

lines changed

`‎ExtractTable/__init__.py‎`

Lines changed: 12 additions & 6 deletions

Original file line number	Diff line number	Diff line change
`@@ -211,13 +211,19 @@ def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv`
`211`	`211`
`212`	`212`	`if output_folder:`
`213`	`213`	`if not os.path.exists(output_folder):`
`214`		`- output_folder = os.path.split(table_outputs_path[0])[0]`
`215`		`- warnings.warn(f"Your output_folder not exists. Saving the outputs to {output_folder}")`
`216`		`- else:`
`217`		`- for each_tbl_path in table_outputs_path:`
`218`		`- shutil.move(each_tbl_path, os.path.join(output_folder, input_fname+os.path.basename(each_tbl_path)))`
	`214`	`+ try:`
	`215`	`+ os.mkdir(output_folder)`
	`216`	`+ except Exception as e:`
	`217`	`+ warnings.warn(f"[Warn]: {str(e)}")`
	`218`	`+ warnings.warn(f"Failed to created output_folder not exists. Saving the outputs to {output_folder}")`
	`219`	`+ output_folder = os.path.dirname(table_outputs_path[0])`
`219`	`220`	`else:`
`220`		`- output_folder = os.path.split(table_outputs_path[0])[0]`
	`221`	`+ output_folder = os.path.dirname(table_outputs_path[0])`
	`222`	`+`
	`223`	`+ if output_folder != os.path.dirname(table_outputs_path[0]):`
	`224`	`+ for each_tbl_path in table_outputs_path:`
	`225`	`+ shutil.move(each_tbl_path,`
	`226`	`+ os.path.join(output_folder, input_fname + os.path.basename(each_tbl_path)))`
`221`	`227`
`222`	`228`	`for each_page in self.server_response.get("Lines", []):`
`223`	`229`	`page_txt_fname = os.path.join(output_folder, f"{input_fname}_Page_{str(each_page['Page'])}.txt")`

`‎ExtractTable/__version__.py‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-VERSION = (2, 3, 1)`
	`1`	`+VERSION = (2, 4, 0)`
`2`	`2`	`PRERELEASE = None # "alpha", "beta" or "rc"`
`3`	`3`	`REVISION = None`
`4`	`4`

`‎ExtractTable/common.py‎`

Lines changed: 56 additions & 17 deletions

Original file line number	Diff line number	Diff line change
`@@ -3,6 +3,7 @@`
`3`	`3`	`"""`
`4`	`4`	`import os`
`5`	`5`	`import re`
	`6`	`+import shutil`
`6`	`7`	`import tempfile`
`7`	`8`	`import warnings`
`8`	`9`	`import collections`
`@@ -84,6 +85,7 @@ def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):`
`84`	`85`	Default assumes all dataframes from the extracttable response, `et_resp`.
`85`	`86`	If both `et_resp` and `dataframes` are provided, the later is considered for the processing
`86`	`87`	`"""`
	`88`	`+ self.et_resp = et_resp`
`87`	`89`	`if et_resp:`
`88`	`90`	`self.dataframes = ConvertTo(server_response=et_resp).output`
`89`	`91`
`@@ -134,6 +136,7 @@ def split_merged_rows(self) -> List[pd.DataFrame]:`
`134`	`136`	`reformat.append(row)`
`135`	`137`
`136`	`138`	`self.dataframes[df_idx] = pd.DataFrame(reformat)`
	`139`	`+ self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')`
`137`	`140`
`138`	`141`	`return self.dataframes`
`139`	`142`
`@@ -147,12 +150,11 @@ def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool`
`147`	`150`	`"""`
`148`	`151`	`# TODO: Should we consider delimiter_pattern for the split?`
`149`	`152`	`for df_idx, df in enumerate(self.dataframes):`
`150`		`- if not columns_idx:`
`151`		`- columns_idx=df.columns`
	`153`	`+ cols_idx = df.columns if not columns_idx else columns_idx.copy()`
	`154`	`+ cols_idx = [str(x) for x in cols_idx]`
`152`	`155`
`153`		`- columns_idx = [str(x) for x in columns_idx]`
`154`	`156`	`reformat = []`
`155`		`- for col_idx in columns_idx:`
	`157`	`+ for col_idx in cols_idx:`
`156`	`158`	`tmp = df[col_idx].str.split(expand=True)`
`157`	`159`
`158`	`160`	`if not any([not any(tmp.isna().any()), force_split]) or tmp.shape[-1] == 1:`
`@@ -163,6 +165,7 @@ def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool`
`163`	`165`	`reformat.extend([tmp[each].tolist() for each in tmp.columns])`
`164`	`166`
`165`	`167`	`self.dataframes[df_idx] = pd.DataFrame(reformat).T`
	`168`	`+ self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')`
`166`	`169`
`167`	`170`	`return self.dataframes`
`168`	`171`
`@@ -185,11 +188,10 @@ def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: s`
`185`	`188`	`decimal_position = int(decimal_position)`
`186`	`189`
`187`	`190`	`for df_idx, df in enumerate(self.dataframes):`
`188`		`- if not columns_idx:`
`189`		`- columns_idx = df.columns`
`190`		`- columns_idx = [str(x) for x in columns_idx]`
	`191`	`+ cols_idx = df.columns if not columns_idx else columns_idx.copy()`
	`192`	`+ cols_idx = [str(x) for x in cols_idx]`
`191`	`193`
`192`		`- for col_idx in columns_idx:`
	`194`	`+ for col_idx in cols_idx:`
`193`	`195`	`digits = df[col_idx].str.count(pat=r'\d').sum()`
`194`	`196`	`chars = df[col_idx].str.count(pat=r'[\w]').sum()`
`195`	`197`
`@@ -220,6 +222,8 @@ def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: s`
`220`	`222`	`df[col_idx][i] = df[col_idx][i][:-(decimal_position+1)] + decimal_separator + df[col_idx][i][-decimal_position:]`
`221`	`223`
`222`	`224`	`self.dataframes[df_idx] = df`
	`225`	`+ self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')`
	`226`	`+`
`223`	`227`	`return self.dataframes`
`224`	`228`
`225`	`229`	`def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):`
`@@ -233,11 +237,10 @@ def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):`
`233`	`237`	`"""`
`234`	`238`	`date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'`
`235`	`239`	`for df_idx, df in enumerate(self.dataframes):`
`236`		`- if not columns_idx:`
`237`		`- columns_idx = df.columns`
`238`		`- columns_idx = [str(x) for x in columns_idx]`
	`240`	`+ cols_idx = df.columns if not columns_idx else columns_idx.copy()`
	`241`	`+ cols_idx = [str(x) for x in cols_idx]`
`239`	`242`
`240`		`- for col_idx in columns_idx:`
	`243`	`+ for col_idx in cols_idx:`
`241`	`244`	`dates = df[col_idx].str.count(pat=date_regex).sum()`
`242`	`245`
`243`	`246`	`if not (dates >= len(df) * 0.75):`
`@@ -249,6 +252,7 @@ def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):`
`249`	`252`	`df[col_idx].replace(regex={date_regex: r'\1%s\4%s\6' % (delimiter, delimiter)}, inplace=True)`
`250`	`253`
`251`	`254`	`self.dataframes[df_idx] = df`
	`255`	`+ self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')`
`252`	`256`
`253`	`257`	`return self.dataframes`
`254`	`258`
`@@ -263,14 +267,49 @@ def fix_characters(self, columns_idx: List[int] = None, replace_ref: dict = {}):`
`263`	`267`	`:return: correted list of dataframes`
`264`	`268`	`"""`
`265`	`269`	`for df_idx, df in enumerate(self.dataframes):`
`266`		`- if not columns_idx:`
`267`		`- columns_idx = df.columns`
`268`		`- columns_idx = [str(x) for x in columns_idx]`
	`270`	`+ cols_idx = df.columns if not columns_idx else columns_idx.copy()`
	`271`	`+ cols_idx = [str(x) for x in cols_idx]`
`269`	`272`
`270`		`- for col_idx in columns_idx:`
	`273`	`+ for col_idx in cols_idx:`
`271`	`274`	`for find_ch, repl_ch in replace_ref.items():`
`272`	`275`	`df[col_idx] = df[col_idx].str.replace(str(find_ch), str(repl_ch))`
`273`	`276`
`274`	`277`	`self.dataframes[df_idx] = df`
`275`		`-`
	`278`	`+ self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')`
`276`	`279`	`return self.dataframes`
	`280`	`+`
	`281`	`+ def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv", indexing: bool = False):`
	`282`	`+ """`
	`283`	`+ Save the objects of session data to user preferred location or a default folder`
	`284`	`+ :param output_folder: user preferred output location; default tmp directory`
	`285`	`+ :param output_format: needed only for tables CSV or XLSX`
	`286`	`+ :param indexing: row & column index consideration in the output`
	`287`	`+ :return: location of the output`
	`288`	`+ """`
	`289`	`+ input_fname = "corrected_"`
	`290`	`+`
	`291`	`+ output_format = output_format.lower()`
	`292`	`+ if output_format not in ("csv", "xlsx"):`
	`293`	`+ output_format = "csv"`
	`294`	`+ warnings.warn("Invalid 'output_format' given. Defaulted to 'csv'")`
	`295`	`+`
	`296`	`+ table_outputs_path = ConvertTo(server_response=self.et_resp, output_format=output_format, indexing=indexing).output`
	`297`	`+`
	`298`	`+ if output_folder:`
	`299`	`+ if not os.path.exists(output_folder):`
	`300`	`+ try:`
	`301`	`+ os.mkdir(output_folder)`
	`302`	`+ except Exception as e:`
	`303`	`+ warnings.warn(f"[Warn]: {str(e)}")`
	`304`	`+ warnings.warn(f"Failed to created output_folder not exists. Saving the outputs to {output_folder}")`
	`305`	`+ output_folder = os.path.dirname(table_outputs_path[0])`
	`306`	`+ else:`
	`307`	`+ output_folder = os.path.dirname(table_outputs_path[0])`
	`308`	`+`
	`309`	`+ if output_folder != os.path.dirname(table_outputs_path[0]):`
	`310`	`+ for each_tbl_path in table_outputs_path:`
	`311`	`+ shutil.move(each_tbl_path,`
	`312`	`+ os.path.join(output_folder, input_fname + os.path.basename(each_tbl_path)))`
	`313`	`+`
	`314`	`+ return output_folder`
	`315`	`+`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit e7ad566

File tree

3 files changed

3 files changed

`‎ExtractTable/__init__.py‎`

`‎ExtractTable/__version__.py‎`

`‎ExtractTable/common.py‎`

0 commit comments

File tree

3 files changed

3 files changed

‎ExtractTable/__init__.py‎

‎ExtractTable/__version__.py‎

‎ExtractTable/common.py‎

0 commit comments

`‎ExtractTable/__init__.py‎`

`‎ExtractTable/__version__.py‎`

`‎ExtractTable/common.py‎`