Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit e7ad566

Browse files
Save Correction Output (#50)
* Save the corrected output to a folder
1 parent a8d6b26 commit e7ad566

File tree

3 files changed

+69
-24
lines changed

3 files changed

+69
-24
lines changed

‎ExtractTable/__init__.py‎

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -211,13 +211,19 @@ def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv
211211

212212
if output_folder:
213213
if not os.path.exists(output_folder):
214-
output_folder = os.path.split(table_outputs_path[0])[0]
215-
warnings.warn(f"Your output_folder not exists. Saving the outputs to {output_folder}")
216-
else:
217-
for each_tbl_path in table_outputs_path:
218-
shutil.move(each_tbl_path, os.path.join(output_folder, input_fname+os.path.basename(each_tbl_path)))
214+
try:
215+
os.mkdir(output_folder)
216+
except Exception as e:
217+
warnings.warn(f"[Warn]: {str(e)}")
218+
warnings.warn(f"Failed to created output_folder not exists. Saving the outputs to {output_folder}")
219+
output_folder = os.path.dirname(table_outputs_path[0])
219220
else:
220-
output_folder = os.path.split(table_outputs_path[0])[0]
221+
output_folder = os.path.dirname(table_outputs_path[0])
222+
223+
if output_folder != os.path.dirname(table_outputs_path[0]):
224+
for each_tbl_path in table_outputs_path:
225+
shutil.move(each_tbl_path,
226+
os.path.join(output_folder, input_fname + os.path.basename(each_tbl_path)))
221227

222228
for each_page in self.server_response.get("Lines", []):
223229
page_txt_fname = os.path.join(output_folder, f"{input_fname}_Page_{str(each_page['Page'])}.txt")

‎ExtractTable/__version__.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
VERSION = (2, 3, 1)
1+
VERSION = (2, 4, 0)
22
PRERELEASE = None # "alpha", "beta" or "rc"
33
REVISION = None
44

‎ExtractTable/common.py‎

Lines changed: 56 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44
import os
55
import re
6+
import shutil
67
import tempfile
78
import warnings
89
import collections
@@ -84,6 +85,7 @@ def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):
8485
Default assumes all dataframes from the extracttable response, `et_resp`.
8586
If both `et_resp` and `dataframes` are provided, the later is considered for the processing
8687
"""
88+
self.et_resp = et_resp
8789
if et_resp:
8890
self.dataframes = ConvertTo(server_response=et_resp).output
8991

@@ -134,6 +136,7 @@ def split_merged_rows(self) -> List[pd.DataFrame]:
134136
reformat.append(row)
135137

136138
self.dataframes[df_idx] = pd.DataFrame(reformat)
139+
self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')
137140

138141
return self.dataframes
139142

@@ -147,12 +150,11 @@ def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool
147150
"""
148151
# TODO: Should we consider delimiter_pattern for the split?
149152
for df_idx, df in enumerate(self.dataframes):
150-
if not columns_idx:
151-
columns_idx=df.columns
153+
cols_idx = df.columns if not columns_idx else columns_idx.copy()
154+
cols_idx = [str(x) for x in cols_idx]
152155

153-
columns_idx = [str(x) for x in columns_idx]
154156
reformat = []
155-
for col_idx in columns_idx:
157+
for col_idx in cols_idx:
156158
tmp = df[col_idx].str.split(expand=True)
157159

158160
if not any([not any(tmp.isna().any()), force_split]) or tmp.shape[-1] == 1:
@@ -163,6 +165,7 @@ def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool
163165
reformat.extend([tmp[each].tolist() for each in tmp.columns])
164166

165167
self.dataframes[df_idx] = pd.DataFrame(reformat).T
168+
self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')
166169

167170
return self.dataframes
168171

@@ -185,11 +188,10 @@ def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: s
185188
decimal_position = int(decimal_position)
186189

187190
for df_idx, df in enumerate(self.dataframes):
188-
if not columns_idx:
189-
columns_idx = df.columns
190-
columns_idx = [str(x) for x in columns_idx]
191+
cols_idx = df.columns if not columns_idx else columns_idx.copy()
192+
cols_idx = [str(x) for x in cols_idx]
191193

192-
for col_idx in columns_idx:
194+
for col_idx in cols_idx:
193195
digits = df[col_idx].str.count(pat=r'\d').sum()
194196
chars = df[col_idx].str.count(pat=r'[\w]').sum()
195197

@@ -220,6 +222,8 @@ def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: s
220222
df[col_idx][i] = df[col_idx][i][:-(decimal_position+1)] + decimal_separator + df[col_idx][i][-decimal_position:]
221223

222224
self.dataframes[df_idx] = df
225+
self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')
226+
223227
return self.dataframes
224228

225229
def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
@@ -233,11 +237,10 @@ def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
233237
"""
234238
date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
235239
for df_idx, df in enumerate(self.dataframes):
236-
if not columns_idx:
237-
columns_idx = df.columns
238-
columns_idx = [str(x) for x in columns_idx]
240+
cols_idx = df.columns if not columns_idx else columns_idx.copy()
241+
cols_idx = [str(x) for x in cols_idx]
239242

240-
for col_idx in columns_idx:
243+
for col_idx in cols_idx:
241244
dates = df[col_idx].str.count(pat=date_regex).sum()
242245

243246
if not (dates >= len(df) * 0.75):
@@ -249,6 +252,7 @@ def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
249252
df[col_idx].replace(regex={date_regex: r'\1%s\4%s\6' % (delimiter, delimiter)}, inplace=True)
250253

251254
self.dataframes[df_idx] = df
255+
self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')
252256

253257
return self.dataframes
254258

@@ -263,14 +267,49 @@ def fix_characters(self, columns_idx: List[int] = None, replace_ref: dict = {}):
263267
:return: correted list of dataframes
264268
"""
265269
for df_idx, df in enumerate(self.dataframes):
266-
if not columns_idx:
267-
columns_idx = df.columns
268-
columns_idx = [str(x) for x in columns_idx]
270+
cols_idx = df.columns if not columns_idx else columns_idx.copy()
271+
cols_idx = [str(x) for x in cols_idx]
269272

270-
for col_idx in columns_idx:
273+
for col_idx in cols_idx:
271274
for find_ch, repl_ch in replace_ref.items():
272275
df[col_idx] = df[col_idx].str.replace(str(find_ch), str(repl_ch))
273276

274277
self.dataframes[df_idx] = df
275-
278+
self.et_resp['Tables'][df_idx]['TableJson'] =self.dataframes[df_idx].to_dict(orient='index')
276279
return self.dataframes
280+
281+
def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv", indexing: bool = False):
282+
"""
283+
Save the objects of session data to user preferred location or a default folder
284+
:param output_folder: user preferred output location; default tmp directory
285+
:param output_format: needed only for tables CSV or XLSX
286+
:param indexing: row & column index consideration in the output
287+
:return: location of the output
288+
"""
289+
input_fname = "corrected_"
290+
291+
output_format = output_format.lower()
292+
if output_format not in ("csv", "xlsx"):
293+
output_format = "csv"
294+
warnings.warn("Invalid 'output_format' given. Defaulted to 'csv'")
295+
296+
table_outputs_path = ConvertTo(server_response=self.et_resp, output_format=output_format, indexing=indexing).output
297+
298+
if output_folder:
299+
if not os.path.exists(output_folder):
300+
try:
301+
os.mkdir(output_folder)
302+
except Exception as e:
303+
warnings.warn(f"[Warn]: {str(e)}")
304+
warnings.warn(f"Failed to created output_folder not exists. Saving the outputs to {output_folder}")
305+
output_folder = os.path.dirname(table_outputs_path[0])
306+
else:
307+
output_folder = os.path.dirname(table_outputs_path[0])
308+
309+
if output_folder != os.path.dirname(table_outputs_path[0]):
310+
for each_tbl_path in table_outputs_path:
311+
shutil.move(each_tbl_path,
312+
os.path.join(output_folder, input_fname + os.path.basename(each_tbl_path)))
313+
314+
return output_folder
315+

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /