Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 7502a61

Browse files
BUG: read_csv(on_bad_lines=callable)+index_col should warn; add test
- Always emit ParserWarning and drop extra fields when an on_bad_lines callable returns more elements than expected, regardless of index_col, in PythonParser._rows_to_cols (GH#61837).
- Ensure non-bad rows are appended in the outer else branch so good lines are preserved.
- Add regression test pandas/tests/io/parser/test_python_parser_only.py::test_on_bad_lines_callable_warns_and_truncates_with_index_col covering index_col in [None, 0].

Closes #61837.
1 parent d4bac86 commit 7502a61

File tree

2 files changed

+52
-2
lines changed

2 files changed

+52
-2
lines changed

‎pandas/io/parsers/python_parser.py‎

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,21 +1189,30 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
11891189

11901190
for i, _content in iter_content:
11911191
actual_len = len(_content)
1192-
11931192
if actual_len > col_len:
11941193
if callable(self.on_bad_lines):
11951194
new_l = self.on_bad_lines(_content)
11961195
if new_l is not None:
1196+
# Truncate extra elements and warn.
1197+
if len(new_l) > col_len:
1198+
warnings.warn(
1199+
"Header/names length != data length. "
1200+
"Extra fields dropped.",
1201+
ParserWarning,
1202+
stacklevel=find_stack_level(),
1203+
)
1204+
new_l = new_l[:col_len]
11971205
content.append(new_l) # pyright: ignore[reportArgumentType]
11981206
elif self.on_bad_lines in (
11991207
self.BadLineHandleMethod.ERROR,
12001208
self.BadLineHandleMethod.WARN,
12011209
):
12021210
row_num = self.pos - (content_len - i + footers)
12031211
bad_lines.append((row_num, actual_len))
1204-
12051212
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
12061213
break
1214+
else:
1215+
content.append(_content)
12071216
else:
12081217
content.append(_content)
12091218

‎pandas/tests/io/parser/test_python_parser_only.py‎

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -562,3 +562,44 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp
562562
expected = DataFrame(expected)
563563
expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
564564
tm.assert_frame_equal(result, expected)
565+
566+
567+
@pytest.mark.parametrize("index_col", [None, 0])
568+
def test_on_bad_lines_callable_warns_and_truncates_with_index_col(
569+
python_parser_only, index_col
570+
):
571+
"""
572+
GH#61837 regression: callable on_bad_lines returning extra fields must emit a
573+
ParserWarning and drop extras regardless of index_col.
574+
"""
575+
parser = python_parser_only
576+
data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"
577+
578+
def fixer(bad_line):
579+
# Over-return to trigger truncation + warning
580+
return list(bad_line) + ["EXTRA1", "EXTRA2"]
581+
582+
# Assert ParserWarning is emitted using module helper
583+
df = parser.read_csv_check_warnings(
584+
ParserWarning,
585+
"Length of header or names",
586+
StringIO(data),
587+
on_bad_lines=fixer,
588+
index_col=index_col,
589+
)
590+
591+
if index_col is None:
592+
expected = DataFrame(
593+
{
594+
"id": [101, 102, 103],
595+
"field_1": ["A", "C", "F"],
596+
"field_2": ["B", "D", "G"],
597+
}
598+
)
599+
else:
600+
expected = DataFrame(
601+
{"field_1": ["A", "C", "F"], "field_2": ["B", "D", "G"]},
602+
index=Index([101, 102, 103], name="id"),
603+
)
604+
605+
tm.assert_frame_equal(df, expected)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /