Commit 7502a61

committed

BUG: read_csv(on_bad_lines=callable)+index_col should warn; add test

- Always emit a ParserWarning and drop extra fields when an on_bad_lines callable returns more elements than expected, regardless of index_col, in PythonParser._rows_to_cols (GH#61837).
- Ensure non-bad rows are appended in the outer else branch so good lines are preserved.
- Add regression test pandas/tests/io/parser/test_python_parser_only.py::test_on_bad_lines_callable_warns_and_truncates_with_index_col covering index_col in [None, 0].

Closes #61837.

1 parent d4bac86 · commit 7502a61 · Copy full SHA for 7502a61

File tree

2 files changed

+52

-2

lines changed

pandas
- io/parsers
  - python_parser.py
- tests/io/parser
  - test_python_parser_only.py

2 files changed

+52

-2

lines changed

`‎pandas/io/parsers/python_parser.py‎`

Lines changed: 11 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -1189,21 +1189,30 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:`
`1189`	`1189`
`1190`	`1190`	`for i, _content in iter_content:`
`1191`	`1191`	`actual_len = len(_content)`
`1192`		`-`
`1193`	`1192`	`if actual_len > col_len:`
`1194`	`1193`	`if callable(self.on_bad_lines):`
`1195`	`1194`	`new_l = self.on_bad_lines(_content)`
`1196`	`1195`	`if new_l is not None:`
	`1196`	`+ # Truncate extra elements and warn.`
	`1197`	`+ if len(new_l) > col_len:`
	`1198`	`+ warnings.warn(`
	`1199`	`+ "Header/names length != data length. "`
	`1200`	`+ "Extra fields dropped.",`
	`1201`	`+ ParserWarning,`
	`1202`	`+ stacklevel=find_stack_level(),`
	`1203`	`+ )`
	`1204`	`+ new_l = new_l[:col_len]`
`1197`	`1205`	`content.append(new_l) # pyright: ignore[reportArgumentType]`
`1198`	`1206`	`elif self.on_bad_lines in (`
`1199`	`1207`	`self.BadLineHandleMethod.ERROR,`
`1200`	`1208`	`self.BadLineHandleMethod.WARN,`
`1201`	`1209`	`):`
`1202`	`1210`	`row_num = self.pos - (content_len - i + footers)`
`1203`	`1211`	`bad_lines.append((row_num, actual_len))`
`1204`		`-`
`1205`	`1212`	`if self.on_bad_lines == self.BadLineHandleMethod.ERROR:`
`1206`	`1213`	`break`
	`1214`	`+ else:`
	`1215`	`+ content.append(_content)`
`1207`	`1216`	`else:`
`1208`	`1217`	`content.append(_content)`
`1209`	`1218`

`‎pandas/tests/io/parser/test_python_parser_only.py‎`

Lines changed: 41 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -562,3 +562,44 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp`
`562`	`562`	`expected = DataFrame(expected)`
`563`	`563`	`expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])`
`564`	`564`	`tm.assert_frame_equal(result, expected)`
	`565`	`+`
	`566`	`+`
	`567`	`+@pytest.mark.parametrize("index_col", [None, 0])`
	`568`	`+def test_on_bad_lines_callable_warns_and_truncates_with_index_col(`
	`569`	`+ python_parser_only, index_col`
	`570`	`+):`
	`571`	`+ """`
	`572`	`+ GH#61837 regression: callable on_bad_lines returning extra fields must emit a`
	`573`	`+ ParserWarning and drop extras regardless of index_col. [2][3]`
	`574`	`+ """`
	`575`	`+ parser = python_parser_only`
	`576`	`+ data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"`
	`577`	`+`
	`578`	`+ def fixer(bad_line):`
	`579`	`+ # Over-return to trigger truncation + warning`
	`580`	`+ return list(bad_line) + ["EXTRA1", "EXTRA2"]`
	`581`	`+`
	`582`	`+ # Assert ParserWarning is emitted using module helper`
	`583`	`+ df = parser.read_csv_check_warnings(`
	`584`	`+ ParserWarning,`
	`585`	`+ "Length of header or names",`
	`586`	`+ StringIO(data),`
	`587`	`+ on_bad_lines=fixer,`
	`588`	`+ index_col=index_col,`
	`589`	`+ )`
	`590`	`+`
	`591`	`+ if index_col is None:`
	`592`	`+ expected = DataFrame(`
	`593`	`+ {`
	`594`	`+ "id": [101, 102, 103],`
	`595`	`+ "field_1": ["A", "C", "F"],`
	`596`	`+ "field_2": ["B", "D", "G"],`
	`597`	`+ }`
	`598`	`+ )`
	`599`	`+ else:`
	`600`	`+ expected = DataFrame(`
	`601`	`+ {"field_1": ["A", "C", "F"], "field_2": ["B", "D", "G"]},`
	`602`	`+ index=Index([101, 102, 103], name="id"),`
	`603`	`+ )`
	`604`	`+`
	`605`	`+ tm.assert_frame_equal(df, expected)`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commit 7502a61

File tree

2 files changed

2 files changed

`‎pandas/io/parsers/python_parser.py‎`

`‎pandas/tests/io/parser/test_python_parser_only.py‎`

0 commit comments