From 362c1daaab56ce94e064d19b954057195a9e6ce9 Mon Sep 17 00:00:00 2001 From: skalwaghe-56 Date: Mon, 8 Sep 2025 21:43:40 +0530 Subject: [PATCH 1/2] BUG: read_csv(on_bad_lines=callable)+index_col should warn; add test - Always emit ParserWarning and drop extra fields when an on_bad_lines callable returns more elements than expected, regardless of index_col, in PythonParser._rows_to_cols. [GH#61837] - Ensure non-bad rows are appended in the outer else branch so good lines are preserved. - Add regression test pandas/tests/io/parser/test_python_parser_only.py::test_on_bad_lines_callable_warns_and_truncates_with_index_col covering index_col in [None, 0]. Closes #61837. --- pandas/io/parsers/python_parser.py | 13 +++++- .../io/parser/test_python_parser_only.py | 41 +++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 2b538f5e3cef4..5c5c9ef214d0b 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1189,11 +1189,19 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: for i, _content in iter_content: actual_len = len(_content) - if actual_len> col_len: if callable(self.on_bad_lines): new_l = self.on_bad_lines(_content) if new_l is not None: + # Truncate extra elements and warn. + if len(new_l)> col_len: + warnings.warn( + "Header/names length != data length. " + "Extra fields dropped.", + ParserWarning, + stacklevel=find_stack_level(), + ) + new_l = new_l[:col_len] content.append(new_l) # pyright: ignore[reportArgumentType] elif self.on_bad_lines in ( self.BadLineHandleMethod.ERROR, @@ -1201,9 +1209,10 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: ): row_num = self.pos - (content_len - i + footers) bad_lines.append((row_num, actual_len)) - if self.on_bad_lines == self.BadLineHandleMethod.ERROR: break + else: + content.append(_content) else: content.append(_content) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index a5bb151e84f47..941fa74114f8a 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -562,3 +562,44 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp expected = DataFrame(expected) expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"]) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", [None, 0]) +def test_on_bad_lines_callable_warns_and_truncates_with_index_col( + python_parser_only, index_col +): + """ + GH#61837 regression: callable on_bad_lines returning extra fields must emit a + ParserWarning and drop extras regardless of index_col. [2][3] + """ + parser = python_parser_only + data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n" + + def fixer(bad_line): + # Over-return to trigger truncation + warning + return list(bad_line) + ["EXTRA1", "EXTRA2"] + + # Assert ParserWarning is emitted using module helper + df = parser.read_csv_check_warnings( + ParserWarning, + "Length of header or names", + StringIO(data), + on_bad_lines=fixer, + index_col=index_col, + ) + + if index_col is None: + expected = DataFrame( + { + "id": [101, 102, 103], + "field_1": ["A", "C", "F"], + "field_2": ["B", "D", "G"], + } + ) + else: + expected = DataFrame( + {"field_1": ["A", "C", "F"], "field_2": ["B", "D", "G"]}, + index=Index([101, 102, 103], name="id"), + ) + + tm.assert_frame_equal(df, expected) From 1db6f884a048cb89e86a96c2720f30f17d7ad22c Mon Sep 17 00:00:00 2001 From: skalwaghe-56 Date: Mon, 8 Sep 2025 21:49:50 +0530 Subject: [PATCH 2/2] DOC: whatsnew entry for on_bad_lines regression fix (GH#61837) --- doc/source/whatsnew/v3.0.0.rst | 2 ++ pandas/io/parsers/python_parser.py | 29 +++++++++---------- .../io/parser/test_python_parser_only.py | 15 ++++------ 3 files changed, 21 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 99a6be03c84d3..bb0c7f9e7715a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1054,6 +1054,8 @@ MultiIndex I/O ^^^ - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`) +- Fix bug in ``on_bad_lines`` callable when returning too many fields: now emits + ``ParserWarning`` and truncates extra fields regardless of ``index_col`` (:issue:`61837`) - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`) - Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`) - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 5c5c9ef214d0b..1e3f4548621a0 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -21,6 +21,7 @@ import numpy as np from pandas._libs import lib +from pandas._typing import Scalar from pandas.errors import ( EmptyDataError, ParserError, @@ -77,7 +78,6 @@ ArrayLike, DtypeObj, ReadCsvBuffer, - Scalar, T, ) @@ -954,7 +954,9 @@ def _alert_malformed(self, msg: str, row_num: int) -> None: """ if self.on_bad_lines == self.BadLineHandleMethod.ERROR: raise ParserError(msg) - if self.on_bad_lines == self.BadLineHandleMethod.WARN: + if self.on_bad_lines == self.BadLineHandleMethod.WARN or callable( + self.on_bad_lines + ): warnings.warn( f"Skipping line {row_num}: {msg}\n", ParserWarning, @@ -1193,34 +1195,31 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: if callable(self.on_bad_lines): new_l = self.on_bad_lines(_content) if new_l is not None: - # Truncate extra elements and warn. + new_l = cast(list[Scalar], new_l) if len(new_l)> col_len: - warnings.warn( - "Header/names length != data length. " - "Extra fields dropped.", - ParserWarning, - stacklevel=find_stack_level(), - ) + row_num = self.pos - (content_len - i + footers) + bad_lines.append((row_num, len(new_l), "callable")) new_l = new_l[:col_len] - content.append(new_l) # pyright: ignore[reportArgumentType] + content.append(new_l) + elif self.on_bad_lines in ( self.BadLineHandleMethod.ERROR, self.BadLineHandleMethod.WARN, ): row_num = self.pos - (content_len - i + footers) - bad_lines.append((row_num, actual_len)) + bad_lines.append((row_num, actual_len, "normal")) if self.on_bad_lines == self.BadLineHandleMethod.ERROR: break - else: - content.append(_content) else: content.append(_content) - for row_num, actual_len in bad_lines: + for row_num, actual_len, source in bad_lines: msg = ( f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}" ) - if ( + if source == "callable": + msg += " from bad_lines callable" + elif ( self.delimiter and len(self.delimiter)> 1 and self.quoting != csv.QUOTE_NONE diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 941fa74114f8a..eed2403a88922 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -432,7 +432,7 @@ def test_on_bad_lines_callable_not_expected_length(python_parser_only): bad_sio = StringIO(data) result = parser.read_csv_check_warnings( - ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x + ParserWarning, "from bad_lines callable", bad_sio, on_bad_lines=lambda x: x ) expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) tm.assert_frame_equal(result, expected) @@ -568,21 +568,16 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp def test_on_bad_lines_callable_warns_and_truncates_with_index_col( python_parser_only, index_col ): - """ - GH#61837 regression: callable on_bad_lines returning extra fields must emit a - ParserWarning and drop extras regardless of index_col. [2][3] - """ + # GH#61837 parser = python_parser_only data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n" def fixer(bad_line): - # Over-return to trigger truncation + warning return list(bad_line) + ["EXTRA1", "EXTRA2"] - # Assert ParserWarning is emitted using module helper - df = parser.read_csv_check_warnings( + result = parser.read_csv_check_warnings( ParserWarning, - "Length of header or names", + "from bad_lines callable", StringIO(data), on_bad_lines=fixer, index_col=index_col, @@ -602,4 +597,4 @@ def fixer(bad_line): index=Index([101, 102, 103], name="id"), ) - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(result, expected)

AltStyle によって変換されたページ (->オリジナル) /