diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 99a6be03c84d3..bb0c7f9e7715a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1054,6 +1054,8 @@ MultiIndex I/O ^^^ - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`) +- Bug in :func:`read_csv` with ``engine="python"`` when an ``on_bad_lines`` callable returns too many fields: a + ``ParserWarning`` is now emitted and the extra fields are truncated regardless of ``index_col`` (:issue:`61837`) - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`) - Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`) - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 2b538f5e3cef4..1e3f4548621a0 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -21,6 +21,7 @@ import numpy as np from pandas._libs import lib +from pandas._typing import Scalar from pandas.errors import ( EmptyDataError, ParserError, @@ -77,7 +78,6 @@ ArrayLike, DtypeObj, ReadCsvBuffer, - Scalar, T, ) @@ -954,7 +954,9 @@ def _alert_malformed(self, msg: str, row_num: int) -> None: """ if self.on_bad_lines == self.BadLineHandleMethod.ERROR: raise ParserError(msg) - if self.on_bad_lines == self.BadLineHandleMethod.WARN: + if self.on_bad_lines == self.BadLineHandleMethod.WARN or callable( + self.on_bad_lines + ): warnings.warn( f"Skipping line {row_num}: {msg}\n", ParserWarning, @@ -1189,29 +1191,35 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: for i, _content in iter_content: actual_len = len(_content) - if actual_len> col_len: if callable(self.on_bad_lines): new_l = self.on_bad_lines(_content) if
new_l is not None: - content.append(new_l) # pyright: ignore[reportArgumentType] + new_l = cast(list[Scalar], new_l) + if len(new_l)> col_len: + row_num = self.pos - (content_len - i + footers) + bad_lines.append((row_num, len(new_l), "callable")) + new_l = new_l[:col_len] + content.append(new_l) + elif self.on_bad_lines in ( self.BadLineHandleMethod.ERROR, self.BadLineHandleMethod.WARN, ): row_num = self.pos - (content_len - i + footers) - bad_lines.append((row_num, actual_len)) - + bad_lines.append((row_num, actual_len, "normal")) if self.on_bad_lines == self.BadLineHandleMethod.ERROR: break else: content.append(_content) - for row_num, actual_len in bad_lines: + for row_num, actual_len, source in bad_lines: msg = ( f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}" ) - if ( + if source == "callable": + msg += " from bad_lines callable" + elif ( self.delimiter and len(self.delimiter)> 1 and self.quoting != csv.QUOTE_NONE diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index a5bb151e84f47..eed2403a88922 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -432,7 +432,7 @@ def test_on_bad_lines_callable_not_expected_length(python_parser_only): bad_sio = StringIO(data) result = parser.read_csv_check_warnings( - ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x + ParserWarning, "from bad_lines callable", bad_sio, on_bad_lines=lambda x: x ) expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) tm.assert_frame_equal(result, expected) @@ -562,3 +562,39 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp expected = DataFrame(expected) expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"]) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", [None, 0]) +def 
test_on_bad_lines_callable_warns_and_truncates_with_index_col( + python_parser_only, index_col +): + # GH#61837 + parser = python_parser_only + data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n" + + def fixer(bad_line): + return list(bad_line) + ["EXTRA1", "EXTRA2"] + + result = parser.read_csv_check_warnings( + ParserWarning, + "from bad_lines callable", + StringIO(data), + on_bad_lines=fixer, + index_col=index_col, + ) + + if index_col is None: + expected = DataFrame( + { + "id": [101, 102, 103], + "field_1": ["A", "C", "F"], + "field_2": ["B", "D", "G"], + } + ) + else: + expected = DataFrame( + {"field_1": ["A", "C", "F"], "field_2": ["B", "D", "G"]}, + index=Index([101, 102, 103], name="id"), + ) + + tm.assert_frame_equal(result, expected)

AltStyle によって変換されたページ (->オリジナル) /