From 362c1daaab56ce94e064d19b954057195a9e6ce9 Mon Sep 17 00:00:00 2001
From: skalwaghe-56 <skalwaghe56@gmail.com>
Date: Mon, 8 Sep 2025 21:43:40 +0530
Subject: [PATCH 1/2] BUG: read_csv(on_bad_lines=callable)+index_col should
 warn; add test
- Always emit ParserWarning and drop extra fields when an on_bad_lines
 callable returns more elements than expected, regardless of index_col,
 in PythonParser._rows_to_cols. [GH#61837]
- Ensure non-bad rows are appended in the outer else branch so good lines
 are preserved.
- Add regression test
 pandas/tests/io/parser/test_python_parser_only.py::test_on_bad_lines_callable_warns_and_truncates_with_index_col
 covering index_col in [None, 0].
Closes #61837.
---
 pandas/io/parsers/python_parser.py | 13 +++++-
 .../io/parser/test_python_parser_only.py | 41 +++++++++++++++++++
 2 files changed, 52 insertions(+), 2 deletions(-)
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 2b538f5e3cef4..5c5c9ef214d0b 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -1189,11 +1189,19 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
 
 for i, _content in iter_content:
 actual_len = len(_content)
-
 if actual_len> col_len:
 if callable(self.on_bad_lines):
 new_l = self.on_bad_lines(_content)
 if new_l is not None:
+ # Truncate extra elements and warn.
+ if len(new_l)> col_len:
+ warnings.warn(
+ "Header/names length != data length. "
+ "Extra fields dropped.",
+ ParserWarning,
+ stacklevel=find_stack_level(),
+ )
+ new_l = new_l[:col_len]
 content.append(new_l) # pyright: ignore[reportArgumentType]
 elif self.on_bad_lines in (
 self.BadLineHandleMethod.ERROR,
@@ -1201,9 +1209,10 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
 ):
 row_num = self.pos - (content_len - i + footers)
 bad_lines.append((row_num, actual_len))
-
 if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
 break
+ else:
+ content.append(_content)
 else:
 content.append(_content)
 
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index a5bb151e84f47..941fa74114f8a 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -562,3 +562,44 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp
 expected = DataFrame(expected)
 expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
 tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("index_col", [None, 0])
+def test_on_bad_lines_callable_warns_and_truncates_with_index_col(
+ python_parser_only, index_col
+):
+ """
+ GH#61837 regression: callable on_bad_lines returning extra fields must emit a
+ ParserWarning and drop extras regardless of index_col. [2][3]
+ """
+ parser = python_parser_only
+ data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"
+
+ def fixer(bad_line):
+ # Over-return to trigger truncation + warning
+ return list(bad_line) + ["EXTRA1", "EXTRA2"]
+
+ # Assert ParserWarning is emitted using module helper
+ df = parser.read_csv_check_warnings(
+ ParserWarning,
+ "Length of header or names",
+ StringIO(data),
+ on_bad_lines=fixer,
+ index_col=index_col,
+ )
+
+ if index_col is None:
+ expected = DataFrame(
+ {
+ "id": [101, 102, 103],
+ "field_1": ["A", "C", "F"],
+ "field_2": ["B", "D", "G"],
+ }
+ )
+ else:
+ expected = DataFrame(
+ {"field_1": ["A", "C", "F"], "field_2": ["B", "D", "G"]},
+ index=Index([101, 102, 103], name="id"),
+ )
+
+ tm.assert_frame_equal(df, expected)
From 1db6f884a048cb89e86a96c2720f30f17d7ad22c Mon Sep 17 00:00:00 2001
From: skalwaghe-56 <skalwaghe56@gmail.com>
Date: Mon, 8 Sep 2025 21:49:50 +0530
Subject: [PATCH 2/2] DOC: whatsnew entry for on_bad_lines regression fix
 (GH#61837)
---
 doc/source/whatsnew/v3.0.0.rst | 2 ++
 pandas/io/parsers/python_parser.py | 29 +++++++++----------
 .../io/parser/test_python_parser_only.py | 15 ++++------
 3 files changed, 21 insertions(+), 25 deletions(-)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 99a6be03c84d3..bb0c7f9e7715a 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -1054,6 +1054,8 @@ MultiIndex
 I/O
 ^^^
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
+- Bug in :func:`read_csv` when an ``on_bad_lines`` callable returns too many fields: a
+ ``ParserWarning`` is now emitted and the extra fields are dropped regardless of ``index_col`` (:issue:`61837`)
 - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
 - Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`)
 - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 5c5c9ef214d0b..1e3f4548621a0 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -21,6 +21,7 @@
 import numpy as np
 
 from pandas._libs import lib
+from pandas._typing import Scalar
 from pandas.errors import (
 EmptyDataError,
 ParserError,
@@ -77,7 +78,6 @@
 ArrayLike,
 DtypeObj,
 ReadCsvBuffer,
- Scalar,
 T,
 )
 
@@ -954,7 +954,9 @@ def _alert_malformed(self, msg: str, row_num: int) -> None:
 """
 if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
 raise ParserError(msg)
- if self.on_bad_lines == self.BadLineHandleMethod.WARN:
+ if self.on_bad_lines == self.BadLineHandleMethod.WARN or callable(
+ self.on_bad_lines
+ ):
 warnings.warn(
 f"Skipping line {row_num}: {msg}\n",
 ParserWarning,
@@ -1193,34 +1195,31 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
 if callable(self.on_bad_lines):
 new_l = self.on_bad_lines(_content)
 if new_l is not None:
- # Truncate extra elements and warn.
+ new_l = cast(list[Scalar], new_l)
 if len(new_l)> col_len:
- warnings.warn(
- "Header/names length != data length. "
- "Extra fields dropped.",
- ParserWarning,
- stacklevel=find_stack_level(),
- )
+ row_num = self.pos - (content_len - i + footers)
+ bad_lines.append((row_num, len(new_l), "callable"))
 new_l = new_l[:col_len]
- content.append(new_l) # pyright: ignore[reportArgumentType]
+ content.append(new_l)
+
 elif self.on_bad_lines in (
 self.BadLineHandleMethod.ERROR,
 self.BadLineHandleMethod.WARN,
 ):
 row_num = self.pos - (content_len - i + footers)
- bad_lines.append((row_num, actual_len))
+ bad_lines.append((row_num, actual_len, "normal"))
 if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
 break
- else:
- content.append(_content)
 else:
 content.append(_content)
 
- for row_num, actual_len in bad_lines:
+ for row_num, actual_len, source in bad_lines:
 msg = (
 f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}"
 )
- if (
+ if source == "callable":
+ msg += " from bad_lines callable"
+ elif (
 self.delimiter
 and len(self.delimiter)> 1
 and self.quoting != csv.QUOTE_NONE
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index 941fa74114f8a..eed2403a88922 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -432,7 +432,7 @@ def test_on_bad_lines_callable_not_expected_length(python_parser_only):
 bad_sio = StringIO(data)
 
 result = parser.read_csv_check_warnings(
- ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
+ ParserWarning, "from bad_lines callable", bad_sio, on_bad_lines=lambda x: x
 )
 expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
 tm.assert_frame_equal(result, expected)
@@ -568,21 +568,16 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp
 def test_on_bad_lines_callable_warns_and_truncates_with_index_col(
 python_parser_only, index_col
 ):
- """
- GH#61837 regression: callable on_bad_lines returning extra fields must emit a
- ParserWarning and drop extras regardless of index_col. [2][3]
- """
+ # GH#61837
 parser = python_parser_only
 data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"
 
 def fixer(bad_line):
- # Over-return to trigger truncation + warning
 return list(bad_line) + ["EXTRA1", "EXTRA2"]
 
- # Assert ParserWarning is emitted using module helper
- df = parser.read_csv_check_warnings(
+ result = parser.read_csv_check_warnings(
 ParserWarning,
- "Length of header or names",
+ "from bad_lines callable",
 StringIO(data),
 on_bad_lines=fixer,
 index_col=index_col,
@@ -602,4 +597,4 @@ def fixer(bad_line):
 index=Index([101, 102, 103], name="id"),
 )
 
- tm.assert_frame_equal(df, expected)
+ tm.assert_frame_equal(result, expected)
</div><div class="naked_ctrl">
<form action="/index.cgi/contrast" method="get" name="gate">
<p><a href="http://altstyle.alfasado.net">AltStyle</a> によって変換されたページ <a href="https://patch-diff.githubusercontent.com/raw/pandas-dev/pandas/pull/62297.patch">(-&gt;オリジナル)</a>
/ <label>アドレス: <input type="text" name="naked_post_url" value="https://patch-diff.githubusercontent.com/raw/pandas-dev/pandas/pull/62297.patch" size="22" /></label> <label>モード: <select name="naked_post_mode">
<option value="default">デフォルト</option>
<option value="speech">音声ブラウザ</option>
<option value="ruby">ルビ付き</option>
<option value="contrast" selected="selected">配色反転</option>
<option value="larger-text">文字拡大</option>
<option value="mobile">モバイル</option>
</select>
<input type="submit" value="表示" />
</p>
</form>
</div>