Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 014e05f

Browse files
DOC: whatsnew entry for on_bad_lines regression fix (GH#61837)
1 parent 7502a61 commit 014e05f

File tree

3 files changed

+22
-23
lines changed

3 files changed

+22
-23
lines changed

‎doc/source/whatsnew/v2.3.3.rst‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ Bug fixes
2525
- Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with the Arrow-backed dtype would raise an error (:issue:`57636`)
2626
- Fix regression in :meth:`~Series.str.contains`, :meth:`~Series.str.match` and :meth:`~Series.str.fullmatch`
2727
with a compiled regex and custom flags (:issue:`62240`)
28+
- Fix regression in :func:`read_csv` with the Python engine when an ``on_bad_lines`` callable returns too many fields: now emits
29+
``ParserWarning`` and truncates extra fields regardless of ``index_col`` (:issue:`61837`)
2830

2931
.. ---------------------------------------------------------------------------
3032
.. _whatsnew_233.contributors:

‎pandas/io/parsers/python_parser.py‎

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import numpy as np
2222

2323
from pandas._libs import lib
24+
from pandas._typing import Scalar
2425
from pandas.errors import (
2526
EmptyDataError,
2627
ParserError,
@@ -77,7 +78,6 @@
7778
ArrayLike,
7879
DtypeObj,
7980
ReadCsvBuffer,
80-
Scalar,
8181
T,
8282
)
8383

@@ -954,7 +954,9 @@ def _alert_malformed(self, msg: str, row_num: int) -> None:
954954
"""
955955
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
956956
raise ParserError(msg)
957-
if self.on_bad_lines == self.BadLineHandleMethod.WARN:
957+
if self.on_bad_lines == self.BadLineHandleMethod.WARN or callable(
958+
self.on_bad_lines
959+
):
958960
warnings.warn(
959961
f"Skipping line {row_num}: {msg}\n",
960962
ParserWarning,
@@ -1193,34 +1195,34 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
11931195
if callable(self.on_bad_lines):
11941196
new_l = self.on_bad_lines(_content)
11951197
if new_l is not None:
1196-
# Truncate extra elements and warn.
1198+
new_l=cast(list[Scalar], new_l)
11971199
if len(new_l) > col_len:
1198-
warnings.warn(
1199-
"Header/names length != data length. "
1200-
"Extra fields dropped.",
1201-
ParserWarning,
1202-
stacklevel=find_stack_level(),
1203-
)
1200+
row_num = self.pos - (content_len - i + footers)
1201+
bad_lines.append((row_num, len(new_l), "callable"))
12041202
new_l = new_l[:col_len]
1205-
content.append(new_l) # pyright: ignore[reportArgumentType]
1203+
content.append(new_l)
1204+
12061205
elif self.on_bad_lines in (
12071206
self.BadLineHandleMethod.ERROR,
12081207
self.BadLineHandleMethod.WARN,
1208+
self.BadLineHandleMethod.SKIP,
12091209
):
12101210
row_num = self.pos - (content_len - i + footers)
1211-
bad_lines.append((row_num, actual_len))
1211+
bad_lines.append((row_num, actual_len, "normal"))
12121212
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
12131213
break
12141214
else:
12151215
content.append(_content)
12161216
else:
12171217
content.append(_content)
12181218

1219-
for row_num, actual_len in bad_lines:
1219+
for row_num, actual_len, source in bad_lines:
12201220
msg = (
12211221
f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}"
12221222
)
1223-
if (
1223+
if source == "callable":
1224+
msg += " from bad_lines callable"
1225+
elif (
12241226
self.delimiter
12251227
and len(self.delimiter) > 1
12261228
and self.quoting != csv.QUOTE_NONE

‎pandas/tests/io/parser/test_python_parser_only.py‎

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,7 @@ def test_on_bad_lines_callable_not_expected_length(python_parser_only):
432432
bad_sio = StringIO(data)
433433

434434
result = parser.read_csv_check_warnings(
435-
ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
435+
ParserWarning, "from bad_lines callable", bad_sio, on_bad_lines=lambda x: x
436436
)
437437
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
438438
tm.assert_frame_equal(result, expected)
@@ -568,21 +568,16 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp
568568
def test_on_bad_lines_callable_warns_and_truncates_with_index_col(
569569
python_parser_only, index_col
570570
):
571-
"""
572-
GH#61837 regression: callable on_bad_lines returning extra fields must emit a
573-
ParserWarning and drop extras regardless of index_col. [2][3]
574-
"""
571+
# GH#61837
575572
parser = python_parser_only
576573
data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"
577574

578575
def fixer(bad_line):
579-
# Over-return to trigger truncation + warning
580576
return list(bad_line) + ["EXTRA1", "EXTRA2"]
581577

582-
# Assert ParserWarning is emitted using module helper
583-
df = parser.read_csv_check_warnings(
578+
result = parser.read_csv_check_warnings(
584579
ParserWarning,
585-
"Length of header or names",
580+
"from bad_lines callable",
586581
StringIO(data),
587582
on_bad_lines=fixer,
588583
index_col=index_col,
@@ -602,4 +597,4 @@ def fixer(bad_line):
602597
index=Index([101, 102, 103], name="id"),
603598
)
604599

605-
tm.assert_frame_equal(df, expected)
600+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /