Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 986b4e5

Browse files
BUG: always warn when on_bad_lines callable returns extra fields with index_col in read_csv (Python engine) (GH#61837) (#62297)
1 parent 3c1d868 commit 986b4e5

File tree

3 files changed

+55
-9
lines changed

3 files changed

+55
-9
lines changed

‎doc/source/whatsnew/v3.0.0.rst‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,6 +1054,8 @@ MultiIndex
10541054
I/O
10551055
^^^
10561056
- Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
1057+
- Bug in :func:`read_csv` with ``engine="python"`` where an ``on_bad_lines`` callable returning too many fields was handled differently depending on ``index_col``; it now always emits
1058+
a ``ParserWarning`` and truncates the extra fields (:issue:`61837`)
10571059
- Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
10581060
- Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`)
10591061
- Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)

‎pandas/io/parsers/python_parser.py‎

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import numpy as np
2222

2323
from pandas._libs import lib
24+
from pandas._typing import Scalar
2425
from pandas.errors import (
2526
EmptyDataError,
2627
ParserError,
@@ -77,7 +78,6 @@
7778
ArrayLike,
7879
DtypeObj,
7980
ReadCsvBuffer,
80-
Scalar,
8181
T,
8282
)
8383

@@ -954,7 +954,9 @@ def _alert_malformed(self, msg: str, row_num: int) -> None:
954954
"""
955955
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
956956
raise ParserError(msg)
957-
if self.on_bad_lines == self.BadLineHandleMethod.WARN:
957+
if self.on_bad_lines == self.BadLineHandleMethod.WARN or callable(
958+
self.on_bad_lines
959+
):
958960
warnings.warn(
959961
f"Skipping line {row_num}: {msg}\n",
960962
ParserWarning,
@@ -1189,29 +1191,35 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
11891191

11901192
for i, _content in iter_content:
11911193
actual_len = len(_content)
1192-
11931194
if actual_len > col_len:
11941195
if callable(self.on_bad_lines):
11951196
new_l = self.on_bad_lines(_content)
11961197
if new_l is not None:
1197-
content.append(new_l) # pyright: ignore[reportArgumentType]
1198+
new_l = cast(list[Scalar], new_l)
1199+
if len(new_l) > col_len:
1200+
row_num = self.pos - (content_len - i + footers)
1201+
bad_lines.append((row_num, len(new_l), "callable"))
1202+
new_l = new_l[:col_len]
1203+
content.append(new_l)
1204+
11981205
elif self.on_bad_lines in (
11991206
self.BadLineHandleMethod.ERROR,
12001207
self.BadLineHandleMethod.WARN,
12011208
):
12021209
row_num = self.pos - (content_len - i + footers)
1203-
bad_lines.append((row_num, actual_len))
1204-
1210+
bad_lines.append((row_num, actual_len, "normal"))
12051211
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
12061212
break
12071213
else:
12081214
content.append(_content)
12091215

1210-
for row_num, actual_len in bad_lines:
1216+
for row_num, actual_len, source in bad_lines:
12111217
msg = (
12121218
f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}"
12131219
)
1214-
if (
1220+
if source == "callable":
1221+
msg += " from bad_lines callable"
1222+
elif (
12151223
self.delimiter
12161224
and len(self.delimiter) > 1
12171225
and self.quoting != csv.QUOTE_NONE

‎pandas/tests/io/parser/test_python_parser_only.py‎

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,7 @@ def test_on_bad_lines_callable_not_expected_length(python_parser_only):
432432
bad_sio = StringIO(data)
433433

434434
result = parser.read_csv_check_warnings(
435-
ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
435+
ParserWarning, "from bad_lines callable", bad_sio, on_bad_lines=lambda x: x
436436
)
437437
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
438438
tm.assert_frame_equal(result, expected)
@@ -562,3 +562,39 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp
562562
expected = DataFrame(expected)
563563
expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
564564
tm.assert_frame_equal(result, expected)
565+
566+
567+
@pytest.mark.parametrize("index_col", [None, 0])
568+
def test_on_bad_lines_callable_warns_and_truncates_with_index_col(
569+
python_parser_only, index_col
570+
):
571+
# GH#61837
572+
parser = python_parser_only
573+
data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"
574+
575+
def fixer(bad_line):
576+
return list(bad_line) + ["EXTRA1", "EXTRA2"]
577+
578+
result = parser.read_csv_check_warnings(
579+
ParserWarning,
580+
"from bad_lines callable",
581+
StringIO(data),
582+
on_bad_lines=fixer,
583+
index_col=index_col,
584+
)
585+
586+
if index_col is None:
587+
expected = DataFrame(
588+
{
589+
"id": [101, 102, 103],
590+
"field_1": ["A", "C", "F"],
591+
"field_2": ["B", "D", "G"],
592+
}
593+
)
594+
else:
595+
expected = DataFrame(
596+
{"field_1": ["A", "C", "F"], "field_2": ["B", "D", "G"]},
597+
index=Index([101, 102, 103], name="id"),
598+
)
599+
600+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /