Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 21130b1

Browse files
BUG: make read_csv read large integers (>64bits) as python integers with C engine (#62582)
1 parent 2560788 commit 21130b1

File tree

3 files changed

+72
-14
lines changed

3 files changed

+72
-14
lines changed

‎doc/source/whatsnew/v3.0.0.rst‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1079,6 +1079,7 @@ I/O
10791079
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
10801080
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
10811081
- Bug in :meth:`read_csv` where the order of the ``na_values`` causes inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
1082+
- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers (too large to fit in 64 bits) as strings. They are now read as Python integers. (:issue:`51295`)
10821083
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
10831084
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
10841085
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)

‎pandas/_libs/parsers.pyx‎

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ from cpython.exc cimport (
2929
PyErr_Fetch,
3030
PyErr_Occurred,
3131
)
32+
from cpython.long cimport PyLong_FromString
3233
from cpython.object cimport PyObject
3334
from cpython.ref cimport (
3435
Py_INCREF,
@@ -1081,9 +1082,13 @@ cdef class TextReader:
10811082
np.dtype("object"), i, start, end, 0,
10821083
0, na_hashset, na_fset)
10831084
except OverflowError:
1084-
col_res, na_count = self._convert_with_dtype(
1085-
np.dtype("object"), i, start, end, na_filter,
1086-
0, na_hashset, na_fset)
1085+
try:
1086+
col_res, na_count = _try_pylong(self.parser, i, start,
1087+
end, na_filter, na_hashset)
1088+
except ValueError:
1089+
col_res, na_count = self._convert_with_dtype(
1090+
np.dtype("object"), i, start, end, 0,
1091+
0, na_hashset, na_fset)
10871092

10881093
if col_res is not None:
10891094
break
@@ -1873,6 +1878,36 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,
18731878

18741879
return 0
18751880

1881+
cdef _try_pylong(parser_t *parser, Py_ssize_t col,
1882+
int64_t line_start, int64_t line_end,
1883+
bint na_filter, kh_str_starts_t *na_hashset):
1884+
cdef:
1885+
int na_count = 0
1886+
Py_ssize_t lines
1887+
coliter_t it
1888+
const char *word = NULL
1889+
ndarray[object] result
1890+
object NA = na_values[np.object_]
1891+
1892+
lines = line_end - line_start
1893+
result = np.empty(lines, dtype=object)
1894+
coliter_setup(&it, parser, col, line_start)
1895+
1896+
for i in range(lines):
1897+
COLITER_NEXT(it, word)
1898+
if na_filter and kh_get_str_starts_item(na_hashset, word):
1899+
# in the hash table
1900+
na_count += 1
1901+
result[i] = NA
1902+
continue
1903+
1904+
py_int = PyLong_FromString(word, NULL, 10)
1905+
if py_int is None:
1906+
raise ValueError("Invalid integer ", word)
1907+
result[i] = py_int
1908+
1909+
return result, na_count
1910+
18761911

18771912
# -> tuple[ndarray[bool], int]
18781913
cdef _try_bool_flex(parser_t *parser, int64_t col,

‎pandas/tests/io/parser/common/test_ints.py‎

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -144,17 +144,22 @@ def test_int64_overflow(all_parsers, conv, request):
144144
if parser.engine == "pyarrow":
145145
mark = pytest.mark.xfail(reason="parses to float64")
146146
request.applymarker(mark)
147+
elif parser.engine == "python":
148+
mark = pytest.mark.xfail(
149+
reason="TODO: Python engine reads bigint as string"
150+
)
151+
request.applymarker(mark)
147152

148153
result = parser.read_csv(StringIO(data))
149154
expected = DataFrame(
150155
[
151-
"00013007854817840016671868",
152-
"00013007854817840016749251",
153-
"00013007854817840016754630",
154-
"00013007854817840016781876",
155-
"00013007854817840017028824",
156-
"00013007854817840017963235",
157-
"00013007854817840018860166",
156+
13007854817840016671868,
157+
13007854817840016749251,
158+
13007854817840016754630,
159+
13007854817840016781876,
160+
13007854817840017028824,
161+
13007854817840017963235,
162+
13007854817840018860166,
158163
],
159164
columns=["ID"],
160165
)
@@ -185,7 +190,7 @@ def test_int64_overflow(all_parsers, conv, request):
185190
)
186191
def test_int64_uint64_range(all_parsers, val):
187192
# These numbers fall right inside the int64-uint64
188-
# range, so they should be parsed as string.
193+
# range, so they should be parsed as integer.
189194
parser = all_parsers
190195
result = parser.read_csv(StringIO(str(val)), header=None)
191196

@@ -197,13 +202,30 @@ def test_int64_uint64_range(all_parsers, val):
197202
@pytest.mark.parametrize(
198203
"val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
199204
)
200-
def test_outside_int64_uint64_range(all_parsers, val):
205+
def test_outside_int64_uint64_range(all_parsers, val, request):
201206
# These numbers fall just outside the int64-uint64
202-
# range, so they should be parsed as string.
207+
# range, so they should be parsed as object.
203208
parser = all_parsers
209+
if parser.engine == "python":
210+
mark = pytest.mark.xfail(reason="TODO: Python engine reads bigint as string")
211+
request.applymarker(mark)
212+
204213
result = parser.read_csv(StringIO(str(val)), header=None)
205214

206-
expected = DataFrame([str(val)])
215+
expected = DataFrame([val])
216+
tm.assert_frame_equal(result, expected)
217+
218+
219+
@skip_pyarrow # CSV parse error: Empty CSV file or block
220+
@pytest.mark.parametrize(
221+
"val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
222+
)
223+
def test_outside_int64_uint64_range_follow_str(all_parsers, val):
224+
parser = all_parsers
225+
226+
result = parser.read_csv(StringIO(f"{val}\nabc"), header=None)
227+
228+
expected = DataFrame([str(val), "abc"])
207229
tm.assert_frame_equal(result, expected)
208230

209231

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /