Commit 21130b1

authored

BUG: make read_csv read large integers (>64bits) as python integers with C engine (#62582)

1 parent 2560788 · commit 21130b1 (copy full SHA for 21130b1)

File tree

3 files changed

+72

-14

lines changed

doc/source/whatsnew
- v3.0.0.rst
pandas
- _libs
  - parsers.pyx
- tests/io/parser/common
  - test_ints.py

3 files changed

+72

-14

lines changed

`‎doc/source/whatsnew/v3.0.0.rst‎`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -1079,6 +1079,7 @@ I/O`
`1079`	`1079`	- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
`1080`	`1080`	- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
`1081`	`1081`	- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
	`1082`	+- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as Python integers. (:issue:`51295`)
`1082`	`1083`	- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
`1083`	`1084`	- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
`1084`	`1085`	- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)

`‎pandas/_libs/parsers.pyx‎`

Lines changed: 38 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,7 @@ from cpython.exc cimport (`
`29`	`29`	`PyErr_Fetch,`
`30`	`30`	`PyErr_Occurred,`
`31`	`31`	`)`
	`32`	`+from cpython.long cimport PyLong_FromString`
`32`	`33`	`from cpython.object cimport PyObject`
`33`	`34`	`from cpython.ref cimport (`
`34`	`35`	`Py_INCREF,`
`@@ -1081,9 +1082,13 @@ cdef class TextReader:`
`1081`	`1082`	`np.dtype("object"), i, start, end, 0,`
`1082`	`1083`	`0, na_hashset, na_fset)`
`1083`	`1084`	`except OverflowError:`
`1084`		`- col_res, na_count = self._convert_with_dtype(`
`1085`		`- np.dtype("object"), i, start, end, na_filter,`
`1086`		`- 0, na_hashset, na_fset)`
	`1085`	`+ try:`
	`1086`	`+ col_res, na_count = _try_pylong(self.parser, i, start,`
	`1087`	`+ end, na_filter, na_hashset)`
	`1088`	`+ except ValueError:`
	`1089`	`+ col_res, na_count = self._convert_with_dtype(`
	`1090`	`+ np.dtype("object"), i, start, end, 0,`
	`1091`	`+ 0, na_hashset, na_fset)`
`1087`	`1092`
`1088`	`1093`	`if col_res is not None:`
`1089`	`1094`	`break`
`@@ -1873,6 +1878,36 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,`
`1873`	`1878`
`1874`	`1879`	`return 0`
`1875`	`1880`
	`1881`	`+cdef _try_pylong(parser_t *parser, Py_ssize_t col,`
	`1882`	`+ int64_t line_start, int64_t line_end,`
	`1883`	`+ bint na_filter, kh_str_starts_t *na_hashset):`
	`1884`	`+ cdef:`
	`1885`	`+ int na_count = 0`
	`1886`	`+ Py_ssize_t lines`
	`1887`	`+ coliter_t it`
	`1888`	`+ const char *word = NULL`
	`1889`	`+ ndarray[object] result`
	`1890`	`+ object NA = na_values[np.object_]`
	`1891`	`+`
	`1892`	`+ lines = line_end - line_start`
	`1893`	`+ result = np.empty(lines, dtype=object)`
	`1894`	`+ coliter_setup(&it, parser, col, line_start)`
	`1895`	`+`
	`1896`	`+ for i in range(lines):`
	`1897`	`+ COLITER_NEXT(it, word)`
	`1898`	`+ if na_filter and kh_get_str_starts_item(na_hashset, word):`
	`1899`	`+ # in the hash table`
	`1900`	`+ na_count += 1`
	`1901`	`+ result[i] = NA`
	`1902`	`+ continue`
	`1903`	`+`
	`1904`	`+ py_int = PyLong_FromString(word, NULL, 10)`
	`1905`	`+ if py_int is None:`
	`1906`	`+ raise ValueError("Invalid integer ", word)`
	`1907`	`+ result[i] = py_int`
	`1908`	`+`
	`1909`	`+ return result, na_count`
	`1910`	`+`
`1876`	`1911`
`1877`	`1912`	`# -> tuple[ndarray[bool], int]`
`1878`	`1913`	`cdef _try_bool_flex(parser_t *parser, int64_t col,`

`‎pandas/tests/io/parser/common/test_ints.py‎`

Lines changed: 33 additions & 11 deletions

Original file line number	Diff line number	Diff line change
`@@ -144,17 +144,22 @@ def test_int64_overflow(all_parsers, conv, request):`
`144`	`144`	`if parser.engine == "pyarrow":`
`145`	`145`	`mark = pytest.mark.xfail(reason="parses to float64")`
`146`	`146`	`request.applymarker(mark)`
	`147`	`+ elif parser.engine == "python":`
	`148`	`+ mark = pytest.mark.xfail(`
	`149`	`+ reason="TODO: Python engine reads bigint as string"`
	`150`	`+ )`
	`151`	`+ request.applymarker(mark)`
`147`	`152`
`148`	`153`	`result = parser.read_csv(StringIO(data))`
`149`	`154`	`expected = DataFrame(`
`150`	`155`	`[`
`151`		`- "00013007854817840016671868",`
`152`		`- "00013007854817840016749251",`
`153`		`- "00013007854817840016754630",`
`154`		`- "00013007854817840016781876",`
`155`		`- "00013007854817840017028824",`
`156`		`- "00013007854817840017963235",`
`157`		`- "00013007854817840018860166",`
	`156`	`+ 13007854817840016671868,`
	`157`	`+ 13007854817840016749251,`
	`158`	`+ 13007854817840016754630,`
	`159`	`+ 13007854817840016781876,`
	`160`	`+ 13007854817840017028824,`
	`161`	`+ 13007854817840017963235,`
	`162`	`+ 13007854817840018860166,`
`158`	`163`	`],`
`159`	`164`	`columns=["ID"],`
`160`	`165`	`)`
`@@ -185,7 +190,7 @@ def test_int64_overflow(all_parsers, conv, request):`
`185`	`190`	`)`
`186`	`191`	`def test_int64_uint64_range(all_parsers, val):`
`187`	`192`	`# These numbers fall right inside the int64-uint64`
`188`		`- # range, so they should be parsed as string.`
	`193`	`+ # range, so they should be parsed as integer.`
`189`	`194`	`parser = all_parsers`
`190`	`195`	`result = parser.read_csv(StringIO(str(val)), header=None)`
`191`	`196`
`@@ -197,13 +202,30 @@ def test_int64_uint64_range(all_parsers, val):`
`197`	`202`	`@pytest.mark.parametrize(`
`198`	`203`	`"val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]`
`199`	`204`	`)`
`200`		`-def test_outside_int64_uint64_range(all_parsers, val):`
	`205`	`+def test_outside_int64_uint64_range(all_parsers, val, request):`
`201`	`206`	`# These numbers fall just outside the int64-uint64`
`202`		`- # range, so they should be parsed as string.`
	`207`	`+ # range, so they should be parsed as object.`
`203`	`208`	`parser = all_parsers`
	`209`	`+ if parser.engine == "python":`
	`210`	`+ mark = pytest.mark.xfail(reason="TODO: Python engine reads bigint as string")`
	`211`	`+ request.applymarker(mark)`
	`212`	`+`
`204`	`213`	`result = parser.read_csv(StringIO(str(val)), header=None)`
`205`	`214`
`206`		`- expected = DataFrame([str(val)])`
	`215`	`+ expected = DataFrame([val])`
	`216`	`+ tm.assert_frame_equal(result, expected)`
	`217`	`+`
	`218`	`+`
	`219`	`+@skip_pyarrow # CSV parse error: Empty CSV file or block`
	`220`	`+@pytest.mark.parametrize(`
	`221`	`+ "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]`
	`222`	`+)`
	`223`	`+def test_outside_int64_uint64_range_follow_str(all_parsers, val):`
	`224`	`+ parser = all_parsers`
	`225`	`+`
	`226`	`+ result = parser.read_csv(StringIO(f"{val}\nabc"), header=None)`
	`227`	`+`
	`228`	`+ expected = DataFrame([str(val), "abc"])`
`207`	`229`	`tm.assert_frame_equal(result, expected)`
`208`	`230`
`209`	`231`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commit 21130b1

File tree

3 files changed

3 files changed

`‎doc/source/whatsnew/v3.0.0.rst‎`

`‎pandas/_libs/parsers.pyx‎`

`‎pandas/tests/io/parser/common/test_ints.py‎`

0 commit comments