Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit f14f131

Browse files
BUG: fix .str.isdigit to honor unicode superscript for older pyarrow (#61962)
1 parent 0f4222e commit f14f131

File tree

4 files changed

+47
-9
lines changed

4 files changed

+47
-9
lines changed

‎doc/source/whatsnew/v2.3.2.rst‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ become the default string dtype in pandas 3.0. See
2222

2323
Bug fixes
2424
^^^^^^^^^
25+
- Fix :meth:`~Series.str.isdigit` to correctly recognize unicode superscript
26+
characters as digits for :class:`StringDtype` backed by PyArrow (:issue:`61466`)
2527
- Fix :meth:`~DataFrame.to_json` with ``orient="table"`` to correctly use the
2628
"string" type in the JSON Table Schema for :class:`StringDtype` columns
2729
(:issue:`61889`)

‎pandas/core/arrays/_arrow_string_mixins.py‎

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from pandas.compat import (
1616
HAS_PYARROW,
1717
pa_version_under17p0,
18+
pa_version_under21p0,
1819
)
1920

2021
if HAS_PYARROW:
@@ -267,6 +268,12 @@ def _str_isdecimal(self):
267268
return self._convert_bool_result(result)
268269

269270
def _str_isdigit(self):
271+
if pa_version_under21p0:
272+
# https://github.com/pandas-dev/pandas/issues/61466
273+
res_list = self._apply_elementwise(str.isdigit)
274+
return self._convert_bool_result(
275+
pa.chunked_array(res_list, type=pa.bool_())
276+
)
270277
result = pc.utf8_is_digit(self._pa_array)
271278
return self._convert_bool_result(result)
272279

‎pandas/core/strings/accessor.py‎

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3602,16 +3602,26 @@ def casefold(self):
36023602
Series.str.isupper : Check whether all characters are uppercase.
36033603
Series.str.istitle : Check whether all characters are titlecase.
36043604
3605-
Examples
3606-
--------
3605+
Notes
3606+
-----
36073607
Similar to ``str.isdecimal`` but also includes special digits, like
36083608
superscripted and subscripted digits in unicode.
36093609
3610+
The exact behavior of this method, i.e. which unicode characters are
3611+
considered as digits, depends on the backend used for string operations,
3612+
and there can be small differences.
3613+
For example, Python considers the ³ superscript character as a digit, but
3614+
not the ⅕ fraction character, while PyArrow considers both as digits. For
3615+
simple (ascii) decimal numbers, the behaviour is consistent.
3616+
3617+
Examples
3618+
--------
3619+
36103620
>>> s3 = pd.Series(['23', '³', '⅕', ''])
36113621
>>> s3.str.isdigit()
36123622
0 True
3613-
1 False
3614-
2 False
3623+
1 True
3624+
2 True
36153625
3 False
36163626
dtype: bool
36173627
"""

‎pandas/tests/strings/test_strings.py‎

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88
import pytest
99

10+
from pandas.compat import pa_version_under21p0
1011
from pandas.errors import Pandas4Warning
1112

1213
from pandas import (
@@ -15,6 +16,7 @@
1516
Index,
1617
MultiIndex,
1718
Series,
19+
StringDtype,
1820
option_context,
1921
)
2022
import pandas._testing as tm
@@ -249,8 +251,9 @@ def test_ismethods(method, expected, any_string_dtype):
249251
@pytest.mark.parametrize(
250252
"method, expected",
251253
[
252-
("isnumeric", [False, True, True, False, True, True, False]),
253-
("isdecimal", [False, True, False, False, False, True, False]),
254+
("isnumeric", [False, True, True, True, False, True, True, False]),
255+
("isdecimal", [False, True, False, False, False, False, True, False]),
256+
("isdigit", [False, True, True, False, False, False, True, False]),
254257
],
255258
)
256259
def test_isnumeric_unicode(method, expected, any_string_dtype):
@@ -259,19 +262,35 @@ def test_isnumeric_unicode(method, expected, any_string_dtype):
259262
# 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
260263
# 0xFF13: ３ Em 3 # noqa: RUF003
261264
ser = Series(
262-
["A", "3", "¼", "★", "፸", "３", "four"], # noqa: RUF001
265+
["A", "3", "³", "¼", "★", "፸", "３", "four"], # noqa: RUF001
263266
dtype=any_string_dtype,
264267
)
265268
expected_dtype = (
266269
"bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
267270
)
268271
expected = Series(expected, dtype=expected_dtype)
272+
if (
273+
method == "isdigit"
274+
and isinstance(ser.dtype, StringDtype)
275+
and ser.dtype.storage == "pyarrow"
276+
and not pa_version_under21p0
277+
):
278+
# known difference in behavior between python and pyarrow unicode handling
279+
# pyarrow 21+ considers ¼ and ፸ as a digit, while python does not
280+
expected.iloc[3] = True
281+
expected.iloc[5] = True
282+
269283
result = getattr(ser.str, method)()
270284
tm.assert_series_equal(result, expected)
271285

272286
# compare with standard library
273-
expected = [getattr(item, method)() for item in ser]
274-
assert list(result) == expected
287+
# (only for non-pyarrow storage given the above differences)
288+
if any_string_dtype == "object" or (
289+
isinstance(any_string_dtype, StringDtype)
290+
and any_string_dtype.storage == "python"
291+
):
292+
expected = [getattr(item, method)() for item in ser]
293+
assert list(result) == expected
275294

276295

277296
@pytest.mark.parametrize(

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /