Commit e4ca405

authored

API: mode.nan_is_na to consistently distinguish NaN-vs-NA (#62040)

1 parent 53cb639 commit e4ca405 (Copy full SHA for e4ca405)

File tree

59 files changed

+775

-206

lines changed

asv_bench/benchmarks
doc/source
- user_guide
  - text.rst
- whatsnew
  - v0.24.0.rst
  - v3.0.0.rst
pandas
- _config
  - __init__.py
- _libs
- conftest.py
- core
  - arrays
    - _utils.py
    - arrow
      - array.py
    - masked.py
    - numeric.py
    - string_.py
  - config_init.py
  - dtypes
    - cast.py
  - generic.py
  - indexes
    - base.py
  - internals
    - construction.py
- io/json
  - _json.py
  - _table_schema.py
- tests
  - arrays
    - floating
    - integer
    - interval
      - test_interval_pyarrow.py
    - masked
      - test_function.py
    - string_
      - test_string.py
  - base
    - test_conversion.py
    - test_unique.py
  - extension
    - base
      - interface.py
    - test_arrow.py
    - test_masked.py
  - frame
    - methods
    - test_reductions.py
  - groupby
    - methods
      - test_quantile.py
    - test_reductions.py
  - indexes
    - multi
      - test_constructors.py
    - numeric
      - test_indexing.py
  - indexing
    - test_iloc.py
    - test_loc.py
  - io/formats/style
    - test_highlight.py
  - reshape
    - test_cut.py
  - series
    - accessors
      - test_dt_accessor.py
    - methods
    - test_npfuncs.py

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+775

-206

lines changed

`‎asv_bench/benchmarks/algorithms.py‎`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -199,8 +199,8 @@ class SortIntegerArray:`
`199`	`199`	`params = [103, 105]`
`200`	`200`
`201`	`201`	`def setup(self, N):`
`202`		`- data = np.arange(N, dtype=float)`
`203`		`- data[40] = np.nan`
	`202`	`+ data = np.arange(N, dtype=float).astype(object)`
	`203`	`+ data[40] = pd.NA`
`204`	`204`	`self.array = pd.array(data, dtype="Int64")`
`205`	`205`
`206`	`206`	`def time_argsort(self, N):`

`‎asv_bench/benchmarks/frame_methods.py‎`

Lines changed: 3 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,7 @@`
`4`	`4`	`import numpy as np`
`5`	`5`
`6`	`6`	`from pandas import (`
	`7`	`+ NA,`
`7`	`8`	`DataFrame,`
`8`	`9`	`Index,`
`9`	`10`	`MultiIndex,`
`@@ -445,6 +446,8 @@ def setup(self, inplace, dtype):`
`445`	`446`	`values[::2] = np.nan`
`446`	`447`	`if dtype == "Int64":`
`447`	`448`	`values = values.round()`
	`449`	`+ values = values.astype(object)`
	`450`	`+ values[::2] = NA`
`448`	`451`	`self.df = DataFrame(values, dtype=dtype)`
`449`	`452`	`self.fill_values = self.df.iloc[self.df.first_valid_index()].to_dict()`
`450`	`453`

`‎asv_bench/benchmarks/groupby.py‎`

Lines changed: 4 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -689,6 +689,10 @@ def setup(self, dtype, method, with_nans):`
`689`	`689`	`null_vals = vals.astype(float, copy=True)`
`690`	`690`	`null_vals[::2, :] = np.nan`
`691`	`691`	`null_vals[::3, :] = np.nan`
	`692`	`+ if dtype in ["Int64", "Float64"]:`
	`693`	`+ null_vals = null_vals.astype(object)`
	`694`	`+ null_vals[::2, :] = NA`
	`695`	`+ null_vals[::3, :] = NA`
`692`	`696`	`df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype)`
`693`	`697`	`df["key"] = keys`
`694`	`698`	`self.df = df`

`‎doc/source/user_guide/text.rst‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -75,7 +75,7 @@ or convert from existing pandas data:`
`75`	`75`
`76`	`76`	`.. ipython:: python`
`77`	`77`
`78`		`- s1 = pd.Series([1, 2, np.nan], dtype="Int64")`
	`78`	`+ s1 = pd.Series([1, 2, pd.NA], dtype="Int64")`
`79`	`79`	`s1`
`80`	`80`	`s2 = s1.astype("string")`
`81`	`81`	`s2`

`‎doc/source/whatsnew/v0.24.0.rst‎`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
@@ -50,7 +50,7 @@ marker of ``np.nan`` will infer to integer dtype. The display of the ``Series``
`50`	`50`
`51`	`51`	`.. ipython:: python`
`52`	`52`
`53`		`- s = pd.Series([1, 2, np.nan], dtype='Int64')`
	`53`	`+ s = pd.Series([1, 2, pd.NA], dtype='Int64')`
`54`	`54`	`s`
`55`	`55`
`56`	`56`
@@ -166,7 +166,7 @@ See the :ref:`dtypes docs <basics.dtypes>` for more on extension arrays.
`166`	`166`
`167`	`167`	`.. ipython:: python`
`168`	`168`
`169`		`- pd.array([1, 2, np.nan], dtype='Int64')`
	`169`	`+ pd.array([1, 2, pd.NA], dtype='Int64')`
`170`	`170`	`pd.array(['a', 'b', 'c'], dtype='category')`
`171`	`171`
`172`	`172`	`Passing data for which there isn't dedicated extension type (e.g. float, integer, etc.)`

`‎doc/source/whatsnew/v3.0.0.rst‎`

Lines changed: 49 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -465,6 +465,55 @@ small behavior differences as collateral:`
`465`	`465`	- Adding or subtracting a :class:`Day` with a :class:`Timedelta` is no longer supported.
`466`	`466`	- Adding or subtracting a :class:`Day` offset to a timezone-aware :class:`Timestamp` or datetime-like may lead to an ambiguous or non-existent time, which will raise.
`467`	`467`
	`468`	`+.. _whatsnew_300.api_breaking.nan_vs_na:`
	`469`	`+`
	`470`	`+Changed treatment of NaN values in pyarrow and numpy-nullable floating dtypes`
	`471`	`+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^`
	`472`	`+`
	`473`	+Previously, when dealing with a nullable dtype (e.g. ``Float64Dtype`` or ``int64[pyarrow]``), ``NaN`` was treated as interchangeable with :class:`NA` in some circumstances but not others. This was done to make adoption easier, but caused some confusion (:issue:`32265`). In 3.0, an option ``"mode.nan_is_na"`` (default ``True``) controls whether to treat ``NaN`` as equivalent to :class:`NA`.
	`474`	`+`
	`475`	+With ``pd.set_option("mode.nan_is_na", True)`` (again, this is the default), ``NaN`` can be passed to constructors, ``__setitem__``, ``__contains__`` and be treated the same as :class:`NA`. The only change users will see is that arithmetic and ``np.ufunc`` operations that previously introduced ``NaN`` entries produce :class:`NA` entries instead:
	`476`	`+`
	`477`	`+Old behavior:`
	`478`	`+`
	`479`	`+.. code-block:: ipython`
	`480`	`+`
	`481`	`+ In [2]: ser = pd.Series([0, None], dtype=pd.Float64Dtype())`
	`482`	`+ In [3]: ser / 0`
	`483`	`+ Out[3]:`
	`484`	`+ 0 NaN`
	`485`	`+ 1 <NA>`
	`486`	`+ dtype: Float64`
	`487`	`+`
	`488`	`+New behavior:`
	`489`	`+`
	`490`	`+.. ipython:: python`
	`491`	`+`
	`492`	`+ ser = pd.Series([0, None], dtype=pd.Float64Dtype())`
	`493`	`+ ser / 0`
	`494`	`+`
	`495`	+By contrast, with ``pd.set_option("mode.nan_is_na", False)``, ``NaN`` is always considered distinct from :class:`NA` and is treated specifically as a floating-point value, so it cannot be used with integer dtypes:
	`496`	`+`
	`497`	`+Old behavior:`
	`498`	`+`
	`499`	`+.. code-block:: ipython`
	`500`	`+`
	`501`	`+ In [2]: ser = pd.Series([1, np.nan], dtype=pd.Float64Dtype())`
	`502`	`+ In [3]: ser[1]`
	`503`	`+ Out[3]: <NA>`
	`504`	`+`
	`505`	`+New behavior:`
	`506`	`+`
	`507`	`+.. ipython:: python`
	`508`	`+`
	`509`	`+ pd.set_option("mode.nan_is_na", False)`
	`510`	`+ ser = pd.Series([1, np.nan], dtype=pd.Float64Dtype())`
	`511`	`+ ser[1]`
	`512`	`+`
	`513`	+If we had passed ``pd.Int64Dtype()`` or ``"int64[pyarrow]"`` for the dtype in the latter example, this would raise, as a float ``NaN`` cannot be held by an integer dtype.
	`514`	`+`
	`515`	+With ``"mode.nan_is_na"`` set to ``False``, ``ser.to_numpy()`` (and ``frame.values`` and ``np.asarray(obj)``) will convert to ``object`` dtype if :class:`NA` entries are present, where before they would coerce to ``NaN``. To retain a float numpy dtype, explicitly pass ``na_value=np.nan`` to :meth:`Series.to_numpy`.
	`516`	`+`
`468`	`517`	`.. _whatsnew_300.api_breaking.deps:`
`469`	`518`
`470`	`519`	`Increased minimum version for Python`

`‎pandas/_config/__init__.py‎`

Lines changed: 5 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -33,3 +33,8 @@`
`33`	`33`	`def using_string_dtype() -> bool:`
`34`	`34`	`_mode_options = _global_config["future"]`
`35`	`35`	`return _mode_options["infer_string"]`
	`36`	`+`
	`37`	`+`
	`38`	`+def is_nan_na() -> bool:`
	`39`	`+ _mode_options = _global_config["mode"]`
	`40`	`+ return _mode_options["nan_is_na"]`

`‎pandas/_libs/missing.pyi‎`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ...`
`14`	`14`	`def checknull(val: object) -> bool: ...`
`15`	`15`	`def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ...`
`16`	`16`	`def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...`
	`17`	`+def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ...`

`‎pandas/_libs/missing.pyx‎`

Lines changed: 18 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj):`
`249`	`249`	`return checknull_with_nat(obj) or obj is C_NA`
`250`	`250`
`251`	`251`
	`252`	`+@cython.wraparound(False)`
	`253`	`+@cython.boundscheck(False)`
	`254`	`+def is_pdna_or_none(values: ndarray) -> ndarray:`
	`255`	`+ cdef:`
	`256`	`+ ndarray[uint8_t] result`
	`257`	`+ Py_ssize_t i, N`
	`258`	`+ object val`
	`259`	`+`
	`260`	`+ N = len(values)`
	`261`	`+ result = np.zeros(N, dtype=np.uint8)`
	`262`	`+`
	`263`	`+ for i in range(N):`
	`264`	`+ val = values[i]`
	`265`	`+ if val is None or val is C_NA:`
	`266`	`+ result[i] = True`
	`267`	`+ return result.view(bool)`
	`268`	`+`
	`269`	`+`
`252`	`270`	`@cython.wraparound(False)`
`253`	`271`	`@cython.boundscheck(False)`
`254`	`272`	`def is_numeric_na(values: ndarray) -> ndarray:`

`‎pandas/_libs/parsers.pyx‎`

Lines changed: 3 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -8,6 +8,8 @@ from csv import (`
`8`	`8`	`)`
`9`	`9`	`import warnings`
`10`	`10`
	`11`	`+from pandas._config import is_nan_na`
	`12`	`+`
`11`	`13`	`from pandas.util._exceptions import find_stack_level`
`12`	`14`
`13`	`15`	`from pandas import (`
`@@ -1469,7 +1471,7 @@ def _maybe_upcast(`
`1469`	`1471`	`if isinstance(arr, IntegerArray) and arr.isna().all():`
`1470`	`1472`	`# use null instead of int64 in pyarrow`
`1471`	`1473`	`arr = arr.to_numpy(na_value=None)`
`1472`		`- arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))`
	`1474`	`+ arr = ArrowExtensionArray(pa.array(arr, from_pandas=is_nan_na()))`
`1473`	`1475`
`1474`	`1476`	`return arr`
`1475`	`1477`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commit e4ca405

File tree

59 files changed

Some content is hidden

59 files changed

`‎asv_bench/benchmarks/algorithms.py‎`

`‎asv_bench/benchmarks/frame_methods.py‎`

`‎asv_bench/benchmarks/groupby.py‎`

`‎doc/source/user_guide/text.rst‎`

`‎doc/source/whatsnew/v0.24.0.rst‎`

`‎doc/source/whatsnew/v3.0.0.rst‎`

`‎pandas/_config/__init__.py‎`

`‎pandas/_libs/missing.pyi‎`

`‎pandas/_libs/missing.pyx‎`

`‎pandas/_libs/parsers.pyx‎`

0 commit comments