From ba5fce1a97b76974d7fe54205ceeb6e83a2566e2 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 2 Jan 2022 07:27:35 -0800 Subject: [PATCH 01/12] API: Allow other na values in StringArray Constructor --- asv_bench/benchmarks/strings.py | 17 +++++++++++++++ doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/_libs/lib.pyx | 25 ++++++++++++++++++---- pandas/core/arrays/string_.py | 6 ++++-- pandas/tests/arrays/string_/test_string.py | 13 +++++++---- 5 files changed, 52 insertions(+), 11 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 32fbf4e6c7de3..85487f5d531a3 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -3,10 +3,12 @@ import numpy as np from pandas import ( + NA, Categorical, DataFrame, Series, ) +from pandas.arrays import StringArray from .pandas_vb_common import tm @@ -285,3 +287,18 @@ class Iter(Dtypes): def time_iter(self, dtype): for i in self.s: pass + + +class StringArrayConstruction: + def setup(self): + self.series_arr = tm.rands_array(nchars=10, size=10 ** 5) + self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)]) + + def time_string_array_construction(self): + StringArray(self.series_arr) + + def time_string_array_with_nan_construction(self): + StringArray(self.series_arr_nan) + + def peakmem_stringarray_construction(self): + StringArray(self.series_arr) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 4c3e53ddcfa26..30933f4d7381f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -225,7 +225,7 @@ Other enhancements - :class:`ExtensionDtype` and :class:`ExtensionArray` are now (de)serialized when exporting a :class:`DataFrame` with :meth:`DataFrame.to_json` using ``orient='table'`` (:issue:`20612`, :issue:`44705`). 
- Add support for `Zstandard `_ compression to :meth:`DataFrame.to_pickle`/:meth:`read_pickle` and friends (:issue:`43925`) - :meth:`DataFrame.to_sql` now returns an ``int`` of the number of written rows (:issue:`23998`) - +- :class:`StringArray` now accepts nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 950277ce608eb..f6c55f76ddd20 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -669,6 +669,27 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray: return result +# TODO: get rid of this in StringArray and modify +# and go through ensure_string_array instead +@cython.wraparound(False) +@cython.boundscheck(False) +def convert_nans_to_NA(ndarray[object] arr) -> ndarray: + """ + Helper for StringArray that converts null values that + are not pd.NA(e.g. np.nan, None) to pd.NA. Assumes elements + have already been validated as null. 
+ """ + cdef: + Py_ssize_t i, n = len(arr) + object val + ndarray[object] result + result = np.asarray(arr, dtype="object") + for i in range(n): + val = arr[i] + if not isinstance(val, str): + result[i] = C_NA + return result + @cython.wraparound(False) @cython.boundscheck(False) @@ -1880,10 +1901,6 @@ cdef class StringValidator(Validator): cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.str_) - cdef bint is_valid_null(self, object value) except -1: - # We deliberately exclude None / NaN here since StringArray uses NA - return value is C_NA - cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 919b882f22ecb..46a41ae813ca3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -310,11 +310,11 @@ def __init__(self, values, copy=False): values = extract_array(values) super().__init__(values, copy=copy) + if not isinstance(values, type(self)): + self._validate() # error: Incompatible types in assignment (expression has type "StringDtype", # variable has type "PandasDtype") NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) - if not isinstance(values, type(self)): - self._validate() def _validate(self): """Validate that we only store NA or strings.""" @@ -325,6 +325,8 @@ def _validate(self): "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." 
) + # Check to see if need to convert Na values to pd.NA + lib.convert_nans_to_NA(self._ndarray) @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 22fe7bb0de949..0919d57f9e612 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -267,15 +267,20 @@ def test_constructor_raises(cls): cls(np.array([])) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", np.nan], dtype=object)) - - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", None], dtype=object)) + cls(np.array(["a", np.datetime64("nat")], dtype=object)) with pytest.raises(ValueError, match=msg): cls(np.array(["a", pd.NaT], dtype=object)) +@pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA]) +def test_constructor_nan_like(na): + expected = pd.arrays.StringArray(np.array(["a", pd.NA])) + tm.assert_extension_array_equal( + pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected + ) + + @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): if cls is ArrowStringArray and copy is False: From 8011f8d0319b78764daca2473aacbef304df34fb Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 2 Jan 2022 18:24:22 -0800 Subject: [PATCH 02/12] fix compile warning? 
--- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f6c55f76ddd20..70f3631a8aadf 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -687,7 +687,7 @@ def convert_nans_to_NA(ndarray[object] arr) -> ndarray: for i in range(n): val = arr[i] if not isinstance(val, str): - result[i] = C_NA + result[i] = C_NA return result From 36ad88611de070bfbd4a26664357b75c4e42ab5a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 2 Jan 2022 20:13:19 -0800 Subject: [PATCH 03/12] 2d support --- pandas/_libs/lib.pyx | 27 ++++++++++++++++++++------- pandas/tests/dtypes/test_inference.py | 13 ++++++++++--- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 70f3631a8aadf..4882a6993405e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -669,25 +669,38 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray: return result +ctypedef fused ndarr_object: + ndarray[object, ndim=1] + ndarray[object, ndim=2] + # TODO: get rid of this in StringArray and modify # and go through ensure_string_array instead @cython.wraparound(False) @cython.boundscheck(False) -def convert_nans_to_NA(ndarray[object] arr) -> ndarray: +def convert_nans_to_NA(ndarr_object arr) -> ndarray: """ Helper for StringArray that converts null values that are not pd.NA(e.g. np.nan, None) to pd.NA. Assumes elements have already been validated as null. 
""" cdef: - Py_ssize_t i, n = len(arr) + Py_ssize_t i, m, n object val - ndarray[object] result + ndarr_object result result = np.asarray(arr, dtype="object") - for i in range(n): - val = arr[i] - if not isinstance(val, str): - result[i] = C_NA + if arr.ndim == 2: + m,n = arr.shape[0], arr.shape[1] + for i in range(m): + for j in range(n): + val = arr[i][j] + if not isinstance(val, str): + result[i][j] = C_NA + else: + n = len(arr) + for i in range(n): + val = arr[i] + if not isinstance(val, str): + result[i] = C_NA return result diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 7953d650636be..178c91efa0c27 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1542,11 +1542,18 @@ def test_is_string_array(self): assert lib.is_string_array( np.array(["foo", "bar", pd.NA], dtype=object), skipna=True ) - # NaN is not valid for string array, just NA + assert lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=True + ) assert not lib.is_string_array( - np.array(["foo", "bar", np.nan], dtype=object), skipna=True + np.array(["foo", "bar", pd.NaT], dtype=object), skipna=True + ) + assert not lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=False + ) + assert not lib.is_string_array( + np.array(["foo", "bar", np.nan], dtype=object), skipna=False ) - assert not lib.is_string_array(np.array([1, 2])) def test_to_object_array_tuples(self): From 3ddcf37472d58267d57f4d76611187ab3a3713a6 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 3 Jan 2022 19:09:35 -0800 Subject: [PATCH 04/12] fix last tests and address comments --- pandas/_libs/lib.pyx | 4 ++-- pandas/core/arrays/string_.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4882a6993405e..4a00a9dcbb183 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ 
-692,9 +692,9 @@ def convert_nans_to_NA(ndarr_object arr) -> ndarray: m,n = arr.shape[0], arr.shape[1] for i in range(m): for j in range(n): - val = arr[i][j] + val = arr[i, j] if not isinstance(val, str): - result[i][j] = C_NA + result[i, j] = C_NA else: n = len(arr) for i in range(n): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 46a41ae813ca3..1ba739f1a4c45 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -326,7 +326,11 @@ def _validate(self): f"'{self._ndarray.dtype}' dtype instead." ) # Check to see if need to convert Na values to pd.NA - lib.convert_nans_to_NA(self._ndarray) + if self._ndarray.ndim > 2: + # Ravel if ndims > 2 b/c no cythonized version available + lib.convert_nans_to_NA(self._ndarray.ravel("K")) + else: + lib.convert_nans_to_NA(self._ndarray) @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): From 4c60d0f5044b32d07dc06a49b36db295fe1646dd Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 4 Jan 2022 08:27:44 -0800 Subject: [PATCH 05/12] fix typing? --- pandas/_libs/lib.pyi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index a7ebd9d0c77ad..6a1519c827c7a 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -161,6 +161,9 @@ def astype_intsafe( arr: npt.NDArray[np.object_], new_dtype: np.dtype, ) -> np.ndarray: ... +def convert_nans_to_NA( + arr: npt.NDArray[np.object_], +) -> npt.NDArray[np.object_]: ... def fast_zip(ndarrays: list) -> npt.NDArray[np.object_]: ... # TODO: can we be more specific about rows? 
From d5e594d83ba1ab46a318325356dbce40608ba95e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 10 Jan 2022 20:18:31 -0800 Subject: [PATCH 06/12] add more doc --- pandas/core/arrays/string_.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1ba739f1a4c45..29dfea4ea2d71 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -246,11 +246,19 @@ class StringArray(BaseStringArray, PandasArray): .. warning:: Currently, this expects an object-dtype ndarray - where the elements are Python strings or :attr:`pandas.NA`. + where the elements are Python strings + or nan-likes(``None``, ``np.nan``, ``NA``). This may change without warning in the future. Use :meth:`pandas.array` with ``dtype="string"`` for a stable way of creating a `StringArray` from any sequence. + .. versionchanged:: 1.5.0 + + StringArray now accepts nan-likes(``None``, ``np.nan``) for the + ``values`` parameter in its constructor + in addition to strings and :attr:`pandas.NA` + + copy : bool, default False Whether to copy the array of data. From 9c6e9d3fced6a249cee7b0179e24f50c0ce6d875 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 10 Jan 2022 20:20:26 -0800 Subject: [PATCH 07/12] move whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 - doc/source/whatsnew/v1.5.0.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 7c9345a6a6b75..12bd51cba47a3 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -226,7 +226,6 @@ Other enhancements - :class:`ExtensionDtype` and :class:`ExtensionArray` are now (de)serialized when exporting a :class:`DataFrame` with :meth:`DataFrame.to_json` using ``orient='table'`` (:issue:`20612`, :issue:`44705`). 
- Add support for `Zstandard `_ compression to :meth:`DataFrame.to_pickle`/:meth:`read_pickle` and friends (:issue:`43925`) - :meth:`DataFrame.to_sql` now returns an ``int`` of the number of written rows (:issue:`23998`) -- :class:`StringArray` now accepts nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 2cbc7b06b89df..a952d65c3cb84 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -30,7 +30,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- :class:`StringArray` now accepts nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`) - .. --------------------------------------------------------------------------- From 1fca52467211764a6cbe4de5403ecd107b234878 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 11 Jan 2022 16:53:20 -0800 Subject: [PATCH 08/12] Update string_.py --- pandas/core/arrays/string_.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 29dfea4ea2d71..66a7da4dc4078 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -258,7 +258,6 @@ class StringArray(BaseStringArray, PandasArray): ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA` - copy : bool, default False Whether to copy the array of data. 
From 9016c0084fc88c6240b4996f498632abf385173d Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 11 Jan 2022 20:09:15 -0800 Subject: [PATCH 09/12] fixes --- pandas/_libs/lib.pyx | 2 +- pandas/tests/dtypes/test_inference.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 71fdda1e60649..9a82e89481b45 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -689,7 +689,7 @@ def convert_nans_to_NA(ndarr_object arr) -> ndarray: ndarr_object result result = np.asarray(arr, dtype="object") if arr.ndim == 2: - m,n = arr.shape[0], arr.shape[1] + m, n = arr.shape[0], arr.shape[1] for i in range(m): for j in range(n): val = arr[i, j] diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 178c91efa0c27..404df6f4daadb 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1534,7 +1534,9 @@ def test_is_numeric_array(self): assert not lib.is_integer_array(np.array([1, 2.0])) def test_is_string_array(self): - + # We should only be accepting pd.NA, np.nan, + # other floating point nans e.g. float('nan') + # when skipna is True. 
assert lib.is_string_array(np.array(["foo", "bar"])) assert not lib.is_string_array( np.array(["foo", "bar", pd.NA], dtype=object), skipna=False @@ -1545,6 +1547,9 @@ def test_is_string_array(self): assert lib.is_string_array( np.array(["foo", "bar", None], dtype=object), skipna=True ) + assert lib.is_string_array( + np.array(["foo", "bar", np.nan], dtype=object), skipna=True + ) assert not lib.is_string_array( np.array(["foo", "bar", pd.NaT], dtype=object), skipna=True ) From 6ec20595276ceb5b21657757f0130e0fa5e51e30 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 12 Jan 2022 04:24:56 +0000 Subject: [PATCH 10/12] Fixes from pre-commit [automated commit] --- pandas/tests/dtypes/test_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 404df6f4daadb..9ae5b42161b73 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1534,7 +1534,7 @@ def test_is_numeric_array(self): assert not lib.is_integer_array(np.array([1, 2.0])) def test_is_string_array(self): - # We should only be accepting pd.NA, np.nan, + # We should only be accepting pd.NA, np.nan, # other floating point nans e.g. float('nan') # when skipna is True. 
assert lib.is_string_array(np.array(["foo", "bar"])) From 1fb424c2f53210bf4a96dd1dd628572f99a6cbc8 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 15 Jan 2022 07:56:46 -0800 Subject: [PATCH 11/12] code review --- pandas/core/arrays/string_.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 66a7da4dc4078..8142f66dcd3b3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -247,15 +247,15 @@ class StringArray(BaseStringArray, PandasArray): Currently, this expects an object-dtype ndarray where the elements are Python strings - or nan-likes(``None``, ``np.nan``, ``NA``). + or nan-likes (``None``, ``np.nan``, ``NA``). This may change without warning in the future. Use :meth:`pandas.array` with ``dtype="string"`` for a stable way of creating a `StringArray` from any sequence. .. versionchanged:: 1.5.0 - StringArray now accepts nan-likes(``None``, ``np.nan``) for the - ``values`` parameter in its constructor + StringArray now accepts array-likes containing + nan-likes (``None``, ``np.nan``) for the ``values`` parameter in addition to strings and :attr:`pandas.NA` copy : bool, default False From ad55cd393485028d0a39e66556838751aff736d4 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 15 Jan 2022 15:58:11 +0000 Subject: [PATCH 12/12] Fixes from pre-commit [automated commit] --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8142f66dcd3b3..b79e915fa6c94 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -254,7 +254,7 @@ class StringArray(BaseStringArray, PandasArray): .. 
versionchanged:: 1.5.0 - StringArray now accepts array-likes containing + StringArray now accepts array-likes containing nan-likes (``None``, ``np.nan``) for the ``values`` parameter in addition to strings and :attr:`pandas.NA`