From 3e1784de50a9305dbccb185d4b2830a4c4addbed Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 10 May 2021 16:19:47 -0700 Subject: [PATCH 01/28] API: allow nan-likes in StringArray constructor --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/lib.pyx | 24 ++++++++++++++------ pandas/core/arrays/string_.py | 26 +++++++++++++++++----- pandas/tests/arrays/string_/test_string.py | 12 +++++----- 4 files changed, 46 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 5adc8540e6864..fd246cb554d7f 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -612,6 +612,7 @@ Other API changes - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects. - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`) - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`) +- :class:`StringArray` now accepts nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")) in its constructor in addition to strings. Build ===== diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1cb744c7033c..fcb6d39bfc91f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -679,11 +679,14 @@ cpdef ndarray[object] ensure_string_array( arr, object na_value=np.nan, bint convert_na_value=True, + bint coerce=True, bint copy=True, bint skipna=True, ): """ - Returns a new numpy array with object dtype and only strings and na values. 
+ Checks that all elements in numpy are string or null and returns a new numpy array + with object dtype and only strings and na values if so. Otherwise, + raise a ValueError. Parameters ---------- @@ -693,6 +696,9 @@ cpdef ndarray[object] ensure_string_array( The value to use for na. For example, np.nan or pd.NA. convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. + coerce : bool, default True + Whether to coerce non-null non-string elements to strings. + Will raise ValueError otherwise. copy : bool, default True Whether to ensure that a new array is returned. skipna : bool, default True @@ -724,7 +730,10 @@ cpdef ndarray[object] ensure_string_array( continue if not checknull(val): - result[i] = str(val) + if coerce: + result[i] = str(val) + else: + raise ValueError("Non-string element encountered in array.") else: if convert_na_value: val = na_value @@ -1835,10 +1844,6 @@ cdef class StringValidator(Validator): cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.str_) - cdef bint is_valid_null(self, object value) except -1: - # We deliberately exclude None / NaN here since StringArray uses NA - return value is C_NA - cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: @@ -2059,7 +2064,7 @@ def maybe_convert_numeric( upcasting for ints with nulls to float64. Returns ------- - np.ndarray + np.ndarray or tuple of converted values and its mask Array of converted object values to numerical ones. Optional[np.ndarray] @@ -2224,6 +2229,11 @@ def maybe_convert_numeric( if allow_null_in_int and seen.null_ and not seen.int_: seen.float_ = True + # This occurs since we disabled float nulls showing as null in anticipation + # of seeing ints that were never seen. 
So then, we return float + if allow_null_in_int and seen.null_ and not seen.int_: + seen.float_ = True + if seen.complex_: return (complexes, None) elif seen.float_: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 74ca5130ca322..c30d4b8ba7b41 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -144,11 +144,18 @@ class StringArray(PandasArray): .. warning:: Currently, this expects an object-dtype ndarray - where the elements are Python strings or :attr:`pandas.NA`. + where the elements are Python strings + or nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")). This may change without warning in the future. Use :meth:`pandas.array` with ``dtype="string"`` for a stable way of creating a `StringArray` from any sequence. + .. versionchanged:: 1.3 + + StringArray now accepts nan-likes in the constructor in addition + to strings, whereas it only accepted strings and :attr:`pandas.NA` + before. + copy : bool, default False Whether to copy the array of data. @@ -208,21 +215,30 @@ def __init__(self, values, copy=False): values = extract_array(values) super().__init__(values, copy=copy) + if not isinstance(values, type(self)): + self._validate() # error: Incompatible types in assignment (expression has type "StringDtype", # variable has type "PandasDtype") NDArrayBacked.__init__(self, self._ndarray, StringDtype()) - if not isinstance(values, type(self)): - self._validate() def _validate(self): """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") if self._ndarray.dtype != "object": raise ValueError( "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." 
) + try: + lib.ensure_string_array( + self._ndarray, na_value=StringDtype.na_value, coerce=False, copy=False + ), + NDArrayBacked.__init__( + self, + self._ndarray, + StringDtype(), + ) + except ValueError: + raise ValueError("StringArray requires a sequence of strings or pandas.NA") @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 17d05ebeb0fc5..722aada176c44 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -297,13 +297,15 @@ def test_constructor_raises(cls): cls(np.array([])) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", np.nan], dtype=object)) + cls(np.array(["a", None])) - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", None], dtype=object)) - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", pd.NaT], dtype=object)) +@pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA]) +def test_constructor_nan_like(na): + expected = pd.arrays.StringArray(np.array(["a", pd.NA])) + tm.assert_extension_array_equal( + pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected + ) @pytest.mark.parametrize("copy", [True, False]) From 96ff1da535cd571cd45cb60d4cd1fdb47744f79e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 10 May 2021 19:31:47 -0700 Subject: [PATCH 02/28] Revert weird changes & Fix stuff --- pandas/_libs/lib.pyi | 1 + pandas/_libs/lib.pyx | 7 +------ pandas/tests/arrays/string_/test_string.py | 2 +- pandas/tests/dtypes/test_inference.py | 7 ++++--- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 9dbc47f1d40f7..22990361bc52e 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -138,6 +138,7 @@ def ensure_string_array( arr, na_value: object = np.nan, convert_na_value: bool = True, + coerce: bool = True, copy: bool = True, 
skipna: bool = True, ) -> np.ndarray: ... # np.ndarray[object] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index fcb6d39bfc91f..b1523421e59fd 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2064,7 +2064,7 @@ def maybe_convert_numeric( upcasting for ints with nulls to float64. Returns ------- - np.ndarray or tuple of converted values and its mask + np.ndarray Array of converted object values to numerical ones. Optional[np.ndarray] @@ -2229,11 +2229,6 @@ def maybe_convert_numeric( if allow_null_in_int and seen.null_ and not seen.int_: seen.float_ = True - # This occurs since we disabled float nulls showing as null in anticipation - # of seeing ints that were never seen. So then, we return float - if allow_null_in_int and seen.null_ and not seen.int_: - seen.float_ = True - if seen.complex_: return (complexes, None) elif seen.float_: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 722aada176c44..b3bc3b09e047a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -297,7 +297,7 @@ def test_constructor_raises(cls): cls(np.array([])) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", None])) + cls(np.array(["a", np.nan])) @pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA]) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 076cc155f3626..73e87c75ee621 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1376,11 +1376,12 @@ def test_is_string_array(self): assert lib.is_string_array( np.array(["foo", "bar", pd.NA], dtype=object), skipna=True ) - # NaN is not valid for string array, just NA - assert not lib.is_string_array( + assert lib.is_string_array( np.array(["foo", "bar", np.nan], dtype=object), skipna=True ) - + assert not lib.is_string_array( + np.array(["foo", "bar", np.nan], dtype=object), skipna=False + ) assert 
not lib.is_string_array(np.array([1, 2])) def test_to_object_array_tuples(self): From 418e1d201ad0c20b9c5119fff34567fe72158ec2 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 11 May 2021 07:01:06 -0700 Subject: [PATCH 03/28] Remove failing test --- pandas/tests/arrays/string_/test_string.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b3bc3b09e047a..7feb22f69632a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -296,9 +296,6 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", np.nan])) - @pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA]) def test_constructor_nan_like(na): From 25a6c4d2ec9287b5b0a341c3cdd583cc3659a276 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 19 May 2021 16:23:41 -0700 Subject: [PATCH 04/28] Changes from code review --- pandas/_libs/lib.pyi | 3 +-- pandas/_libs/lib.pyx | 24 ++++++++++++++---------- pandas/core/arrays/string_.py | 9 ++------- pandas/core/arrays/string_arrow.py | 2 +- pandas/core/dtypes/cast.py | 4 ++-- 5 files changed, 20 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 22990361bc52e..966fd0cd4c008 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -137,8 +137,7 @@ def maybe_convert_numeric( def ensure_string_array( arr, na_value: object = np.nan, - convert_na_value: bool = True, - coerce: bool = True, + coerce: str = "all, copy: bool = True, skipna: bool = True, ) -> np.ndarray: ... 
# np.ndarray[object] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b1523421e59fd..fc3d73f332646 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -678,15 +678,14 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray: cpdef ndarray[object] ensure_string_array( arr, object na_value=np.nan, - bint convert_na_value=True, - bint coerce=True, + coerce="all", bint copy=True, bint skipna=True, ): """ - Checks that all elements in numpy are string or null and returns a new numpy array - with object dtype and only strings and na values if so. Otherwise, - raise a ValueError. + Checks that all elements in numpy array are string or null + and returns a new numpy array with object dtype + and only strings and na values if so. Otherwise, raise a ValueError. Parameters ---------- @@ -696,9 +695,14 @@ cpdef ndarray[object] ensure_string_array( The value to use for na. For example, np.nan or pd.NA. convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. - coerce : bool, default True - Whether to coerce non-null non-string elements to strings. - Will raise ValueError otherwise. + coerce : {{'all', 'null', 'non-null', None}}, default 'all' + Whether to coerce non-string elements to strings. + - 'all' will convert null values and non-null non-string values. + - 'null' will only convert nulls without converting other non-strings. + - 'non-null' will only convert non-null non-string elements to string. + - None will not convert anything. + If coerce is not all, a ValueError will be raised for values + that are not strings or na_value. copy : bool, default True Whether to ensure that a new array is returned. 
skipna : bool, default True @@ -730,12 +734,12 @@ cpdef ndarray[object] ensure_string_array( continue if not checknull(val): - if coerce: + if coerce =="all" or coerce == "non-null": result[i] = str(val) else: raise ValueError("Non-string element encountered in array.") else: - if convert_na_value: + if coerce=="all" or coerce == "null": val = na_value if skipna: result[i] = val diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c30d4b8ba7b41..289204c9aa4e5 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -230,13 +230,8 @@ def _validate(self): ) try: lib.ensure_string_array( - self._ndarray, na_value=StringDtype.na_value, coerce=False, copy=False + self._ndarray, na_value=StringDtype.na_value, coerce="null", copy=False ), - NDArrayBacked.__init__( - self, - self._ndarray, - StringDtype(), - ) except ValueError: raise ValueError("StringArray requires a sequence of strings or pandas.NA") @@ -251,7 +246,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): # avoid costly conversion to object dtype na_values = scalars._mask result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + result = lib.ensure_string_array(result, copy=copy, coerce="non-null") result[na_values] = StringDtype.na_value else: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 219a8c7ec0b82..42b7bf1a52513 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -247,7 +247,7 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) # numerical issues with Float32Dtype na_values = scalars._mask result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + result = lib.ensure_string_array(result, copy=copy, coerce="non-null") return cls(pa.array(result, mask=na_values, type=pa.string())) # convert non-na-likes to str diff --git 
a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 46dc97214e2f6..1e8c09136e223 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1125,7 +1125,7 @@ def astype_nansafe( return arr.astype(dtype, copy=copy) if issubclass(dtype.type, str): - return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False) + return lib.ensure_string_array(arr, skipna=skipna, convert_na_value="non-null") elif is_datetime64_dtype(arr): if dtype == np.int64: @@ -1925,7 +1925,7 @@ def construct_1d_ndarray_preserving_na( """ if dtype is not None and dtype.kind == "U": - subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy) + subarr = lib.ensure_string_array(values, coerce="non-null", copy=copy) else: if dtype is not None: _disallow_mismatched_datetimelike(values, dtype) From 8257dbd739a4b6f12b737f89da317a24d3f8b07f Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 20 May 2021 14:32:58 -0700 Subject: [PATCH 05/28] typo --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c256f20527ad6..46af33b724d2a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1125,7 +1125,7 @@ def astype_nansafe( return arr.astype(dtype, copy=copy) if issubclass(dtype.type, str): - return lib.ensure_string_array(arr, skipna=skipna, convert_na_value="non-null") + return lib.ensure_string_array(arr, skipna=skipna, coerce="non-null") elif is_datetime64_dtype(arr): if dtype == np.int64: From 922436a78903dfa55cd1d54d4381477cad934af5 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 21 May 2021 13:30:08 -0700 Subject: [PATCH 06/28] Update lib.pyi --- pandas/_libs/lib.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 1e49ce67f7cec..726b306e71fd5 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ 
-146,7 +146,7 @@ def maybe_convert_numeric( def ensure_string_array( arr, na_value: object = np.nan, - coerce: str = "all, + coerce: str = "all", copy: bool = True, skipna: bool = True, ) -> np.ndarray: ... # np.ndarray[object] From 2f28086a0f23bf2b30d79ca41aaab0abb3ca370b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 29 May 2021 11:03:33 -0700 Subject: [PATCH 07/28] Update lib.pyx --- pandas/_libs/lib.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8df50a32ae482..99872d2f9e91f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1849,7 +1849,11 @@ cdef class StringValidator(Validator): cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.str_) - + + cdef bint is_valid_null(self, object value) except -1: + # Override to exclude float('Nan') and complex NaN + return value is None or value is C_NA or np.isnan(value) + cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: From 3ee219815e619fb57edeee0c295ba36e84232e0a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 29 May 2021 11:05:19 -0700 Subject: [PATCH 08/28] Update lib.pyx --- pandas/_libs/lib.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 99872d2f9e91f..ce70d15c202f5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -693,8 +693,6 @@ cpdef ndarray[object] ensure_string_array( The values to be converted to str, if needed. na_value : Any, default np.nan The value to use for na. For example, np.nan or pd.NA. - convert_na_value : bool, default True - If False, existing na values will be used unchanged in the new array. coerce : {{'all', 'null', 'non-null', None}}, default 'all' Whether to coerce non-string elements to strings. - 'all' will convert null values and non-null non-string values. 
@@ -1849,11 +1847,11 @@ cdef class StringValidator(Validator): cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.str_) - + cdef bint is_valid_null(self, object value) except -1: # Override to exclude float('Nan') and complex NaN return value is None or value is C_NA or np.isnan(value) - + cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: From 3ee55f25a94a12da069a387a150164538394d460 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 29 May 2021 21:21:57 -0700 Subject: [PATCH 09/28] Updates --- pandas/tests/arrays/string_/test_string.py | 5 ++++- pandas/tests/dtypes/test_inference.py | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a246be938aef0..af57aff03b073 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -289,8 +289,11 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", pd.NaT], dtype=object)) + -@pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA]) +@pytest.mark.parametrize("na", [np.nan, None, pd.NA]) def test_constructor_nan_like(na): expected = pd.arrays.StringArray(np.array(["a", pd.NA])) tm.assert_extension_array_equal( diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 987b3accbca2e..87a1be80e3639 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1391,6 +1391,12 @@ def test_is_string_array(self): assert lib.is_string_array( np.array(["foo", "bar", np.nan], dtype=object), skipna=True ) + assert lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=True + ) + assert not lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=False + ) assert not lib.is_string_array( 
np.array(["foo", "bar", np.nan], dtype=object), skipna=False ) From fe4981a6337cd59ae68b1ff44ca0f9b600d2ee49 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 30 May 2021 06:18:55 -0700 Subject: [PATCH 10/28] Update lib.pyx --- pandas/_libs/lib.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a1e66a575097e..08d7a68cd0dc0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -731,7 +731,10 @@ cpdef ndarray[object] ensure_string_array( if isinstance(val, str): continue - if not checknull(val): + if not (val is None or val is C_NA or np.isnan(val)): + # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid + # If they are present, they are treated like a regular Python object + # and will either cause an exception to be raised or be coerced. if coerce =="all" or coerce == "non-null": result[i] = str(val) else: From a66948aa7aa21d057c322895b59ea9f8c79480cd Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 30 May 2021 09:26:52 -0700 Subject: [PATCH 11/28] Update lib.pyx --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 08d7a68cd0dc0..a987f47533259 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -731,7 +731,7 @@ cpdef ndarray[object] ensure_string_array( if isinstance(val, str): continue - if not (val is None or val is C_NA or np.isnan(val)): + if not (val is None or val is C_NA or val != val): # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid # If they are present, they are treated like a regular Python object # and will either cause an exception to be raised or be coerced. 
From e8527191d33ed9c4416d265b175822c19bd5b4ae Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 31 May 2021 09:29:58 -0700 Subject: [PATCH 12/28] Update lib.pyx --- pandas/_libs/lib.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a987f47533259..f39b1fbc49cdb 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -731,7 +731,7 @@ cpdef ndarray[object] ensure_string_array( if isinstance(val, str): continue - if not (val is None or val is C_NA or val != val): + if not (val is None or val is C_NA or val is np.nan): # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid # If they are present, they are treated like a regular Python object # and will either cause an exception to be raised or be coerced. @@ -1853,7 +1853,7 @@ cdef class StringValidator(Validator): cdef bint is_valid_null(self, object value) except -1: # Override to exclude float('Nan') and complex NaN - return value is None or value is C_NA or np.isnan(value) + return value is None or value is C_NA or value is np.nan cpdef bint is_string_array(ndarray values, bint skipna=False): From 91b73bb93aad90f26040c729b57d99ec26eb3941 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 4 Jun 2021 08:28:52 -0700 Subject: [PATCH 13/28] disallow invalid nans in stringarray constructor --- pandas/_libs/lib.pyx | 23 ++++++++++++++++------- pandas/core/arrays/string_.py | 7 +++++-- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f39b1fbc49cdb..e3fa8eeaa9b53 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -94,6 +94,7 @@ from pandas._libs.missing cimport ( is_null_timedelta64, isnaobj, ) +from pandas._libs.missing import checknull from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( NPY_NAT, @@ -696,10 +697,12 @@ cpdef ndarray[object] 
ensure_string_array( coerce : {{'all', 'null', 'non-null', None}}, default 'all' Whether to coerce non-string elements to strings. - 'all' will convert null values and non-null non-string values. - - 'null' will only convert nulls without converting other non-strings. + - 'strict-null' will only convert pd.NA, np.nan, or None to na_value + without converting other non-strings. + - 'null' will convert nulls to na_value w/out converting other non-strings. - 'non-null' will only convert non-null non-string elements to string. - None will not convert anything. - If coerce is not all, a ValueError will be raised for values + If coerce is not 'all', a ValueError will be raised for values that are not strings or na_value. copy : bool, default True Whether to ensure that a new array is returned. @@ -714,6 +717,7 @@ cpdef ndarray[object] ensure_string_array( """ cdef: Py_ssize_t i = 0, n = len(arr) + set strict_na_values = {C_NA, np.nan, None} if hasattr(arr, "to_numpy"): arr = arr.to_numpy() @@ -725,22 +729,27 @@ cpdef ndarray[object] ensure_string_array( if copy and result is arr: result = result.copy() + if coerce == 'strict-null': + # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid + # If they are present, they are treated like a regular Python object + # and will either cause an exception to be raised or be coerced. + check_null = strict_na_values.__contains__ + else: + check_null = checknull + for i in range(n): val = arr[i] if isinstance(val, str): continue - if not (val is None or val is C_NA or val is np.nan): - # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid - # If they are present, they are treated like a regular Python object - # and will either cause an exception to be raised or be coerced. 
+ if not check_null(val): if coerce =="all" or coerce == "non-null": result[i] = str(val) else: raise ValueError("Non-string element encountered in array.") else: - if coerce=="all" or coerce == "null": + if coerce=="all" or coerce == "null" or coerce == 'strict-null': val = na_value if skipna: result[i] = val diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 79ddd12476323..d0ea1aa5c5293 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -230,8 +230,11 @@ def _validate(self): ) try: lib.ensure_string_array( - self._ndarray, na_value=StringDtype.na_value, coerce="null", copy=False - ), + self._ndarray, + na_value=StringDtype.na_value, + coerce="strict-null", + copy=False, + ) except ValueError: raise ValueError("StringArray requires a sequence of strings or pandas.NA") From 41f49d21d8da2bbdcc37d33714d009ea2b862049 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 4 Jun 2021 12:40:56 -0700 Subject: [PATCH 14/28] add to _from_sequence and fixes --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/arrays/string_.py | 18 ++++++++++++++---- pandas/core/arrays/string_arrow.py | 16 +++++++++++++--- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 93e27a7318f2d..4c5175b8e1bcc 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -644,7 +644,7 @@ Other API changes - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects. - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`) - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". 
Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`) -- :class:`StringArray` now accepts nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")) in its constructor in addition to strings. +- :class:`StringArray` now accepts nan-likes(``None``, ``nan``, ``NA``) in its constructor in addition to strings. - Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index d0ea1aa5c5293..4d97035714ba3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -145,7 +145,7 @@ class StringArray(PandasArray): Currently, this expects an object-dtype ndarray where the elements are Python strings - or nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")). + or nan-likes(``None``, ``nan``, ``NA``). This may change without warning in the future. Use :meth:`pandas.array` with ``dtype="string"`` for a stable way of creating a `StringArray` from any sequence. 
@@ -239,7 +239,9 @@ def _validate(self): raise ValueError("StringArray requires a sequence of strings or pandas.NA") @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy=False, coerce=True + ): if dtype: assert dtype == "string" @@ -247,15 +249,23 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype + if coerce: + coerce = "non-null" + else: + coerce = None na_values = scalars._mask result = scalars._data - result = lib.ensure_string_array(result, copy=copy, coerce="non-null") + result = lib.ensure_string_array(result, copy=copy, coerce=coerce) result[na_values] = StringDtype.na_value else: # convert non-na-likes to str, and nan-likes to StringDtype.na_value + if coerce: + coerce = "all" + else: + coerce = "strict-null" result = lib.ensure_string_array( - scalars, na_value=StringDtype.na_value, copy=copy + scalars, na_value=StringDtype.na_value, copy=copy, coerce=coerce ) # Manually creating new array avoids the validation step in the __init__, so is diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7aeadbb4c4616..f0af7a8a43594 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -237,7 +237,9 @@ def __init__(self, values): ) @classmethod - def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence( + cls, scalars, dtype: Dtype | None = None, copy: bool = False, coerce=True + ): from pandas.core.arrays.masked import BaseMaskedArray _chk_pyarrow_available() @@ -247,11 +249,19 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) # numerical issues with Float32Dtype na_values = scalars._mask result = scalars._data - result = lib.ensure_string_array(result, copy=copy, coerce="non-null") + if 
coerce: + coerce = "non-null" + else: + coerce = None + result = lib.ensure_string_array(result, copy=copy, coerce=coerce) return cls(pa.array(result, mask=na_values, type=pa.string())) # convert non-na-likes to str - result = lib.ensure_string_array(scalars, copy=copy) + if coerce: + coerce = "all" + else: + coerce = "strict-null" + result = lib.ensure_string_array(scalars, copy=copy, coerce=coerce) return cls(pa.array(result, type=pa.string(), from_pandas=True)) @classmethod From 62cc5be5dde0b71c4cbb006378cbf27363a2577d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 4 Jun 2021 16:49:58 -0700 Subject: [PATCH 15/28] address code review --- asv_bench/benchmarks/strings.py | 7 +++++++ pandas/_libs/lib.pyx | 2 ++ pandas/tests/arrays/string_/test_string.py | 7 +++++++ 3 files changed, 16 insertions(+) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 2e109e59c1c6d..765697f1cc505 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -7,6 +7,7 @@ DataFrame, Series, ) +from pandas.core.arrays import StringArray from .pandas_vb_common import tm @@ -61,6 +62,12 @@ def time_cat_frame_construction(self, dtype): def peakmem_cat_frame_construction(self, dtype): DataFrame(self.frame_cat_arr, dtype=dtype) + def time_string_array_construction(self): + StringArray(self.series_arr) + + def peakmem_stringarray_construction(self): + StringArray(self.series_arr) + class Methods(Dtypes): def time_center(self, dtype): diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c68758f990ad2..b73c8cbe0f018 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -719,6 +719,8 @@ cpdef ndarray[object] ensure_string_array( np.ndarray[object] An array with the input array's elements casted to str or nan-like. 
""" + if coerce not in {"all", "strict-null", "null", "non-null", None}: + raise ValueError("coerce argument is not valid") cdef: Py_ssize_t i = 0, n = len(arr) set strict_na_values = {C_NA, np.nan, None} diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 26770fcc1bf62..822c7f79e15dc 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -8,6 +8,7 @@ import numpy as np import pytest +import pandas._libs.lib as lib import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_dtype_equal @@ -303,6 +304,12 @@ def test_constructor_nan_like(na): ) +def test_invalid_coerce_raises(): + data = np.array(["a", "b'"], dtype=object) + with pytest.raises(ValueError, match="coerce argument is not valid"): + lib.ensure_string_array(data, coerce="abcd") + + @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): if cls is ArrowStringArray and copy is False: From 153b6b4d2c6015c2dedcc1692a4b3d4c53408ff9 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 4 Jun 2021 19:34:44 -0700 Subject: [PATCH 16/28] Fix failures --- pandas/core/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 330902b402324..85a634042658e 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -727,7 +727,7 @@ def _try_cast( elif dtype.kind == "U": # TODO: test cases with arr.dtype.kind in ["m", "M"] - return lib.ensure_string_array(arr, convert_na_value=False, copy=copy) + return lib.ensure_string_array(arr, coerce="non-null", copy=copy) elif dtype.kind in ["m", "M"]: return maybe_cast_to_datetime(arr, dtype) From b27a839c3cae55ce9765b91cdc593684ede18a73 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 4 Jun 2021 19:55:13 -0700 Subject: [PATCH 17/28] maybe fix benchmarks? 
--- asv_bench/benchmarks/strings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 765697f1cc505..0ac6b7643fa9d 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -62,10 +62,10 @@ def time_cat_frame_construction(self, dtype): def peakmem_cat_frame_construction(self, dtype): DataFrame(self.frame_cat_arr, dtype=dtype) - def time_string_array_construction(self): + def time_string_array_construction(self, dtype): StringArray(self.series_arr) - def peakmem_stringarray_construction(self): + def peakmem_stringarray_construction(self, dtype): StringArray(self.series_arr) From ed5b9536d1ec30accfa23cc1726ec4fe2876fa1a Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 5 Jun 2021 12:43:42 -0700 Subject: [PATCH 18/28] Partially address code review --- doc/source/whatsnew/v1.3.0.rst | 3 +-- pandas/_libs/lib.pyx | 21 ++++++++++++++------- pandas/core/arrays/string_.py | 8 ++++---- pandas/tests/arrays/string_/test_string.py | 6 +++++- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 247548cf7c9f1..34ea5bf25d9ac 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -644,10 +644,9 @@ Other API changes - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects. - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`) - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". 
Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`) -- :class:`StringArray` now accepts nan-likes(``None``, ``nan``, ``NA``) in its constructor in addition to strings. +- :class:`StringArray` now accepts nan-likes(``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`) - Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`) - Build ===== diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index abb714d6b383a..d55cc37b09f87 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -688,9 +688,7 @@ cpdef ndarray[object] ensure_string_array( bint skipna=True, ): """ - Checks that all elements in numpy array are string or null - and returns a new numpy array with object dtype - and only strings and na values if so. Otherwise, raise a ValueError. + Returns a new numpy array with object dtype and only strings and na values. Parameters ---------- @@ -698,7 +696,7 @@ cpdef ndarray[object] ensure_string_array( The values to be converted to str, if needed. na_value : Any, default np.nan The value to use for na. For example, np.nan or pd.NA. - coerce : {{'all', 'null', 'non-null', None}}, default 'all' + coerce : {'all', 'null', 'non-null', None}, default 'all' Whether to coerce non-string elements to strings. - 'all' will convert null values and non-null non-string values. - 'strict-null' will only convert pd.NA, np.nan, or None to na_value @@ -717,10 +715,17 @@ cpdef ndarray[object] ensure_string_array( Returns ------- np.ndarray[object] - An array with the input array's elements casted to str or nan-like. + An array of strings and na_value. + + Raises + ------ + ValueError + If an element is encountered that is not a string or valid NA value + and element is not coerced. 
""" if coerce not in {"all", "strict-null", "null", "non-null", None}: - raise ValueError("coerce argument is not valid") + raise ValueError("coerce argument must be one of " + f"'all'|'strict-null'|'null'|'non-null'|None, not {coerce}") cdef: Py_ssize_t i = 0, n = len(arr) set strict_na_values = {C_NA, np.nan, None} @@ -753,7 +758,9 @@ cpdef ndarray[object] ensure_string_array( if coerce =="all" or coerce == "non-null": result[i] = str(val) else: - raise ValueError("Non-string element encountered in array.") + raise ValueError(f"Element {val} is not a string or valid null." + "If you want it to be coerced to a string," + "specify coerce='all'") else: if coerce=="all" or coerce == "null" or coerce == 'strict-null': val = na_value diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 4d97035714ba3..e68b63346ce90 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -145,16 +145,16 @@ class StringArray(PandasArray): Currently, this expects an object-dtype ndarray where the elements are Python strings - or nan-likes(``None``, ``nan``, ``NA``). + or nan-likes(``None``, ``np.nan``, ``NA``). This may change without warning in the future. Use :meth:`pandas.array` with ``dtype="string"`` for a stable way of creating a `StringArray` from any sequence. .. versionchanged:: 1.3 - StringArray now accepts nan-likes in the constructor in addition - to strings, whereas it only accepted strings and :attr:`pandas.NA` - before. + StringArray now accepts nan-likes(``None``, ``np.nan``) for the + ``values`` parameter in its constructor + in addition to strings and :attr:`pandas.NA` copy : bool, default False Whether to copy the array of data. 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 822c7f79e15dc..75e5203d34f55 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -306,7 +306,11 @@ def test_constructor_nan_like(na): def test_invalid_coerce_raises(): data = np.array(["a", "b'"], dtype=object) - with pytest.raises(ValueError, match="coerce argument is not valid"): + with pytest.raises( + ValueError, + match="coerce argument must be one of " + "'all'|'strict-null'|'null'|'non-null'|None, not abcd", + ): lib.ensure_string_array(data, coerce="abcd") From caa57050887a355daf53830d3899e866b98b17ce Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 5 Jun 2021 20:47:50 -0700 Subject: [PATCH 19/28] Test coerce=False --- pandas/tests/arrays/string_/test_string.py | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 75e5203d34f55..0112ef2b2f5af 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -15,6 +15,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import BaseMaskedArray from pandas.core.arrays.string_arrow import ( ArrowStringArray, ArrowStringDtype, @@ -314,6 +315,42 @@ def test_invalid_coerce_raises(): lib.ensure_string_array(data, coerce="abcd") +@pytest.mark.parametrize( + "values", + [ + np.array(["foo", "bar", pd.NA], dtype=object), + np.array(["foo", "bar", np.nan], dtype=object), + np.array(["foo", "bar", None], dtype=object), + BaseMaskedArray( + np.array(["foo", "bar", "garbage"]), np.array([False, False, True]) + ), + ], +) +def test_from_sequence_no_coerce(cls, values): + expected = pd.arrays.StringArray(np.array(["foo", "bar", pd.NA], dtype=object)) + result = cls._from_sequence(values, coerce=False) + # Use bare assert since classes are different + assert (result == 
expected).all() + + +@pytest.mark.parametrize( + "values", + [ + np.array(["foo", "bar", pd.NaT], dtype=object), + np.array(["foo", "bar", np.datetime64("nat")], dtype=object), + np.array(["foo", "bar", float("nan")], dtype=object), + ], +) +def test_from_sequence_no_coerce_invalid(cls, values): + with pytest.raises( + ValueError, + match="Element .* is not a string or valid null." + "If you want it to be coerced to a string," + "specify coerce='all'", + ): + cls._from_sequence(values, coerce=False) + + @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): if cls is ArrowStringArray and copy is False: From 2d75031a6ecb76aa9247c9925fe1ebe9de131eb1 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 7 Jun 2021 12:29:17 -0700 Subject: [PATCH 20/28] move benchmarks --- asv_bench/benchmarks/strings.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 0ac6b7643fa9d..8ebce086b9cf5 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -3,6 +3,7 @@ import numpy as np from pandas import ( + NA, Categorical, DataFrame, Series, @@ -26,7 +27,6 @@ def setup(self, dtype): class Construction: - params = ["str", "string"] param_names = ["dtype"] @@ -62,12 +62,6 @@ def time_cat_frame_construction(self, dtype): def peakmem_cat_frame_construction(self, dtype): DataFrame(self.frame_cat_arr, dtype=dtype) - def time_string_array_construction(self, dtype): - StringArray(self.series_arr) - - def peakmem_stringarray_construction(self, dtype): - StringArray(self.series_arr) - class Methods(Dtypes): def time_center(self, dtype): @@ -184,7 +178,6 @@ def time_isupper(self, dtype): class Repeat: - params = ["int", "array"] param_names = ["repeats"] @@ -199,7 +192,6 @@ def time_repeat(self, repeats): class Cat: - params = ([0, 3], [None, ","], [None, "-"], [0.0, 0.001, 0.15]) param_names = 
["other_cols", "sep", "na_rep", "na_frac"] @@ -224,7 +216,6 @@ def time_cat(self, other_cols, sep, na_rep, na_frac): class Contains(Dtypes): - params = (Dtypes.params, [True, False]) param_names = ["dtype", "regex"] @@ -236,7 +227,6 @@ def time_contains(self, dtype, regex): class Split(Dtypes): - params = (Dtypes.params, [True, False]) param_names = ["dtype", "expand"] @@ -252,7 +242,6 @@ def time_rsplit(self, dtype, expand): class Extract(Dtypes): - params = (Dtypes.params, [True, False]) param_names = ["dtype", "expand"] @@ -294,3 +283,18 @@ class Iter(Dtypes): def time_iter(self, dtype): for i in self.s: pass + + +class StringArrayConstruction: + def setup(self): + self.series_arr = tm.rands_array(nchars=10, size=10 ** 5) + self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)]) + + def time_string_array_construction(self): + StringArray(self.series_arr) + + def time_string_array_with_nan_construction(self): + StringArray(self.series_arr_nan) + + def peakmem_stringarray_construction(self): + StringArray(self.series_arr) From 52a00d1b01bbc639b4bf1fb323916ca79d4f9c0d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 7 Jun 2021 12:34:22 -0700 Subject: [PATCH 21/28] accidental formatting changes --- asv_bench/benchmarks/strings.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 8ebce086b9cf5..82278474ab337 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -27,6 +27,7 @@ def setup(self, dtype): class Construction: + params = ["str", "string"] param_names = ["dtype"] @@ -178,6 +179,7 @@ def time_isupper(self, dtype): class Repeat: + params = ["int", "array"] param_names = ["repeats"] @@ -192,6 +194,7 @@ def time_repeat(self, repeats): class Cat: + params = ([0, 3], [None, ","], [None, "-"], [0.0, 0.001, 0.15]) param_names = ["other_cols", "sep", "na_rep", "na_frac"] @@ -216,6 +219,7 @@ def time_cat(self, other_cols, sep, na_rep, 
na_frac): class Contains(Dtypes): + params = (Dtypes.params, [True, False]) param_names = ["dtype", "regex"] @@ -227,6 +231,7 @@ def time_contains(self, dtype, regex): class Split(Dtypes): + params = (Dtypes.params, [True, False]) param_names = ["dtype", "expand"] @@ -242,6 +247,7 @@ def time_rsplit(self, dtype, expand): class Extract(Dtypes): + params = (Dtypes.params, [True, False]) param_names = ["dtype", "expand"] From 8dc0b66fb3f0d54baef0d1468ffbc8fc5800fec0 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 7 Jun 2021 21:01:59 -0700 Subject: [PATCH 22/28] Fix --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 34ea5bf25d9ac..13e5a37b906c7 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -644,8 +644,8 @@ Other API changes - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects. - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`) - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`) -- :class:`StringArray` now accepts nan-likes(``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`) - Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`) +- :class:`StringArray` now accepts nan-likes(``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. 
(:issue:`40839`) Build ===== From 66be08780e37f6d26fa2f4817e20430bac8974be Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 8 Jun 2021 10:48:56 -0700 Subject: [PATCH 23/28] missing import from conflict --- pandas/tests/arrays/string_/test_string.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8bd46b89c2a3c..934c9d96da442 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -15,6 +15,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import BaseMaskedArray from pandas.core.arrays.string_arrow import ArrowStringArray From 3c5709438457cd5c45b8cde0282c113d55f1dfcc Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 21 Jul 2021 09:51:57 -0700 Subject: [PATCH 24/28] remove old whatsnew --- doc/source/whatsnew/v1.3.0.rst | 77 ++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 722fac61a9495..ed66861efad93 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_130: -What's new in 1.3.0 (??) ------------------------- +What's new in 1.3.0 (July 2, 2021) +---------------------------------- These are the changes in pandas 1.3.0. See :ref:`release` for a full changelog including other versions of pandas. 
@@ -124,7 +124,7 @@ which has been revised and improved (:issue:`39720`, :issue:`39317`, :issue:`404 - The methods :meth:`.Styler.highlight_null`, :meth:`.Styler.highlight_min`, and :meth:`.Styler.highlight_max` now allow custom CSS highlighting instead of the default background coloring (:issue:`40242`) - :meth:`.Styler.apply` now accepts functions that return an ``ndarray`` when ``axis=None``, making it now consistent with the ``axis=0`` and ``axis=1`` behavior (:issue:`39359`) - When incorrectly formatted CSS is given via :meth:`.Styler.apply` or :meth:`.Styler.applymap`, an error is now raised upon rendering (:issue:`39660`) - - :meth:`.Styler.format` now accepts the keyword argument ``escape`` for optional HTML and LaTex escaping (:issue:`40388`, :issue:`41619`) + - :meth:`.Styler.format` now accepts the keyword argument ``escape`` for optional HTML and LaTeX escaping (:issue:`40388`, :issue:`41619`) - :meth:`.Styler.background_gradient` has gained the argument ``gmap`` to supply a specific gradient map for shading (:issue:`22727`) - :meth:`.Styler.clear` now clears :attr:`Styler.hidden_index` and :attr:`Styler.hidden_columns` as well (:issue:`40484`) - Added the method :meth:`.Styler.highlight_between` (:issue:`39821`) @@ -136,7 +136,7 @@ which has been revised and improved (:issue:`39720`, :issue:`39317`, :issue:`404 - Many features of the :class:`.Styler` class are now either partially or fully usable on a DataFrame with a non-unique indexes or columns (:issue:`41143`) - One has greater control of the display through separate sparsification of the index or columns using the :ref:`new styler options `, which are also usable via :func:`option_context` (:issue:`41142`) - Added the option ``styler.render.max_elements`` to avoid browser overload when styling large DataFrames (:issue:`40712`) - - Added the method :meth:`.Styler.to_latex` (:issue:`21673`), which also allows some limited CSS conversion (:issue:`40731`) + - Added the method :meth:`.Styler.to_latex` 
(:issue:`21673`, :issue:`42320`), which also allows some limited CSS conversion (:issue:`40731`) - Added the method :meth:`.Styler.to_html` (:issue:`13379`) - Added the method :meth:`.Styler.set_sticky` to make index and column headers permanently visible in scrolling HTML frames (:issue:`29072`) @@ -252,7 +252,7 @@ Other enhancements - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) - Added support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`) - :func:`read_excel` can now auto-detect .xlsb files and older .xls files (:issue:`35416`, :issue:`41225`) -- :class:`ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`) +- :class:`ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behavior of append mode when writing to existing sheets (:issue:`40230`) - :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support `Numba `_ execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. 
``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) @@ -276,7 +276,9 @@ Other enhancements - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`) - Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`) +- :meth:`Series.between` can now accept ``left`` or ``right`` as arguments to ``inclusive`` to include only the left or right boundary (:issue:`40245`) - :meth:`DataFrame.explode` now supports exploding multiple columns. Its ``column`` argument now also accepts a list of str or tuples for exploding on multiple columns at the same time (:issue:`39240`) +- :meth:`DataFrame.sample` now accepts the ``ignore_index`` argument to reset the index after sampling, similar to :meth:`DataFrame.drop_duplicates` and :meth:`DataFrame.sort_values` (:issue:`38581`) .. --------------------------------------------------------------------------- @@ -305,7 +307,7 @@ As an example of this, given: original = pd.Series(cat) unique = original.unique() -*pandas < 1.3.0*: +*Previous behavior*: .. code-block:: ipython @@ -315,7 +317,7 @@ As an example of this, given: In [2]: original.dtype == unique.dtype False -*pandas >= 1.3.0* +*New behavior*: .. ipython:: python @@ -337,7 +339,7 @@ Preserve dtypes in :meth:`DataFrame.combine_first` df2 combined = df1.combine_first(df2) -*pandas 1.2.x* +*Previous behavior*: .. code-block:: ipython @@ -348,7 +350,7 @@ Preserve dtypes in :meth:`DataFrame.combine_first` C float64 dtype: object -*pandas 1.3.0* +*New behavior*: .. ipython:: python @@ -371,7 +373,7 @@ values as measured by ``np.allclose``. Now no such casting occurs. 
df = pd.DataFrame({'key': [1, 1], 'a': [True, False], 'b': [True, True]}) df -*pandas 1.2.x* +*Previous behavior*: .. code-block:: ipython @@ -381,7 +383,7 @@ values as measured by ``np.allclose``. Now no such casting occurs. key 1 True 2 -*pandas 1.3.0* +*New behavior*: .. ipython:: python @@ -399,7 +401,7 @@ Now, these methods will always return a float dtype. (:issue:`41137`) df = pd.DataFrame({'a': [True], 'b': [1], 'c': [1.0]}) -*pandas 1.2.x* +*Previous behavior*: .. code-block:: ipython @@ -408,7 +410,7 @@ Now, these methods will always return a float dtype. (:issue:`41137`) a b c 0 True 1 1.0 -*pandas 1.3.0* +*New behavior*: .. ipython:: python @@ -432,7 +434,7 @@ insert the values into the existing data rather than create an entirely new arra In both the new and old behavior, the data in ``values`` is overwritten, but in the old behavior the dtype of ``df["A"]`` changed to ``int64``. -*pandas 1.2.x* +*Previous behavior*: .. code-block:: ipython @@ -447,7 +449,7 @@ the old behavior the dtype of ``df["A"]`` changed to ``int64``. In pandas 1.3.0, ``df`` continues to share data with ``values`` -*pandas 1.3.0* +*New behavior*: .. ipython:: python @@ -474,7 +476,7 @@ never casting to the dtypes of the existing arrays. In the old behavior, ``5`` was cast to ``float64`` and inserted into the existing array backing ``df``: -*pandas 1.2.x* +*Previous behavior*: .. code-block:: ipython @@ -484,7 +486,7 @@ array backing ``df``: In the new behavior, we get a new array, and retain an integer-dtyped ``5``: -*pandas 1.3.0* +*New behavior*: .. ipython:: python @@ -507,7 +509,7 @@ casts to ``dtype=object`` (:issue:`38709`) ser2 = orig.copy() ser2.iloc[1] = 2.0 -*pandas 1.2.x* +*Previous behavior*: .. code-block:: ipython @@ -523,7 +525,7 @@ casts to ``dtype=object`` (:issue:`38709`) 1 2.0 dtype: object -*pandas 1.3.0* +*New behavior*: .. 
ipython:: python @@ -705,7 +707,8 @@ Other API changes - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as `turbodbc <https://turbodbc.readthedocs.io/en/latest/>`_ (:issue:`36893`) - Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`) - :meth:`ExtensionDtype.construct_array_type` is now a required method instead of an optional one for :class:`ExtensionDtype` subclasses (:issue:`24860`) -- :class:`StringArray` now accepts nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`) +- Calling ``hash`` on non-hashable pandas objects will now raise ``TypeError`` with the built-in error message (e.g. ``unhashable type: 'Series'``). Previously it would raise a custom message such as ``'Series' objects are mutable, thus they cannot be hashed``. Furthermore, ``isinstance(<Series>, collections.abc.Hashable)`` will now return ``False`` (:issue:`40013`) +- :meth:`.Styler.from_custom_template` now has two new arguments for template names, and removed the old ``name``, due to template inheritance having been introduced for better parsing (:issue:`42053`). Subclassing modifications to Styler attributes are also needed. .. _whatsnew_130.api_breaking.build: @@ -787,6 +790,8 @@ For example: 1 2 2 12 +*Future behavior*: + ..
code-block:: ipython In [5]: gb.prod(numeric_only=False) @@ -816,8 +821,8 @@ Other Deprecations - Deprecated :meth:`ExponentialMovingWindow.vol` (:issue:`39220`) - Using ``.astype`` to convert between ``datetime64[ns]`` dtype and :class:`DatetimeTZDtype` is deprecated and will raise in a future version, use ``obj.tz_localize`` or ``obj.dt.tz_localize`` instead (:issue:`38622`) - Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`) -- Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`) -- Deprecated :meth:`.Styler.where` in favour of using an alternative formulation with :meth:`Styler.applymap` (:issue:`40821`) +- Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favor of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`) +- Deprecated :meth:`.Styler.where` in favor of using an alternative formulation with :meth:`Styler.applymap` (:issue:`40821`) - Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`) - Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :meth:`read_csv` and :meth:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`) - Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`) @@ -839,6 +844,7 @@ Other Deprecations - 
Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`) - In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`) - Deprecated passing lists as ``key`` to :meth:`DataFrame.xs` and :meth:`Series.xs` (:issue:`41760`) +- Deprecated boolean arguments of ``inclusive`` in :meth:`Series.between` to have ``{"left", "right", "neither", "both"}`` as standard argument values (:issue:`40628`) - Deprecated passing arguments as positional for all of the following, with exceptions noted (:issue:`41485`): - :func:`concat` (other than ``objs``) @@ -885,7 +891,7 @@ Performance improvements - Performance improvement in :class:`.Styler` where render times are more than 50% reduced and now matches :meth:`DataFrame.to_html` (:issue:`39972` :issue:`39952`, :issue:`40425`) - The method :meth:`.Styler.set_td_classes` is now as performant as :meth:`.Styler.apply` and :meth:`.Styler.applymap`, and even more so in some cases (:issue:`40453`) - Performance improvement in :meth:`.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`) -- Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`) +- Performance improvement in :meth:`.GroupBy.apply` when requiring the Python fallback implementation (:issue:`40176`) - Performance improvement in the conversion of a PyArrow Boolean array to a pandas nullable Boolean array (:issue:`41051`) - Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`) - Performance 
improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`) @@ -925,6 +931,7 @@ Datetimelike - Bug in :meth:`Timedelta.round`, :meth:`Timedelta.floor`, :meth:`Timedelta.ceil` for values near the implementation bounds of :class:`Timedelta` (:issue:`38964`) - Bug in :func:`date_range` incorrectly creating :class:`DatetimeIndex` containing ``NaT`` instead of raising ``OutOfBoundsDatetime`` in corner cases (:issue:`24124`) - Bug in :func:`infer_freq` incorrectly fails to infer 'H' frequency of :class:`DatetimeIndex` if the latter has a timezone and crosses DST boundaries (:issue:`39556`) +- Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`) Timedelta ^^^^^^^^^ @@ -955,7 +962,8 @@ Numeric - Bug in :class:`Series` and :class:`DataFrame` reductions with methods ``any`` and ``all`` not returning Boolean results for object data (:issue:`12863`, :issue:`35450`, :issue:`27709`) - Bug in :meth:`Series.clip` would fail if the Series contains NA values and has nullable int or float as a data type (:issue:`40851`) - Bug in :meth:`UInt64Index.where` and :meth:`UInt64Index.putmask` with an ``np.int64`` dtype ``other`` incorrectly raising ``TypeError`` (:issue:`41974`) - +- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggregation functions when one or more aggregation function fails to produce results (:issue:`33634`) +- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`) Conversion ^^^^^^^^^^ @@ -971,6 +979,12 @@ Conversion - Bug in :class:`DataFrame` and :class:`Series` construction with ``datetime64[ns]`` data and ``dtype=object`` resulting in ``datetime`` objects instead of :class:`Timestamp` objects (:issue:`41599`) - Bug in :class:`DataFrame` and :class:`Series` construction with ``timedelta64[ns]`` data and ``dtype=object`` resulting 
in ``np.timedelta64`` objects instead of :class:`Timedelta` objects (:issue:`41599`) - Bug in :class:`DataFrame` construction when given a two-dimensional object-dtype ``np.ndarray`` of :class:`Period` or :class:`Interval` objects failing to cast to :class:`PeriodDtype` or :class:`IntervalDtype`, respectively (:issue:`41812`) +- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`) +- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`) +- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`) +- Bug in :func:`.infer_dtype` not recognizing Series, Index, or array with a Period dtype (:issue:`23553`) +- Bug in :func:`.infer_dtype` raising an error for general :class:`.ExtensionArray` objects. It will now return ``"unknown-array"`` instead of raising (:issue:`37367`) +- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised a ``ValueError`` when called on an empty DataFrame (:issue:`40393`) Strings ^^^^^^^ @@ -1032,6 +1046,8 @@ Indexing - Bug ``.loc.__getitem__`` with a :class:`UInt64Index` and negative-integer keys raising ``OverflowError`` instead of ``KeyError`` in some cases, wrapping around to positive integers in others (:issue:`41777`) - Bug in :meth:`Index.get_indexer` failing to raise ``ValueError`` in some cases with invalid ``method``, ``limit``, or ``tolerance`` arguments (:issue:`41918`) - Bug when slicing a :class:`Series` or :class:`DataFrame` with a :class:`TimedeltaIndex` when passing an invalid string raising ``ValueError`` instead of a ``TypeError`` (:issue:`41821`) +- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`) +- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. 
``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`) Missing ^^^^^^^ @@ -1201,24 +1217,13 @@ Styler Other ^^^^^ -- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`) -- Bug in :func:`.infer_dtype` not recognizing Series, Index, or array with a Period dtype (:issue:`23553`) -- Bug in :func:`.infer_dtype` raising an error for general :class:`.ExtensionArray` objects. It will now return ``"unknown-array"`` instead of raising (:issue:`37367`) -- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`) - ``inspect.getmembers(Series)`` no longer raises an ``AbstractMethodError`` (:issue:`38782`) - Bug in :meth:`Series.where` with numeric dtype and ``other=None`` not casting to ``nan`` (:issue:`39761`) -- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`) - Bug in :func:`.assert_series_equal`, :func:`.assert_frame_equal`, :func:`.assert_index_equal` and :func:`.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`) - Bug in :func:`.assert_index_equal` with ``exact=True`` not raising when comparing :class:`CategoricalIndex` instances with ``Int64Index`` and ``RangeIndex`` categories (:issue:`41263`) - Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, and :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`) - Bug in :func:`show_versions` where console JSON output was not proper JSON (:issue:`39701`) - pandas can now compile on z/OS when using `xlc `_ (:issue:`35826`) -- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised a ``ValueError`` when called on an empty DataFrame (:issue:`40393`) -- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggragation functions when 
one or more aggregation function fails to produce results (:issue:`33634`) -- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`) -- Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`) -- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`) -- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`) - Bug in :func:`pandas.util.hash_pandas_object` not recognizing ``hash_key``, ``encoding`` and ``categorize`` when the input object type is a :class:`DataFrame` (:issue:`41404`) .. --------------------------------------------------------------------------- @@ -1228,4 +1233,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v1.2.4..v1.3.0|HEAD +.. contributors:: v1.2.5..v1.3.0 From 12351de548d50013fb55d17c7ab97e8b70f1fcab Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 21 Jul 2021 09:53:00 -0700 Subject: [PATCH 25/28] move whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 8d96d49daba4f..21328461f9cf2 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -142,7 +142,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ - :meth:`Index.get_indexer_for` no longer accepts keyword arguments (other than 'target'); in the past these would be silently ignored if the index was not unique (:issue:`42310`) -- +- :class:`StringArray` now accepts nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`) .. 
--------------------------------------------------------------------------- From 358000f79fe8c1c456c1a7e1090e7aaaffa223b1 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 4 Oct 2021 13:31:41 -0700 Subject: [PATCH 26/28] typo --- pandas/_libs/lib.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c580ef0269d89..09f8398212168 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -758,11 +758,11 @@ cpdef ndarray[object] ensure_string_array( if not check_null(val): if coerce =="all" or coerce == "non-null": if not isinstance(val, np.floating): - # f"{val}" is faster than str(val) - result[i] = f"{val}" - else: - # f"{val}" is not always equivalent to str(val) for floats - result[i] = str(val) + # f"{val}" is faster than str(val) + result[i] = f"{val}" + else: + # f"{val}" is not always equivalent to str(val) for floats + result[i] = str(val) else: raise ValueError(f"Element {val} is not a string or valid null." 
"If you want it to be coerced to a string," From 20817a790c8fadeca336b9ad25d8db34bfea3efd Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 27 Dec 2021 09:21:45 -0800 Subject: [PATCH 27/28] address comments --- pandas/_libs/lib.pyx | 53 +++++++++++++++++++++++++++++------ pandas/core/arrays/string_.py | 2 +- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7974e732a5e0d..8939b2853d0ef 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -671,6 +671,14 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray: return result +ctypedef enum coerce_options: + all = 0 + strict_null = 1 + null = 2 + non_null = 3 + none = 4 + + @cython.wraparound(False) @cython.boundscheck(False) cpdef ndarray[object] ensure_string_array( @@ -689,11 +697,11 @@ cpdef ndarray[object] ensure_string_array( The values to be converted to str, if needed. na_value : Any, default np.nan The value to use for na. For example, np.nan or pd.NA. - coerce : {'all', 'null', 'non-null', None}, default 'all' + coerce : {'all', 'strict-null', 'null', 'non-null', None}, default 'all' Whether to coerce non-string elements to strings. - - 'all' will convert null values and non-null non-string values. + - 'all' will convert all non-string values. - 'strict-null' will only convert pd.NA, np.nan, or None to na_value - without converting other non-strings. + raising when encountering non-strings and other null values. - 'null' will convert nulls to na_value w/out converting other non-strings. - 'non-null' will only convert non-null non-string elements to string. - None will not convert anything. @@ -715,13 +723,40 @@ cpdef ndarray[object] ensure_string_array( ValueError If an element is encountered that is not a string or valid NA value and element is not coerced. 
+ + Examples + -------- + >>> import numpy as np + >>> import pandas as pd + >>> ensure_string_array(np.array([1,2,3, np.datetime64("nat")]), coerce="all") + array("1", "2", "3", np.nan) + >>> ensure_string_array(np.array([pd.NA, "a", None]), coerce="strict-null") + array(np.nan, "a", np.nan) + >>> ensure_string_array(np.array([pd.NaT, "1"]), coerce="null") + array(np.nan, "1") + >>> ensure_string_array(np.array([1,2,3]), coerce="non-null") + array("1", "2", "3") + >>> ensure_string_array(np.array(["1", "2", "3"]), coerce=None) + array("1", "2", "3") """ - if coerce not in {"all", "strict-null", "null", "non-null", None}: - raise ValueError("coerce argument must be one of " - f"'all'|'strict-null'|'null'|'non-null'|None, not {coerce}") cdef: Py_ssize_t i = 0, n = len(arr) set strict_na_values = {C_NA, np.nan, None} + coerce_options coerce_val + + if coerce == "all": + coerce_val = all + elif coerce == "strict-null": + coerce_val = strict_null + elif coerce == "null": + coerce_val = null + elif coerce == "non-null": + coerce_val = non_null + elif coerce is None: + coerce_val = none + else: + raise ValueError("coerce argument must be one of " + f"'all'|'strict-null'|'null'|'non-null'|None, not {coerce}") if hasattr(arr, "to_numpy"): @@ -741,7 +776,7 @@ cpdef ndarray[object] ensure_string_array( if copy and result is arr: result = result.copy() - if coerce == 'strict-null': + if coerce_val == strict_null: # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid # If they are present, they are treated like a regular Python object # and will either cause an exception to be raised or be coerced. 
@@ -756,7 +791,7 @@ cpdef ndarray[object] ensure_string_array( continue if not check_null(val): - if coerce =="all" or coerce == "non-null": + if coerce_val == all or coerce_val == non_null: if not isinstance(val, np.floating): # f"{val}" is faster than str(val) result[i] = f"{val}" @@ -768,7 +803,7 @@ cpdef ndarray[object] ensure_string_array( "If you want it to be coerced to a string," "specify coerce='all'") else: - if coerce=="all" or coerce == "null" or coerce == 'strict-null': + if coerce_val != non_null and coerce_val != none: val = na_value if skipna: result[i] = val diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c2b659d752ad2..8fe5343e471ae 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -252,7 +252,7 @@ class StringArray(BaseStringArray, PandasArray): :meth:`pandas.array` with ``dtype="string"`` for a stable way of creating a `StringArray` from any sequence. - .. versionchanged:: 1.3 + .. versionchanged:: 1.4.0 StringArray now accepts nan-likes(``None``, ``np.nan``) for the ``values`` parameter in its constructor From 33d8f9a6bb4d01b2a4c3c66cf40bdcd74e764888 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 27 Dec 2021 12:11:55 -0800 Subject: [PATCH 28/28] accept any float nan w/ util.is_nan --- pandas/_libs/lib.pyx | 17 +++++++++++------ pandas/tests/arrays/string_/test_string.py | 10 +++++++--- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8939b2853d0ef..e373c8a584913 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -679,6 +679,11 @@ ctypedef enum coerce_options: none = 4 +def strict_check_null(x): + # Cython doesn't let me define this in ensure_string_array :( + return x is None or x is C_NA or util.is_nan(x) + + @cython.wraparound(False) @cython.boundscheck(False) cpdef ndarray[object] ensure_string_array( @@ -729,15 +734,15 @@ cpdef ndarray[object] 
ensure_string_array( >>> import numpy as np >>> import pandas as pd >>> ensure_string_array(np.array([1,2,3, np.datetime64("nat")]), coerce="all") - array("1", "2", "3", np.nan) + array(['1', '2', '3', nan], dtype=object) >>> ensure_string_array(np.array([pd.NA, "a", None]), coerce="strict-null") - array(np.nan, "a", np.nan) + array([nan, 'a', nan], dtype=object) >>> ensure_string_array(np.array([pd.NaT, "1"]), coerce="null") - array(np.nan, "1") + array([nan, '1'], dtype=object) >>> ensure_string_array(np.array([1,2,3]), coerce="non-null") - array("1", "2", "3") + array(['1', '2', '3'], dtype=object) >>> ensure_string_array(np.array(["1", "2", "3"]), coerce=None) - array("1", "2", "3") + array(['1', '2', '3'], dtype=object) """ cdef: Py_ssize_t i = 0, n = len(arr) @@ -780,7 +785,7 @@ cpdef ndarray[object] ensure_string_array( # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid # If they are present, they are treated like a regular Python object # and will either cause an exception to be raised or be coerced. - check_null = strict_na_values.__contains__ + check_null = strict_check_null else: check_null = checknull diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 4a85daf653d53..10ff1c12d6fa8 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -2,6 +2,8 @@ This module tests the functionality of StringArray and ArrowStringArray. 
Tests for the str accessors are in pandas/tests/strings/test_string_array.py """ +from decimal import Decimal + import numpy as np import pytest @@ -272,7 +274,7 @@ def test_constructor_raises(cls): cls(np.array(["a", pd.NaT], dtype=object)) -@pytest.mark.parametrize("na", [np.nan, None, pd.NA]) +@pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA]) def test_constructor_nan_like(na): expected = pd.arrays.StringArray(np.array(["a", pd.NA])) tm.assert_extension_array_equal( @@ -281,7 +283,7 @@ def test_constructor_nan_like(na): def test_invalid_coerce_raises(): - data = np.array(["a", "b'"], dtype=object) + data = np.array(["a", "b"], dtype=object) with pytest.raises( ValueError, match="coerce argument must be one of " @@ -296,6 +298,8 @@ def test_invalid_coerce_raises(): np.array(["foo", "bar", pd.NA], dtype=object), np.array(["foo", "bar", np.nan], dtype=object), np.array(["foo", "bar", None], dtype=object), + np.array(["foo", "bar", float("nan")], dtype=object), + np.array(["foo", "bar", np.float64("nan")], dtype=object), BaseMaskedArray( np.array(["foo", "bar", "garbage"]), np.array([False, False, True]) ), @@ -313,7 +317,7 @@ def test_from_sequence_no_coerce(cls, values): [ np.array(["foo", "bar", pd.NaT], dtype=object), np.array(["foo", "bar", np.datetime64("nat")], dtype=object), - np.array(["foo", "bar", float("nan")], dtype=object), + np.array(["foo", "bar", Decimal("nan")], dtype=object), ], ) def test_from_sequence_no_coerce_invalid(cls, values):