From ba5fce1a97b76974d7fe54205ceeb6e83a2566e2 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 2 Jan 2022 07:27:35 -0800
Subject: [PATCH 01/12] API: Allow other na values in StringArray Constructor
---
asv_bench/benchmarks/strings.py | 17 +++++++++++++++
doc/source/whatsnew/v1.4.0.rst | 2 +-
pandas/_libs/lib.pyx | 25 ++++++++++++++++++----
pandas/core/arrays/string_.py | 6 ++++--
pandas/tests/arrays/string_/test_string.py | 13 +++++++----
5 files changed, 52 insertions(+), 11 deletions(-)
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 32fbf4e6c7de3..85487f5d531a3 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -3,10 +3,12 @@
import numpy as np
from pandas import (
+ NA,
Categorical,
DataFrame,
Series,
)
+from pandas.arrays import StringArray
from .pandas_vb_common import tm
@@ -285,3 +287,18 @@ class Iter(Dtypes):
def time_iter(self, dtype):
for i in self.s:
pass
+
+
+class StringArrayConstruction:
+ def setup(self):
+ self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
+ self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)])
+
+ def time_string_array_construction(self):
+ StringArray(self.series_arr)
+
+ def time_string_array_with_nan_construction(self):
+ StringArray(self.series_arr_nan)
+
+ def peakmem_stringarray_construction(self):
+ StringArray(self.series_arr)
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 4c3e53ddcfa26..30933f4d7381f 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -225,7 +225,7 @@ Other enhancements
- :class:`ExtensionDtype` and :class:`ExtensionArray` are now (de)serialized when exporting a :class:`DataFrame` with :meth:`DataFrame.to_json` using ``orient='table'`` (:issue:`20612`, :issue:`44705`).
- Add support for `Zstandard <https://facebook.github.io/zstd/>`_ compression to :meth:`DataFrame.to_pickle`/:meth:`read_pickle` and friends (:issue:`43925`)
- :meth:`DataFrame.to_sql` now returns an ``int`` of the number of written rows (:issue:`23998`)
-
+- :class:`StringArray` now accepts nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)
.. ---------------------------------------------------------------------------
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 950277ce608eb..f6c55f76ddd20 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -669,6 +669,27 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
return result
+# TODO: stop using this helper in StringArray and have it
+# go through ensure_string_array instead
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def convert_nans_to_NA(ndarray[object] arr) -> ndarray:
+ """
+ Helper for StringArray that converts null values that
+ are not pd.NA (e.g. np.nan, None) to pd.NA. Assumes every
+ non-string element has already been validated as null.
+ """
+ cdef:
+ Py_ssize_t i, n = len(arr)
+ object val
+ ndarray[object] result
+ result = np.asarray(arr, dtype="object")
+ for i in range(n):
+ val = arr[i]
+ if not isinstance(val, str):
+ result[i] = C_NA
+ return result
+
@cython.wraparound(False)
@cython.boundscheck(False)
@@ -1880,10 +1901,6 @@ cdef class StringValidator(Validator):
cdef inline bint is_array_typed(self) except -1:
return issubclass(self.dtype.type, np.str_)
- cdef bint is_valid_null(self, object value) except -1:
- # We deliberately exclude None / NaN here since StringArray uses NA
- return value is C_NA
-
cpdef bint is_string_array(ndarray values, bint skipna=False):
cdef:
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 919b882f22ecb..46a41ae813ca3 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -310,11 +310,11 @@ def __init__(self, values, copy=False):
values = extract_array(values)
super().__init__(values, copy=copy)
+ if not isinstance(values, type(self)):
+ self._validate()
# error: Incompatible types in assignment (expression has type "StringDtype",
# variable has type "PandasDtype")
NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
- if not isinstance(values, type(self)):
- self._validate()
def _validate(self):
"""Validate that we only store NA or strings."""
@@ -325,6 +325,8 @@ def _validate(self):
"StringArray requires a sequence of strings or pandas.NA. Got "
f"'{self._ndarray.dtype}' dtype instead."
)
+ # Convert null values that are not pd.NA (e.g. None, np.nan) to pd.NA
+ lib.convert_nans_to_NA(self._ndarray)
@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 22fe7bb0de949..0919d57f9e612 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -267,15 +267,20 @@ def test_constructor_raises(cls):
cls(np.array([]))
with pytest.raises(ValueError, match=msg):
- cls(np.array(["a", np.nan], dtype=object))
-
- with pytest.raises(ValueError, match=msg):
- cls(np.array(["a", None], dtype=object))
+ cls(np.array(["a", np.datetime64("nat")], dtype=object))
with pytest.raises(ValueError, match=msg):
cls(np.array(["a", pd.NaT], dtype=object))
+@pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA])
+def test_constructor_nan_like(na):
+ expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
+ tm.assert_extension_array_equal(
+ pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
+ )
+
+
@pytest.mark.parametrize("copy", [True, False])
def test_from_sequence_no_mutate(copy, cls, request):
if cls is ArrowStringArray and copy is False:
From 8011f8d0319b78764daca2473aacbef304df34fb Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 2 Jan 2022 18:24:22 -0800
Subject: [PATCH 02/12] fix compile warning?
---
pandas/_libs/lib.pyx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index f6c55f76ddd20..70f3631a8aadf 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -687,7 +687,7 @@ def convert_nans_to_NA(ndarray[object] arr) -> ndarray:
for i in range(n):
val = arr[i]
if not isinstance(val, str):
- result[i] = C_NA
+ result[i] =