From a4b0d7322e6fc493f74e699b99b3e5d734f2d912 Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Fri, 14 Oct 2022 20:13:18 -0400
Subject: [PATCH 1/4] perf: string array from np.str_

---
 asv_bench/benchmarks/array.py | 18 ++++++++++++++++++
 pandas/_libs/lib.pyx          |  4 ++++
 2 files changed, 22 insertions(+)

diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index df9d171a70397..a8d5777216242 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -44,6 +44,24 @@ def time_from_integer_array(self):
         pd.array(self.values_integer, dtype="Int64")
 
 
+class StringArray:
+    def setup(self):
+        N = 100_000
+        values = tm.rands_array(3, N)
+        self.values_obj = np.array(values, dtype="object")
+        self.values_str = np.array(values, dtype="U")
+        self.values_list = values.tolist()
+
+    def time_from_np_object_array(self):
+        pd.array(self.values_obj, dtype="string")
+
+    def time_from_np_str_array(self):
+        pd.array(self.values_str, dtype="string")
+
+    def time_from_list(self):
+        pd.array(self.values_list, dtype="string")
+
+
 class ArrowStringArray:
 
     params = [False, True]
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index d9a7195520fd7..2f1473e10a5a3 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -703,6 +703,10 @@ cpdef ndarray[object] ensure_string_array(
     if copy and result is arr:
         result = result.copy()
 
+    if util.is_array(arr) and issubclass(arr.dtype.type, np.str_):
+        # short-circuit, all elements are str
+        return result
+
     for i in range(n):
         val = arr[i]
 

From 172ce4497ff6b55c171867d67dabc2fc60c71861 Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Fri, 14 Oct 2022 20:43:50 -0400
Subject: [PATCH 2/4] add test

---
 pandas/tests/arrays/string_/test_string.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index a7b8162eb981a..2a8708403a811 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -611,3 +611,11 @@ def test_setitem_scalar_with_mask_validation(dtype):
     msg = "Scalar must be NA or str"
     with pytest.raises(ValueError, match=msg):
         ser[mask] = 1
+
+
+def test_from_numpy_str(dtype):
+    vals = ["a", "b", "c"]
+    arr = np.array(vals, dtype=np.str_)
+    result = pd.array(arr, dtype=dtype)
+    expected = pd.array(vals, dtype=dtype)
+    tm.assert_extension_array_equal(result, expected)

From 3617d97b6a978e9aed97431a7ff6319b95022169 Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Fri, 14 Oct 2022 20:51:32 -0400
Subject: [PATCH 3/4] whatsnew

---
 doc/source/whatsnew/v2.0.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index fea3d70d81554..c1530598616cb 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -154,6 +154,7 @@ Performance improvements
 - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
 - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
+- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
 - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
 - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
 - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).

From 514153640ab51017f05a4b7473b224bbec7ac7fd Mon Sep 17 00:00:00 2001
From: Luke Manley
Date: Mon, 17 Oct 2022 19:04:26 -0400
Subject: [PATCH 4/4] cleanup

---
 pandas/_libs/lib.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 2f1473e10a5a3..de226fcd19084 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -703,7 +703,7 @@ cpdef ndarray[object] ensure_string_array(
     if copy and result is arr:
         result = result.copy()
 
-    if util.is_array(arr) and issubclass(arr.dtype.type, np.str_):
+    if issubclass(arr.dtype.type, np.str_):
         # short-circuit, all elements are str
         return result
 