diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index df9d171a70397..301fc9d405057 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -71,3 +71,6 @@ def time_setitem_list(self, multiple_chunks): def time_setitem_slice(self, multiple_chunks): self.array[::10] = "foo" + + def time_tolist(self, multiple_chunks): + self.array.tolist() diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index f6630857bee91..7a25f77da1c82 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -4,6 +4,7 @@ from pandas import ( DataFrame, + Index, MultiIndex, Series, array, @@ -92,6 +93,39 @@ def time_f_ordered(self, axis, ignore_index): concat(self.frame_f, axis=axis, ignore_index=ignore_index) +class ConcatIndexDtype: + + params = ( + ["datetime64[ns]", "int64", "Int64", "string[python]", "string[pyarrow]"], + [0, 1], + [True, False], + [True, False], + ) + param_names = ["dtype", "axis", "sort", "is_monotonic"] + + def setup(self, dtype, axis, sort, is_monotonic): + N = 10_000 + if dtype == "datetime64[ns]": + vals = date_range("1970-01-01", periods=N) + elif dtype in ("int64", "Int64"): + vals = np.arange(N, dtype=np.int64) + elif dtype in ("string[python]", "string[pyarrow]"): + vals = tm.makeStringIndex(N) + else: + raise NotImplementedError + + idx = Index(vals, dtype=dtype) + if is_monotonic: + idx = idx.sort_values() + else: + idx = idx[::-1] + + self.series = [Series(i, idx[i:]) for i in range(5)] + + def time_concat_series(self, dtype, axis, sort, is_monotonic): + concat(self.series, axis=axis, sort=sort) + + class Join: params = [True, False] diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index fea3d70d81554..a5aa186d33f4c 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -154,6 +154,7 @@ Performance improvements - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) +- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`) - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`) - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`). diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 15e201b8279de..746175ee3374d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -427,6 +427,15 @@ def to_numpy( data = self._data.astype(dtype, copy=copy) return data + @doc(ExtensionArray.tolist) + def tolist(self): + if self.ndim > 1: + return [x.tolist() for x in self] + if not self._hasna: + # faster than list(self) + return list(self._data) + return list(self) + @overload def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ... diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2f9857eb43860..451399255e7f9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -20,6 +20,7 @@ ) from pandas.compat import pa_version_under1p01 from pandas.compat.numpy import function as nv +from pandas.util._decorators import doc from pandas.core.dtypes.base import ( ExtensionDtype, @@ -214,7 +215,11 @@ class BaseStringArray(ExtensionArray): Mixin class for StringArray, ArrowStringArray. """ - pass + @doc(ExtensionArray.tolist) + def tolist(self): + if self.ndim > 1: + return [x.tolist() for x in self] + return list(self.to_numpy()) class StringArray(BaseStringArray, PandasArray): diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index 9a86ef835e5ef..4c7bd6e293ef4 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -49,3 +49,9 @@ def test_round(data, numpy_dtype): dtype=data.dtype, ) tm.assert_extension_array_equal(result, expected) + + +def test_tolist(data): + result = data.tolist() + expected = list(data) + tm.assert_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a7b8162eb981a..5d0eb81114a23 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -611,3 +611,11 @@ def test_setitem_scalar_with_mask_validation(dtype): msg = "Scalar must be NA or str" with pytest.raises(ValueError, match=msg): ser[mask] = 1 + + +def test_tolist(dtype): + vals = ["a", "b", "c"] + arr = pd.array(vals, dtype=dtype) + result = arr.tolist() + expected = vals + tm.assert_equal(result, expected)