From c4baeb76c8854a4d8fd0bb23ccf35768d6c43253 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 15 Feb 2022 13:21:09 -0800 Subject: [PATCH 1/5] REF: implement ArrowStringArray --- pandas/core/arrays/_arrow_utils.py | 9 +++++++++ pandas/core/arrays/string_.py | 7 +++++-- pandas/core/arrays/string_arrow.py | 5 ++++- pandas/tests/extension/arrow/arrays.py | 4 ++-- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index 6214693f22975..667d635a1f3e6 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -3,6 +3,7 @@ import numpy as np import pyarrow +from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.interval import VALID_CLOSED @@ -139,3 +140,11 @@ def to_pandas_dtype(self): # register the type with a dummy instance _interval_type = ArrowIntervalType(pyarrow.int64(), "left") pyarrow.register_extension_type(_interval_type) + + +class ArrowExtensionArray(ExtensionArray): + """ + Base class for ExtensionArray backed by Arrow array. + """ + + pass diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ca4348e3bd06a..87fb8d215e16e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -43,7 +43,6 @@ IntegerArray, PandasArray, ) -from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.integer import IntegerDtype from pandas.core.construction import extract_array @@ -223,7 +222,11 @@ def __from_arrow__( return StringArray(np.array([], dtype="object")) -class BaseStringArray(ExtensionArray): +class BaseStringArray: + """ + Mixin class for StringArray, ArrowStringArray. + """ + pass diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3503b54dd478a..0737529f595db 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -48,6 +48,7 @@ from pandas.core.dtypes.missing import isna from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._arrow_utils import ArrowExtensionArray from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype @@ -94,7 +95,9 @@ def _chk_pyarrow_available() -> None: # fallback for the ones that pyarrow doesn't yet support -class ArrowStringArray(OpsMixin, BaseStringArray, ObjectStringArrayMixin): +class ArrowStringArray( + OpsMixin, ArrowExtensionArray, BaseStringArray, ObjectStringArrayMixin +): """ Extension array for string data in a ``pyarrow.ChunkedArray``. diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py index 3707447151ae3..d3e9c0588922c 100644 --- a/pandas/tests/extension/arrow/arrays.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -19,13 +19,13 @@ import pandas as pd from pandas.api.extensions import ( - ExtensionArray, ExtensionDtype, register_extension_dtype, take, ) from pandas.api.types import is_scalar from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._arrow_utils import ArrowExtensionArray as _ArrowExtensionArray from pandas.core.construction import extract_array @@ -73,7 +73,7 @@ def construct_array_type(cls) -> type_t[ArrowStringArray]: return ArrowStringArray -class ArrowExtensionArray(OpsMixin, ExtensionArray): +class ArrowExtensionArray(OpsMixin, _ArrowExtensionArray): _data: pa.ChunkedArray @classmethod From acd80ab3d4c363214a9345c08c51abd85ef736b9 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 21 Feb 2022 11:13:54 -0800 Subject: [PATCH 2/5] REF: share more ArrowEA methods --- pandas/core/arrays/_arrow_utils.py | 56 +++++++++++++++++++++++++- pandas/core/arrays/string_.py | 3 +- pandas/core/arrays/string_arrow.py | 45 --------------------- pandas/tests/extension/arrow/arrays.py | 7 ---- 4 files changed, 57 insertions(+), 54 deletions(-) diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index 667d635a1f3e6..ea285743e91f2 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -1,4 +1,7 @@ +from __future__ import annotations + import json +from typing import TypeVar import numpy as np import pyarrow @@ -142,9 +145,60 @@ def to_pandas_dtype(self): pyarrow.register_extension_type(_interval_type) +ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray") + + class ArrowExtensionArray(ExtensionArray): """ Base class for ExtensionArray backed by Arrow array. """ - pass + _data: pyarrow.ChunkedArray + + def __init__(self, values: pyarrow.ChunkedArray): + raise NotImplementedError + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow Array or ChunkedArray.""" + return self._data + + def __len__(self) -> int: + """ + Length of this array. + + Returns + ------- + length : int + """ + return len(self._data) + + def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: + """ + Return a shallow copy of the array. + + Underlying ChunkedArray is immutable, so a deep copy is unnecessary. + + Returns + ------- + type(self) + """ + return type(self)(self._data) + + @classmethod + def _concat_same_type( + cls: type[ArrowExtensionArrayT], to_concat + ) -> ArrowExtensionArrayT: + """ + Concatenate multiple ArrowExtensionArrays. + + Parameters + ---------- + to_concat : sequence of ArrowExtensionArrays + + Returns + ------- + ArrowExtensionArray + """ + chunks = [array for ea in to_concat for array in ea._data.iterchunks()] + arr = pyarrow.chunked_array(chunks) + return cls(arr) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 87fb8d215e16e..fce1942433cf7 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -39,6 +39,7 @@ from pandas.core import ops from pandas.core.array_algos import masked_reductions from pandas.core.arrays import ( + ExtensionArray, FloatingArray, IntegerArray, PandasArray, @@ -222,7 +223,7 @@ def __from_arrow__( return StringArray(np.array([], dtype="object")) -class BaseStringArray: +class BaseStringArray(ExtensionArray): """ Mixin class for StringArray, ArrowStringArray. """ diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 0737529f595db..00983371b0339 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -194,10 +194,6 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" return self.to_numpy(dtype=dtype) - def __arrow_array__(self, type=None): - """Convert myself to a pyarrow Array or ChunkedArray.""" - return self._data - def to_numpy( self, dtype: npt.DTypeLike | None = None, @@ -219,16 +215,6 @@ def to_numpy( result[mask] = na_value return result - def __len__(self) -> int: - """ - Length of this array. - - Returns - ------- - length : int - """ - return len(self._data) - @doc(ExtensionArray.factorize) def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: encoded = self._data.dictionary_encode() @@ -246,25 +232,6 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: return indices.values, uniques - @classmethod - def _concat_same_type(cls, to_concat) -> ArrowStringArray: - """ - Concatenate multiple ArrowStringArray. - - Parameters - ---------- - to_concat : sequence of ArrowStringArray - - Returns - ------- - ArrowStringArray - """ - return cls( - pa.chunked_array( - [array for ea in to_concat for array in ea._data.iterchunks()] - ) - ) - @overload def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT: ... @@ -361,18 +328,6 @@ def isna(self) -> np.ndarray: # TODO: Implement .to_numpy for ChunkedArray return self._data.is_null().to_pandas().values - def copy(self) -> ArrowStringArray: - """ - Return a shallow copy of the array. - - Underlying ChunkedArray is immutable, so a deep copy is unnecessary. - - Returns - ------- - ArrowStringArray - """ - return type(self)(self._data) - def _cmp_method(self, other, op): from pandas.arrays import BooleanArray diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py index d3e9c0588922c..d74e6fe3fd270 100644 --- a/pandas/tests/extension/arrow/arrays.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -8,7 +8,6 @@ """ from __future__ import annotations -import copy import itertools import operator @@ -111,9 +110,6 @@ def __getitem__(self, item): vals = self._data.to_pandas()[item] return type(self)._from_sequence(vals) - def __len__(self): - return len(self._data) - def astype(self, dtype, copy=True): # needed to fix this astype for the Series constructor. if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: @@ -165,9 +161,6 @@ def take(self, indices, allow_fill=False, fill_value=None): result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) return self._from_sequence(result, dtype=self.dtype) - def copy(self): - return type(self)(copy.copy(self._data)) - @classmethod def _concat_same_type(cls, to_concat): chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat)) From 7762f65b82644186e654a5096379e7785f42d1fa Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 21 Feb 2022 11:21:16 -0800 Subject: [PATCH 3/5] REF: share isna, nbytes --- pandas/core/arrays/_arrow_utils.py | 18 ++++++++++++++++++ pandas/core/arrays/string_arrow.py | 16 ---------------- pandas/tests/extension/arrow/arrays.py | 13 ------------- 3 files changed, 18 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index ea285743e91f2..cf9f84a028703 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -6,6 +6,8 @@ import numpy as np import pyarrow +from pandas._typing import npt + from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.interval import VALID_CLOSED @@ -162,6 +164,13 @@ def __arrow_array__(self, type=None): """Convert myself to a pyarrow Array or ChunkedArray.""" return self._data + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + """ + return self._data.nbytes + def __len__(self) -> int: """ Length of this array. @@ -172,6 +181,15 @@ def __len__(self) -> int: """ return len(self._data) + def isna(self) -> npt.NDArray[np.bool_]: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. + """ + # TODO: Implement .to_numpy for ChunkedArray + return self._data.is_null().to_pandas().values + def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: """ Return a shallow copy of the array. diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 00983371b0339..ba086b2541809 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -312,22 +312,6 @@ def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): else: return scalar - @property - def nbytes(self) -> int: - """ - The number of bytes needed to store this object in memory. - """ - return self._data.nbytes - - def isna(self) -> np.ndarray: - """ - Boolean NumPy array indicating if each value is missing. - - This should return a 1-D array the same length as 'self'. - """ - # TODO: Implement .to_numpy for ChunkedArray - return self._data.is_null().to_pandas().values - def _cmp_method(self, other, op): from pandas.arrays import BooleanArray diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py index d74e6fe3fd270..8e007bc1b6a74 100644 --- a/pandas/tests/extension/arrow/arrays.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -138,19 +138,6 @@ def __eq__(self, other): return self._logical_method(other, operator.eq) - @property - def nbytes(self) -> int: - return sum( - x.size - for chunk in self._data.chunks - for x in chunk.buffers() - if x is not None - ) - - def isna(self): - nas = pd.isna(self._data.to_pandas()) - return type(self)._from_sequence(nas) - def take(self, indices, allow_fill=False, fill_value=None): data = self._data.to_pandas() data = extract_array(data, extract_numpy=True) From fc78c58f996cd413f6f37c39bd6f76312909c6c2 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 23 Feb 2022 21:07:18 -0800 Subject: [PATCH 4/5] move class, fix tests --- pandas/core/arrays/_arrow_utils.py | 79 -------------------- pandas/core/arrays/_mixins.py | 89 +++++++++++++++++++++++ pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/extension/arrow/arrays.py | 2 +- pandas/tests/extension/arrow/test_bool.py | 9 --- 5 files changed, 91 insertions(+), 90 deletions(-) diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index cf9f84a028703..3c0614a9f69d6 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -1,14 +1,10 @@ from __future__ import annotations import json -from typing import TypeVar import numpy as np import pyarrow -from pandas._typing import npt - -from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.interval import VALID_CLOSED @@ -145,78 +141,3 @@ def to_pandas_dtype(self): # register the type with a dummy instance _interval_type = ArrowIntervalType(pyarrow.int64(), "left") pyarrow.register_extension_type(_interval_type) - - -ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray") - - -class ArrowExtensionArray(ExtensionArray): - """ - Base class for ExtensionArray backed by Arrow array. - """ - - _data: pyarrow.ChunkedArray - - def __init__(self, values: pyarrow.ChunkedArray): - raise NotImplementedError - - def __arrow_array__(self, type=None): - """Convert myself to a pyarrow Array or ChunkedArray.""" - return self._data - - @property - def nbytes(self) -> int: - """ - The number of bytes needed to store this object in memory. - """ - return self._data.nbytes - - def __len__(self) -> int: - """ - Length of this array. - - Returns - ------- - length : int - """ - return len(self._data) - - def isna(self) -> npt.NDArray[np.bool_]: - """ - Boolean NumPy array indicating if each value is missing. - - This should return a 1-D array the same length as 'self'. - """ - # TODO: Implement .to_numpy for ChunkedArray - return self._data.is_null().to_pandas().values - - def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - """ - Return a shallow copy of the array. - - Underlying ChunkedArray is immutable, so a deep copy is unnecessary. - - Returns - ------- - type(self) - """ - return type(self)(self._data) - - @classmethod - def _concat_same_type( - cls: type[ArrowExtensionArrayT], to_concat - ) -> ArrowExtensionArrayT: - """ - Concatenate multiple ArrowExtensionArrays. - - Parameters - ---------- - to_concat : sequence of ArrowExtensionArrays - - Returns - ------- - ArrowExtensionArray - """ - chunks = [array for ea in to_concat for array in ea._data.iterchunks()] - arr = pyarrow.chunked_array(chunks) - return cls(arr) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 3446d5fc43a65..f1eed4b00d579 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -28,6 +28,7 @@ npt, type_t, ) +from pandas.compat import pa_version_under2p0 from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from pandas.util._validators import ( @@ -66,6 +67,8 @@ if TYPE_CHECKING: + import pyarrow as pa + from pandas._typing import ( NumpySorter, NumpyValueArrayLike, @@ -494,3 +497,89 @@ def _empty( arr = cls._from_sequence([], dtype=dtype) backing = np.empty(shape, dtype=arr._ndarray.dtype) return arr._from_backing_data(backing) + + +ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray") + + +class ArrowExtensionArray(ExtensionArray): + """ + Base class for ExtensionArray backed by Arrow array. + """ + + _data: pa.ChunkedArray + + def __init__(self, values: pa.ChunkedArray): + raise NotImplementedError + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow Array or ChunkedArray.""" + return self._data + + def equals(self, other) -> bool: + if not isinstance(other, ArrowExtensionArray): + return False + # I'm told that pyarrow makes __eq__ behave like pandas' equals; + # TODO: is this documented somewhere? + return self._data == other._data + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + """ + return self._data.nbytes + + def __len__(self) -> int: + """ + Length of this array. + + Returns + ------- + length : int + """ + return len(self._data) + + def isna(self) -> npt.NDArray[np.bool_]: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. + """ + if pa_version_under2p0: + return self._data.is_null().to_pandas().values + else: + return self._data.is_null().to_numpy() + + def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: + """ + Return a shallow copy of the array. + + Underlying ChunkedArray is immutable, so a deep copy is unnecessary. + + Returns + ------- + type(self) + """ + return type(self)(self._data) + + @classmethod + def _concat_same_type( + cls: type[ArrowExtensionArrayT], to_concat + ) -> ArrowExtensionArrayT: + """ + Concatenate multiple ArrowExtensionArrays. + + Parameters + ---------- + to_concat : sequence of ArrowExtensionArrays + + Returns + ------- + ArrowExtensionArray + """ + import pyarrow as pa + + chunks = [array for ea in to_concat for array in ea._data.iterchunks()] + arr = pa.chunked_array(chunks) + return cls(arr) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ba086b2541809..63536e27c9e2a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -48,7 +48,7 @@ from pandas.core.dtypes.missing import isna from pandas.core.arraylike import OpsMixin -from pandas.core.arrays._arrow_utils import ArrowExtensionArray +from pandas.core.arrays._mixins import ArrowExtensionArray from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py index 8e007bc1b6a74..1ab3d49392052 100644 --- a/pandas/tests/extension/arrow/arrays.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -24,7 +24,7 @@ ) from pandas.api.types import is_scalar from pandas.core.arraylike import OpsMixin -from pandas.core.arrays._arrow_utils import ArrowExtensionArray as _ArrowExtensionArray +from pandas.core.arrays._mixins import ArrowExtensionArray as _ArrowExtensionArray from pandas.core.construction import extract_array diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index ddd10dfcb2d60..bdfbbef937019 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -62,11 +62,6 @@ def test_contains(self, data, data_missing): class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): - # seems like some bug in isna on empty BoolArray returning floats. - @pytest.mark.xfail(reason="bad is-na for empty data") - def test_from_sequence_from_cls(self, data): - super().test_from_sequence_from_cls(data) - @pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899") def test_series_constructor_no_data_with_index(self, dtype, na_value): # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays @@ -77,10 +72,6 @@ def test_series_constructor_scalar_na_with_index(self, dtype, na_value): # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays super().test_series_constructor_scalar_na_with_index(dtype, na_value) - @pytest.mark.xfail(reason="ufunc 'invert' not supported for the input types") - def test_construct_empty_dataframe(self, dtype): - super().test_construct_empty_dataframe(dtype) - @pytest.mark.xfail(reason="_from_sequence ignores dtype keyword") def test_empty(self, dtype): super().test_empty(dtype) From b9daa4d9929cf6b2052d583c28575dfb8de1625e Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 25 Feb 2022 11:48:03 -0800 Subject: [PATCH 5/5] set _data in __init__ --- pandas/core/arrays/_mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index f1eed4b00d579..d57d44d4d97fb 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -510,7 +510,7 @@ class ArrowExtensionArray(ExtensionArray): _data: pa.ChunkedArray def __init__(self, values: pa.ChunkedArray): - raise NotImplementedError + self._data = values def __arrow_array__(self, type=None): """Convert myself to a pyarrow Array or ChunkedArray."""