From d57ef73a1f19d47b48a88e750b9ca110347a22b1 Mon Sep 17 00:00:00 2001
From: Brock
Date: Wed, 25 Jan 2023 19:05:15 -0800
Subject: [PATCH 1/7] REF: ArrowEA _data->_pa_array

---
 pandas/_testing/__init__.py           |   8 +-
 pandas/core/arrays/arrow/array.py     | 156 +++++++++++++-------
 pandas/core/arrays/string_arrow.py    |  50 ++++-----
 pandas/tests/extension/test_arrow.py  |  29 ++---
 pandas/tests/extension/test_string.py |   2 +-
 pandas/tests/io/excel/test_readers.py |   2 +-
 6 files changed, 125 insertions(+), 122 deletions(-)

diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
index eb25566e7983e..b329c9e817c23 100644
--- a/pandas/_testing/__init__.py
+++ b/pandas/_testing/__init__.py
@@ -1006,10 +1006,10 @@ def shares_memory(left, right) -> bool:
     if isinstance(left, ExtensionArray) and left.dtype == "string[pyarrow]":
         # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
         if isinstance(right, ExtensionArray) and right.dtype == "string[pyarrow]":
-            # error: "ExtensionArray" has no attribute "_data"
-            left_pa_data = left._data  # type: ignore[attr-defined]
-            # error: "ExtensionArray" has no attribute "_data"
-            right_pa_data = right._data  # type: ignore[attr-defined]
+            # error: "ExtensionArray" has no attribute "_pa_array"
+            left_pa_data = left._pa_array  # type: ignore[attr-defined]
+            # error: "ExtensionArray" has no attribute "_pa_array"
+            right_pa_data = right._pa_array  # type: ignore[attr-defined]
             left_buf1 = left_pa_data.chunk(0).buffers()[1]
             right_buf1 = right_pa_data.chunk(0).buffers()[1]
             return left_buf1 == right_buf1
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 0e70b3795bc85..ab77334f7dd46 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -200,14 +200,14 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
             msg = "pyarrow>=6.0.0 is required for PyArrow backed ArrowExtensionArray."
             raise ImportError(msg)
         if isinstance(values, pa.Array):
-            self._data = pa.chunked_array([values])
+            self._pa_array = pa.chunked_array([values])
         elif isinstance(values, pa.ChunkedArray):
-            self._data = values
+            self._pa_array = values
         else:
             raise ValueError(
                 f"Unsupported type '{type(values)}' for ArrowExtensionArray"
             )
-        self._dtype = ArrowDtype(self._data.type)
+        self._dtype = ArrowDtype(self._pa_array.type)

     @classmethod
     def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
@@ -216,7 +216,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
         """
         pa_dtype = to_pyarrow_type(dtype)
         if isinstance(scalars, cls):
-            scalars = scalars._data
+            scalars = scalars._pa_array
         elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)):
             try:
                 scalars = pa.array(scalars, type=pa_dtype, from_pandas=True)
@@ -315,7 +315,7 @@ def __getitem__(self, item: PositionalIndexer):
         elif is_integer_dtype(item.dtype):
             return self.take(item)
         elif is_bool_dtype(item.dtype):
-            return type(self)(self._data.filter(item))
+            return type(self)(self._pa_array.filter(item))
         else:
             raise IndexError(
                 "Only integers, slices and integer or "
@@ -340,7 +340,7 @@ def __getitem__(self, item: PositionalIndexer):
             )
         # We are not an array indexer, so maybe e.g. a slice or integer
         # indexer. We dispatch to pyarrow.
-        value = self._data[item]
+        value = self._pa_array[item]
         if isinstance(value, pa.ChunkedArray):
             return type(self)(value)
         else:
@@ -355,7 +355,7 @@ def __iter__(self) -> Iterator[Any]:
         """
         Iterate over elements of the array.
""" na_value = self._dtype.na_value - for value in self._data: + for value in self._pa_array: val = value.as_py() if val is None: yield na_value @@ -364,29 +364,29 @@ def __iter__(self) -> Iterator[Any]: def __arrow_array__(self, type=None): """Convert myself to a pyarrow ChunkedArray.""" - return self._data + return self._pa_array def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" return self.to_numpy(dtype=dtype) def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - return type(self)(pc.invert(self._data)) + return type(self)(pc.invert(self._pa_array)) def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - return type(self)(pc.negate_checked(self._data)) + return type(self)(pc.negate_checked(self._pa_array)) def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - return type(self)(self._data) + return type(self)(self._pa_array) def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - return type(self)(pc.abs_checked(self._data)) + return type(self)(pc.abs_checked(self._pa_array)) # GH 42600: __getstate__/__setstate__ not necessary once # https://issues.apache.org/jira/browse/ARROW-10739 is addressed def __getstate__(self): state = self.__dict__.copy() - state["_data"] = self._data.combine_chunks() + state["_data"] = self._pa_array.combine_chunks() return state def __setstate__(self, state) -> None: @@ -398,12 +398,12 @@ def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] if isinstance(other, ArrowExtensionArray): - result = pc_func(self._data, other._data) + result = pc_func(self._pa_array, other._pa_array) elif isinstance(other, (np.ndarray, list)): - result = pc_func(self._data, other) + result = pc_func(self._pa_array, other) elif is_scalar(other): try: - result = pc_func(self._data, pa.scalar(other)) + result = pc_func(self._pa_array, pa.scalar(other)) except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): mask = isna(self) | isna(other) valid = ~mask @@ -429,11 +429,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs): if pc_func is NotImplemented: raise NotImplementedError(f"{op.__name__} not implemented.") if isinstance(other, ArrowExtensionArray): - result = pc_func(self._data, other._data) + result = pc_func(self._pa_array, other._pa_array) elif isinstance(other, (np.ndarray, list)): - result = pc_func(self._data, pa.array(other, from_pandas=True)) + result = pc_func(self._pa_array, pa.array(other, from_pandas=True)) elif is_scalar(other): - result = pc_func(self._data, pa.scalar(other)) + result = pc_func(self._pa_array, pa.scalar(other)) else: raise NotImplementedError( f"{op.__name__} not implemented for {type(other)}" @@ -451,7 +451,7 @@ def equals(self, other) -> bool: return False # I'm told that pyarrow makes __eq__ behave like pandas' equals; # TODO: is this documented somewhere? - return self._data == other._data + return self._pa_array == other._pa_array @property def dtype(self) -> ArrowDtype: @@ -465,7 +465,7 @@ def nbytes(self) -> int: """ The number of bytes needed to store this object in memory. 
""" - return self._data.nbytes + return self._pa_array.nbytes def __len__(self) -> int: """ @@ -475,11 +475,11 @@ def __len__(self) -> int: ------- length : int """ - return len(self._data) + return len(self._pa_array) @property def _hasna(self) -> bool: - return self._data.null_count > 0 + return self._pa_array.null_count > 0 def isna(self) -> npt.NDArray[np.bool_]: """ @@ -487,7 +487,7 @@ def isna(self) -> npt.NDArray[np.bool_]: This should return a 1-D array the same length as 'self'. """ - return self._data.is_null().to_numpy() + return self._pa_array.is_null().to_numpy() def argsort( self, @@ -509,13 +509,13 @@ def argsort( ) result = pc.array_sort_indices( - self._data, order=order, null_placement=null_placement + self._pa_array, order=order, null_placement=null_placement ) np_result = result.to_numpy() return np_result.astype(np.intp, copy=False) def _argmin_max(self, skipna: bool, method: str) -> int: - if self._data.length() in (0, self._data.null_count) or ( + if self._pa_array.length() in (0, self._pa_array.null_count) or ( self._hasna and not skipna ): # For empty or all null, pyarrow returns -1 but pandas expects TypeError @@ -528,7 +528,7 @@ def _argmin_max(self, skipna: bool, method: str) -> int: f"arg{method} only implemented for pyarrow version >= 6.0" ) - data = self._data + data = self._pa_array if pa.types.is_duration(data.type): data = data.cast(pa.int64()) @@ -551,7 +551,7 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: ------- type(self) """ - return type(self)(self._data) + return type(self)(self._pa_array) def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: """ @@ -565,7 +565,7 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: fallback_performancewarning(version="6") return super().dropna() else: - return type(self)(pc.drop_null(self._data)) + return type(self)(pc.drop_null(self._pa_array)) @doc(ExtensionArray.fillna) def fillna( @@ -609,15 +609,15 @@ def convert_fill_value(value, pa_type, dtype): raise TypeError(msg) from err return value - fill_value = convert_fill_value(value, self._data.type, self.dtype) + fill_value = convert_fill_value(value, self._pa_array.type, self.dtype) try: if method is None: - return type(self)(pc.fill_null(self._data, fill_value=fill_value)) + return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value)) elif method == "pad": - return type(self)(pc.fill_null_forward(self._data)) + return type(self)(pc.fill_null_forward(self._pa_array)) elif method == "backfill": - return type(self)(pc.fill_null_backward(self._data)) + return type(self)(pc.fill_null_backward(self._pa_array)) except pa.ArrowNotImplementedError: # ArrowNotImplementedError: Function 'coalesce' has no kernel # matching input types (duration[ns], duration[ns]) @@ -632,7 +632,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]: if not len(values): return np.zeros(len(self), dtype=bool) - result = pc.is_in(self._data, value_set=pa.array(values, from_pandas=True)) + result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True)) # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls # to False return np.array(result, dtype=np.bool_) @@ -651,7 +651,7 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: The values returned by this method are also used in :func:`pandas.util.hash_pandas_object`. 
""" - values = self._data.to_numpy() + values = self._pa_array.to_numpy() return values, self.dtype.na_value @doc(ExtensionArray.factorize) @@ -661,12 +661,12 @@ def factorize( ) -> tuple[np.ndarray, ExtensionArray]: null_encoding = "mask" if use_na_sentinel else "encode" - pa_type = self._data.type + pa_type = self._pa_array.type if pa.types.is_duration(pa_type): # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = self._data.cast(pa.int64()) + data = self._pa_array.cast(pa.int64()) else: - data = self._data + data = self._pa_array encoded = data.dictionary_encode(null_encoding=null_encoding) if encoded.length() == 0: @@ -715,7 +715,7 @@ def round( DataFrame.round : Round values of a DataFrame. Series.round : Round values of a Series. """ - return type(self)(pc.round(self._data, ndigits=decimals)) + return type(self)(pc.round(self._pa_array, ndigits=decimals)) @doc(ExtensionArray.searchsorted) def searchsorted( @@ -801,18 +801,18 @@ def take( # "Sequence[int]", variable has type "ndarray") indices_array = indices # type: ignore[assignment] - if len(self._data) == 0 and (indices_array >= 0).any(): + if len(self._pa_array) == 0 and (indices_array >= 0).any(): raise IndexError("cannot do a non-empty take") - if indices_array.size > 0 and indices_array.max() >= len(self._data): + if indices_array.size > 0 and indices_array.max() >= len(self._pa_array): raise IndexError("out of bounds value in 'indices'.") if allow_fill: fill_mask = indices_array < 0 if fill_mask.any(): - validate_indices(indices_array, len(self._data)) + validate_indices(indices_array, len(self._pa_array)) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=fill_mask) - result = self._data.take(indices_array) + result = self._pa_array.take(indices_array) if isna(fill_value): return type(self)(result) # TODO: ArrowNotImplementedError: Function fill_null has no @@ -823,14 +823,14 @@ def take( # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: # Nothing to fill - return type(self)(self._data.take(indices)) + return type(self)(self._pa_array.take(indices)) else: # allow_fill=False # TODO(ARROW-9432): Treat negative indices as indices from the right. 
             if (indices_array < 0).any():
                 # Don't modify in-place
                 indices_array = np.copy(indices_array)
-                indices_array[indices_array < 0] += len(self._data)
-            return type(self)(self._data.take(indices_array))
+                indices_array[indices_array < 0] += len(self._pa_array)
+            return type(self)(self._pa_array.take(indices_array))

     @doc(ExtensionArray.to_numpy)
     def to_numpy(
         self,
@@ -844,7 +844,7 @@ def to_numpy(
         if na_value is lib.no_default:
             na_value = self.dtype.na_value

-        pa_type = self._data.type
+        pa_type = self._pa_array.type
         if (
             is_object_dtype(dtype)
             or pa.types.is_timestamp(pa_type)
@@ -852,7 +852,7 @@ def to_numpy(
         ):
             result = np.array(list(self), dtype=dtype)
         else:
-            result = np.asarray(self._data, dtype=dtype)
+            result = np.asarray(self._pa_array, dtype=dtype)
             if copy or self._hasna:
                 result = result.copy()
             if self._hasna:
@@ -867,13 +867,13 @@ def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         -------
         ArrowExtensionArray
         """
-        pa_type = self._data.type
+        pa_type = self._pa_array.type

         if pa.types.is_duration(pa_type):
             # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
-            data = self._data.cast(pa.int64())
+            data = self._pa_array.cast(pa.int64())
         else:
-            data = self._data
+            data = self._pa_array

         pa_result = pc.unique(data)

@@ -899,12 +899,12 @@ def value_counts(self, dropna: bool = True) -> Series:
         --------
         Series.value_counts
         """
-        pa_type = self._data.type
+        pa_type = self._pa_array.type
         if pa.types.is_duration(pa_type):
             # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
-            data = self._data.cast(pa.int64())
+            data = self._pa_array.cast(pa.int64())
         else:
-            data = self._data
+            data = self._pa_array

         from pandas import (
             Index,
@@ -945,7 +945,7 @@ def _concat_same_type(
         -------
         ArrowExtensionArray
         """
-        chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
+        chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
         arr = pa.chunked_array(chunks)
         return cls(arr)

@@ -986,7 +986,7 @@ def _accumulate(
         if pyarrow_meth is None:
             return super()._accumulate(name, skipna=skipna, **kwargs)

-        data_to_accum = self._data
+        data_to_accum = self._pa_array

         pa_dtype = data_to_accum.type
         if pa.types.is_duration(pa_dtype):
@@ -1023,9 +1023,9 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         ------
         TypeError : subclass does not define reductions
         """
-        pa_type = self._data.type
+        pa_type = self._pa_array.type

-        data_to_reduce = self._data
+        data_to_reduce = self._pa_array

         if name in ["any", "all"] and (
             pa.types.is_integer(pa_type)
@@ -1036,9 +1036,9 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
             # for other dtypes, matching our non-pyarrow behavior

             if pa.types.is_duration(pa_type):
-                data_to_cmp = self._data.cast(pa.int64())
+                data_to_cmp = self._pa_array.cast(pa.int64())
             else:
-                data_to_cmp = self._data
+                data_to_cmp = self._pa_array

             not_eq = pc.not_equal(data_to_cmp, 0)
             data_to_reduce = not_eq
@@ -1047,7 +1047,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):

             def pyarrow_meth(data, skip_nulls, **kwargs):
                 numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs)
-                denominator = pc.sqrt_checked(pc.count(self._data))
+                denominator = pc.sqrt_checked(pc.count(self._pa_array))
                 return pc.divide_checked(numerator, denominator)

         else:
@@ -1109,7 +1109,7 @@ def __setitem__(self, key, value) -> None:

         if com.is_null_slice(key):
             # fast path (GH50248)
-            data = self._if_else(True, value, self._data)
+            data = self._if_else(True, value, self._pa_array)

         elif is_integer(key):
             # fast path
@@ -1126,20 +1126,20 @@ def __setitem__(self, key, value) -> None:
         elif isinstance(value, pa.Scalar):
             value = value.as_py()
             chunks = [
-                *self._data[:key].chunks,
-                pa.array([value], type=self._data.type, from_pandas=True),
-                *self._data[key + 1 :].chunks,
+                *self._pa_array[:key].chunks,
+                pa.array([value], type=self._pa_array.type, from_pandas=True),
+                *self._pa_array[key + 1 :].chunks,
             ]
             data = pa.chunked_array(chunks).combine_chunks()

         elif is_bool_dtype(key):
             key = np.asarray(key, dtype=np.bool_)
-            data = self._replace_with_mask(self._data, key, value)
+            data = self._replace_with_mask(self._pa_array, key, value)

         elif is_scalar(value) or isinstance(value, pa.Scalar):
             mask = np.zeros(len(self), dtype=np.bool_)
             mask[key] = True
-            data = self._if_else(mask, value, self._data)
+            data = self._if_else(mask, value, self._pa_array)

         else:
             indices = np.arange(len(self))[key]
@@ -1152,11 +1152,11 @@ def __setitem__(self, key, value) -> None:
                 value = value.take(argsort)
             mask = np.zeros(len(self), dtype=np.bool_)
             mask[indices] = True
-            data = self._replace_with_mask(self._data, mask, value)
+            data = self._replace_with_mask(self._pa_array, mask, value)

         if isinstance(data, pa.Array):
             data = pa.chunked_array([data])
-        self._data = data
+        self._pa_array = data

     def _rank(
         self,
@@ -1186,7 +1186,7 @@ def _rank(
             result = pa.array(ranked, type=pa_type, from_pandas=True)
             return type(self)(result)

-        data = self._data.combine_chunks()
+        data = self._pa_array.combine_chunks()
         sort_keys = "ascending" if ascending else "descending"
         null_placement = "at_start" if na_option == "top" else "at_end"
         tiebreaker = "min" if method == "average" else method
@@ -1199,7 +1199,7 @@ def _rank(
         )

         if na_option == "keep":
-            mask = pc.is_null(self._data)
+            mask = pc.is_null(self._pa_array)
             null = pa.scalar(None, type=result.type)
             result = pc.if_else(mask, null, result)

@@ -1240,9 +1240,9 @@ def _quantile(
         -------
         same type as self
         """
-        pa_dtype = self._data.type
+        pa_dtype = self._pa_array.type

-        data = self._data
+        data = self._pa_array
         if pa.types.is_temporal(pa_dtype) and interpolation in ["lower", "higher"]:
             # https://github.com/apache/arrow/issues/33769 in these cases
             # we can cast to ints and back
@@ -1279,17 +1279,17 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra
         if pa_version_under6p0:
             raise NotImplementedError("mode only supported for pyarrow version >= 6.0")

-        pa_type = self._data.type
+        pa_type = self._pa_array.type
         if pa.types.is_temporal(pa_type):
             nbits = pa_type.bit_width
             if nbits == 32:
-                data = self._data.cast(pa.int32())
+                data = self._pa_array.cast(pa.int32())
             elif nbits == 64:
-                data = self._data.cast(pa.int64())
+                data = self._pa_array.cast(pa.int64())
             else:
                 raise NotImplementedError(pa_type)
         else:
-            data = self._data
+            data = self._pa_array

         modes = pc.mode(data, pc.count_distinct(data).as_py())
         values = modes.field(0)
@@ -1314,7 +1314,7 @@ def _maybe_convert_setitem_value(self, value):
         else:
             pa_box = pa.scalar
         try:
-            value = pa_box(value, type=self._data.type, from_pandas=True)
+            value = pa_box(value, type=self._pa_array.type, from_pandas=True)
         except pa.ArrowTypeError as err:
             msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
             raise TypeError(msg) from err
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 4aebe61412866..073e97a5911fa 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -112,7 +112,7 @@ def __init__(self, values) -> None:
         super().__init__(values)
         self._dtype = StringDtype(storage="pyarrow")

-        if not pa.types.is_string(self._data.type):
+        if not pa.types.is_string(self._pa_array.type):
             raise ValueError(
                 "ArrowStringArray requires a PyArrow (chunked) array of string type"
             )
@@ -183,7 +183,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
         if not len(value_set):
             return np.zeros(len(self), dtype=bool)

-        result = pc.is_in(self._data, value_set=pa.array(value_set))
+        result = pc.is_in(self._pa_array, value_set=pa.array(value_set))
         # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
         # to False
         return np.array(result, dtype=np.bool_)
@@ -196,7 +196,7 @@ def astype(self, dtype, copy: bool = True):
                 return self.copy()
             return self
         elif isinstance(dtype, NumericDtype):
-            data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
+            data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
             return dtype.__from_arrow__(data)
         elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
             return self.to_numpy(dtype=dtype, na_value=np.nan)
@@ -282,12 +282,12 @@ def _str_contains(
                 fallback_performancewarning()
                 return super()._str_contains(pat, case, flags, na, regex)
             else:
-                result = pc.match_substring_regex(self._data, pat)
+                result = pc.match_substring_regex(self._pa_array, pat)
         else:
             if case:
-                result = pc.match_substring(self._data, pat)
+                result = pc.match_substring(self._pa_array, pat)
             else:
-                result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
+                result = pc.match_substring(pc.utf8_upper(self._pa_array), pat.upper())
         result = BooleanDtype().__from_arrow__(result)
         if not isna(na):
             result[isna(result)] = bool(na)
@@ -315,7 +315,7 @@ def _str_replace(
             return super()._str_replace(pat, repl, n, case, flags, regex)

         func = pc.replace_substring_regex if regex else pc.replace_substring
-        result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
+        result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n)
         return type(self)(result)

     def _str_match(
@@ -333,68 +333,68 @@ def _str_fullmatch(
         return self._str_match(pat, case, flags, na)

     def _str_isalnum(self):
-        result = pc.utf8_is_alnum(self._data)
+        result = pc.utf8_is_alnum(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_isalpha(self):
-        result = pc.utf8_is_alpha(self._data)
+        result = pc.utf8_is_alpha(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_isdecimal(self):
-        result = pc.utf8_is_decimal(self._data)
+        result = pc.utf8_is_decimal(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_isdigit(self):
-        result = pc.utf8_is_digit(self._data)
+        result = pc.utf8_is_digit(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_islower(self):
-        result = pc.utf8_is_lower(self._data)
+        result = pc.utf8_is_lower(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_isnumeric(self):
-        result = pc.utf8_is_numeric(self._data)
+        result = pc.utf8_is_numeric(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_isspace(self):
-        result = pc.utf8_is_space(self._data)
+        result = pc.utf8_is_space(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_istitle(self):
-        result = pc.utf8_is_title(self._data)
+        result = pc.utf8_is_title(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_isupper(self):
-        result = pc.utf8_is_upper(self._data)
+        result = pc.utf8_is_upper(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_len(self):
-        result = pc.utf8_length(self._data)
+        result = pc.utf8_length(self._pa_array)
         return Int64Dtype().__from_arrow__(result)

     def _str_lower(self):
-        return type(self)(pc.utf8_lower(self._data))
+        return type(self)(pc.utf8_lower(self._pa_array))

     def _str_upper(self):
-        return type(self)(pc.utf8_upper(self._data))
+        return type(self)(pc.utf8_upper(self._pa_array))

     def _str_strip(self, to_strip=None):
         if to_strip is None:
-            result = pc.utf8_trim_whitespace(self._data)
+            result = pc.utf8_trim_whitespace(self._pa_array)
         else:
-            result = pc.utf8_trim(self._data, characters=to_strip)
+            result = pc.utf8_trim(self._pa_array, characters=to_strip)
         return type(self)(result)

     def _str_lstrip(self, to_strip=None):
         if to_strip is None:
-            result = pc.utf8_ltrim_whitespace(self._data)
+            result = pc.utf8_ltrim_whitespace(self._pa_array)
         else:
-            result = pc.utf8_ltrim(self._data, characters=to_strip)
+            result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
         return type(self)(result)

     def _str_rstrip(self, to_strip=None):
         if to_strip is None:
-            result = pc.utf8_rtrim_whitespace(self._data)
+            result = pc.utf8_rtrim_whitespace(self._pa_array)
         else:
-            result = pc.utf8_rtrim(self._data, characters=to_strip)
+            result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
         return type(self)(result)
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index eef77ceabb6fe..fe189e33c64e2 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -264,14 +264,14 @@ def test_from_dtype(self, data, request):

     def test_from_sequence_pa_array(self, data, request):
         # https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784
-        # data._data = pa.ChunkedArray
-        result = type(data)._from_sequence(data._data)
+        # data._pa_array = pa.ChunkedArray
+        result = type(data)._from_sequence(data._pa_array)
         tm.assert_extension_array_equal(result, data)
-        assert isinstance(result._data, pa.ChunkedArray)
+        assert isinstance(result._pa_array, pa.ChunkedArray)

-        result = type(data)._from_sequence(data._data.combine_chunks())
+        result = type(data)._from_sequence(data._pa_array.combine_chunks())
         tm.assert_extension_array_equal(result, data)
-        assert isinstance(result._data, pa.ChunkedArray)
+        assert isinstance(result._pa_array, pa.ChunkedArray)

     def test_from_sequence_pa_array_notimplemented(self, request):
         if pa_version_under6p0:
@@ -326,7 +326,7 @@ def test_from_sequence_of_strings_pa_array(self, data, request):
                 reason=f"pyarrow doesn't support string cast from {pa_dtype}",
             )
         )
-        pa_array = data._data.cast(pa.string())
+        pa_array = data._pa_array.cast(pa.string())
         result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype)
         tm.assert_extension_array_equal(result, data)

@@ -339,7 +339,7 @@ class TestGetitemTests(base.BaseGetitemTests):
     def test_getitem_scalar(self, data):
         # In the base class we expect data.dtype.type; but this (intentionally)
         # returns Python scalars or pd.NA
-        pa_type = data._data.type
+        pa_type = data._pa_array.type
         if pa.types.is_integer(pa_type):
             exp_type = int
         elif pa.types.is_floating(pa_type):
@@ -1300,7 +1300,10 @@ def test_quantile(data, interpolation, quantile, request):

     if pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype):
         pass
-    elif pa.types.is_temporal(data._data.type) and interpolation in ["lower", "higher"]:
+    elif pa.types.is_temporal(data._pa_array.type) and interpolation in [
+        "lower",
+        "higher",
+    ]:
         pass
     else:
         request.node.add_marker(
@@ -1444,7 +1447,7 @@ def test_pickle_roundtrip(data):

 def test_astype_from_non_pyarrow(data):
     # GH49795
pd_array = data._data.to_pandas().array + pd_array = data._pa_array.to_pandas().array result = pd_array.astype(data.dtype) assert not isinstance(pd_array.dtype, ArrowDtype) assert isinstance(result.dtype, ArrowDtype) @@ -1463,11 +1466,11 @@ def test_to_numpy_with_defaults(data): # GH49973 result = data.to_numpy() - pa_type = data._data.type + pa_type = data._pa_array.type if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type): expected = np.array(list(data)) else: - expected = np.array(data._data) + expected = np.array(data._pa_array) if data._hasna: expected = expected.astype(object) @@ -1483,7 +1486,7 @@ def test_setitem_null_slice(data): result = orig.copy() result[:] = data[0] expected = ArrowExtensionArray( - pa.array([data[0]] * len(data), type=data._data.type) + pa.array([data[0]] * len(data), type=data._pa_array.type) ) tm.assert_extension_array_equal(result, expected) @@ -1500,7 +1503,7 @@ def test_setitem_null_slice(data): def test_setitem_invalid_dtype(data): # GH50248 - pa_type = data._data.type + pa_type = data._pa_array.type if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): fill_value = 123 err = TypeError diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 3e865947aa968..8ad7c008b0c9e 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -39,7 +39,7 @@ def split_array(arr): def _split_array(arr): import pyarrow as pa - arrow_array = arr._data + arrow_array = arr._pa_array split = len(arrow_array) // 2 arrow_array = pa.chunked_array( [*arrow_array[:split].chunks, *arrow_array[split:].chunks] diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f194cadbc73d8..35e3878ebd9b0 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -583,7 +583,7 @@ def test_use_nullable_dtypes(self, read_ext, dtype_backend, option): ) # pyarrow by default infers timestamp resolution as us, not ns expected["i"] = ArrowExtensionArray( - expected["i"].array._data.cast(pa.timestamp(unit="us")) + expected["i"].array._pa_array.cast(pa.timestamp(unit="us")) ) # pyarrow supports a null type, so don't have to default to Int64 expected["j"] = ArrowExtensionArray(pa.array([None, None])) From ccad453bd60a5107e29956224997833e7f5f05e0 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 9 Feb 2023 15:52:37 -0800 Subject: [PATCH 2/7] fixup accessor --- pandas/core/indexes/accessors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 788448f2c7be6..9684b3645a399 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -215,7 +215,7 @@ def isocalendar(self): result = ( cast(ArrowExtensionArray, self._parent.array) ._dt_isocalendar() - ._data.combine_chunks() + ._pa_array.combine_chunks() ) iso_calendar_df = DataFrame( { From c2d99b70646b98324fcd7eada3291a1a84f6a786 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 11 Feb 2023 09:03:57 -0800 Subject: [PATCH 3/7] fix copy/view test --- pandas/tests/copy_view/test_astype.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index a485275a28ac4..182ffed61e7bc 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -192,4 +192,6 @@ def test_astype_arrow_timestamp(using_copy_on_write): result = 
     result = df.astype("timestamp[ns][pyarrow]")
     if using_copy_on_write:
         assert not result._mgr._has_no_reference(0)
-    assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data)
+    assert np.shares_memory(
+        get_array(df, "a").asi8, get_array(result, "a")._pa_array
+    )

From 662afac419a42059f054da4b9855ca1bf4e600f1 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 16 Feb 2023 19:30:42 -0800
Subject: [PATCH 4/7] fix ArrowStringArray

---
 pandas/core/arrays/string_arrow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 50de63b7836c7..40c0dfaecddc6 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -125,7 +125,7 @@ def __len__(self) -> int:
         -------
         length : int
         """
-        return len(self._data)
+        return len(self._pa_array)

     @classmethod
     def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):

From 86959c9727b88c67f3ecbdff4711b5e74563033a Mon Sep 17 00:00:00 2001
From: Brock
Date: Fri, 17 Feb 2023 16:46:57 -0800
Subject: [PATCH 5/7] fix pickle tests

---
 pandas/core/arrays/string_arrow.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 40c0dfaecddc6..2086a93ea6e14 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -5,6 +5,7 @@
     Callable,
     Union,
 )
+import warnings

 import numpy as np

@@ -18,6 +19,7 @@
     npt,
 )
 from pandas.compat import pa_version_under7p0
+from pandas.util._exceptions import find_stack_level

 from pandas.core.dtypes.common import (
     is_bool_dtype,
@@ -213,6 +215,17 @@ def astype(self, dtype, copy: bool = True):

         return super().astype(dtype, copy=copy)

+    @property
+    def _data(self):
+        # dask accesses ._data directly
+        warnings.warn(
+            f"{type(self).__name__}._data is deprecated and will be removed "
+            "in a future version, use ._pa_array instead",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+        return self._pa_array
+
     # ------------------------------------------------------------------------
     # String methods interface

From 0f483590f7fde42f1223d8c5b1ca0bb5df7bcfe5 Mon Sep 17 00:00:00 2001
From: Brock
Date: Fri, 17 Feb 2023 18:35:49 -0800
Subject: [PATCH 6/7] update

---
 pandas/core/arrays/arrow/array.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 2a28cf2ae3dd6..54d87f944dd91 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -460,12 +460,12 @@ def _cmp_method(self, other, op):
         return BooleanArray(values, mask)

     def _evaluate_op_method(self, other, op, arrow_funcs):
-        pa_type = self._data.type
+        pa_type = self._pa_array.type
         if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [
             operator.add,
             roperator.radd,
         ]:
-            length = self._data.length()
+            length = self._pa_array.length()

             seps: list[str] | list[bytes]
             if pa.types.is_string(pa_type):
@@ -476,11 +476,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
             if is_scalar(other):
                 other = [other] * length
             elif isinstance(other, type(self)):
-                other = other._data
+                other = other._pa_array
             if op is operator.add:
-                result = pc.binary_join_element_wise(self._data, other, seps)
+                result = pc.binary_join_element_wise(self._pa_array, other, seps)
             else:
-                result = pc.binary_join_element_wise(other, self._data, seps)
+                result = pc.binary_join_element_wise(other, self._pa_array, seps)
             return type(self)(result)

         pc_func = arrow_funcs[op.__name__]

From 60c912fd4374f2c326e16b9d33e57841226f45e7 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 9 Mar 2023 15:08:21 -0800
Subject: [PATCH 7/7] update setstate, _to_pydatetime

---
 pandas/core/arrays/arrow/array.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 83ad775a722a7..2eb723ecb8e37 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -439,7 +439,7 @@ def __getstate__(self):
         return state

     def __setstate__(self, state) -> None:
-        state["_data"] = pa.chunked_array(state["_data"])
+        state["_pa_array"] = pa.chunked_array(state["_data"])
         self.__dict__.update(state)

     def _cmp_method(self, other, op):
@@ -2091,7 +2091,7 @@ def _dt_round(
         return self._round_temporally("round", freq, ambiguous, nonexistent)

     def _dt_to_pydatetime(self):
-        return np.array(self._data.to_pylist(), dtype=object)
+        return np.array(self._pa_array.to_pylist(), dtype=object)

     def _dt_tz_localize(
         self,