From d57ef73a1f19d47b48a88e750b9ca110347a22b1 Mon Sep 17 00:00:00 2001
From: Brock
Date: Wed, 25 Jan 2023 19:05:15 -0800
Subject: [PATCH 1/7] REF: ArrowEA _data->_pa_array

---
 pandas/_testing/__init__.py           |   8 +-
 pandas/core/arrays/arrow/array.py     | 156 +++++++++++++-------
 pandas/core/arrays/string_arrow.py    |  50 ++++-----
 pandas/tests/extension/test_arrow.py  |  29 ++---
 pandas/tests/extension/test_string.py |   2 +-
 pandas/tests/io/excel/test_readers.py |   2 +-
 6 files changed, 125 insertions(+), 122 deletions(-)

diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
index eb25566e7983e..b329c9e817c23 100644
--- a/pandas/_testing/__init__.py
+++ b/pandas/_testing/__init__.py
@@ -1006,10 +1006,10 @@ def shares_memory(left, right) -> bool:
     if isinstance(left, ExtensionArray) and left.dtype == "string[pyarrow]":
         # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
         if isinstance(right, ExtensionArray) and right.dtype == "string[pyarrow]":
-            # error: "ExtensionArray" has no attribute "_data"
-            left_pa_data = left._data  # type: ignore[attr-defined]
-            # error: "ExtensionArray" has no attribute "_data"
-            right_pa_data = right._data  # type: ignore[attr-defined]
+            # error: "ExtensionArray" has no attribute "_pa_array"
+            left_pa_data = left._pa_array  # type: ignore[attr-defined]
+            # error: "ExtensionArray" has no attribute "_pa_array"
+            right_pa_data = right._pa_array  # type: ignore[attr-defined]
             left_buf1 = left_pa_data.chunk(0).buffers()[1]
             right_buf1 = right_pa_data.chunk(0).buffers()[1]
             return left_buf1 == right_buf1
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 0e70b3795bc85..ab77334f7dd46 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -200,14 +200,14 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
             msg = "pyarrow>=6.0.0 is required for PyArrow backed ArrowExtensionArray."
             raise ImportError(msg)
         if isinstance(values, pa.Array):
-            self._data = pa.chunked_array([values])
+            self._pa_array = pa.chunked_array([values])
         elif isinstance(values, pa.ChunkedArray):
-            self._data = values
+            self._pa_array = values
         else:
             raise ValueError(
                 f"Unsupported type '{type(values)}' for ArrowExtensionArray"
             )
-        self._dtype = ArrowDtype(self._data.type)
+        self._dtype = ArrowDtype(self._pa_array.type)

     @classmethod
     def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
@@ -216,7 +216,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
         """
         pa_dtype = to_pyarrow_type(dtype)
         if isinstance(scalars, cls):
-            scalars = scalars._data
+            scalars = scalars._pa_array
         elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)):
             try:
                 scalars = pa.array(scalars, type=pa_dtype, from_pandas=True)
@@ -315,7 +315,7 @@ def __getitem__(self, item: PositionalIndexer):
         elif is_integer_dtype(item.dtype):
             return self.take(item)
         elif is_bool_dtype(item.dtype):
-            return type(self)(self._data.filter(item))
+            return type(self)(self._pa_array.filter(item))
         else:
             raise IndexError(
                 "Only integers, slices and integer or "
@@ -340,7 +340,7 @@ def __getitem__(self, item: PositionalIndexer):
             )
         # We are not an array indexer, so maybe e.g. a slice or integer
         # indexer. We dispatch to pyarrow.
-        value = self._data[item]
+        value = self._pa_array[item]
         if isinstance(value, pa.ChunkedArray):
             return type(self)(value)
         else:
@@ -355,7 +355,7 @@ def __iter__(self) -> Iterator[Any]:
         """
         Iterate over elements of the array.
""" na_value = self._dtype.na_value - for value in self._data: + for value in self._pa_array: val = value.as_py() if val is None: yield na_value @@ -364,29 +364,29 @@ def __iter__(self) -> Iterator[Any]: def __arrow_array__(self, type=None): """Convert myself to a pyarrow ChunkedArray.""" - return self._data + return self._pa_array def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" return self.to_numpy(dtype=dtype) def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - return type(self)(pc.invert(self._data)) + return type(self)(pc.invert(self._pa_array)) def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - return type(self)(pc.negate_checked(self._data)) + return type(self)(pc.negate_checked(self._pa_array)) def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - return type(self)(self._data) + return type(self)(self._pa_array) def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - return type(self)(pc.abs_checked(self._data)) + return type(self)(pc.abs_checked(self._pa_array)) # GH 42600: __getstate__/__setstate__ not necessary once # https://issues.apache.org/jira/browse/ARROW-10739 is addressed def __getstate__(self): state = self.__dict__.copy() - state["_data"] = self._data.combine_chunks() + state["_data"] = self._pa_array.combine_chunks() return state def __setstate__(self, state) -> None: @@ -398,12 +398,12 @@ def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] if isinstance(other, ArrowExtensionArray): - result = pc_func(self._data, other._data) + result = pc_func(self._pa_array, other._pa_array) elif isinstance(other, (np.ndarray, list)): - result = pc_func(self._data, other) + result = pc_func(self._pa_array, other) elif is_scalar(other): try: - result = pc_func(self._data, pa.scalar(other)) + result = pc_func(self._pa_array, pa.scalar(other)) except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): mask = isna(self) | isna(other) valid = ~mask @@ -429,11 +429,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs): if pc_func is NotImplemented: raise NotImplementedError(f"{op.__name__} not implemented.") if isinstance(other, ArrowExtensionArray): - result = pc_func(self._data, other._data) + result = pc_func(self._pa_array, other._pa_array) elif isinstance(other, (np.ndarray, list)): - result = pc_func(self._data, pa.array(other, from_pandas=True)) + result = pc_func(self._pa_array, pa.array(other, from_pandas=True)) elif is_scalar(other): - result = pc_func(self._data, pa.scalar(other)) + result = pc_func(self._pa_array, pa.scalar(other)) else: raise NotImplementedError( f"{op.__name__} not implemented for {type(other)}" @@ -451,7 +451,7 @@ def equals(self, other) -> bool: return False # I'm told that pyarrow makes __eq__ behave like pandas' equals; # TODO: is this documented somewhere? - return self._data == other._data + return self._pa_array == other._pa_array @property def dtype(self) -> ArrowDtype: @@ -465,7 +465,7 @@ def nbytes(self) -> int: """ The number of bytes needed to store this object in memory. 
""" - return self._data.nbytes + return self._pa_array.nbytes def __len__(self) -> int: """ @@ -475,11 +475,11 @@ def __len__(self) -> int: ------- length : int """ - return len(self._data) + return len(self._pa_array) @property def _hasna(self) -> bool: - return self._data.null_count > 0 + return self._pa_array.null_count > 0 def isna(self) -> npt.NDArray[np.bool_]: """ @@ -487,7 +487,7 @@ def isna(self) -> npt.NDArray[np.bool_]: This should return a 1-D array the same length as 'self'. """ - return self._data.is_null().to_numpy() + return self._pa_array.is_null().to_numpy() def argsort( self, @@ -509,13 +509,13 @@ def argsort( ) result = pc.array_sort_indices( - self._data, order=order, null_placement=null_placement + self._pa_array, order=order, null_placement=null_placement ) np_result = result.to_numpy() return np_result.astype(np.intp, copy=False) def _argmin_max(self, skipna: bool, method: str) -> int: - if self._data.length() in (0, self._data.null_count) or ( + if self._pa_array.length() in (0, self._pa_array.null_count) or ( self._hasna and not skipna ): # For empty or all null, pyarrow returns -1 but pandas expects TypeError @@ -528,7 +528,7 @@ def _argmin_max(self, skipna: bool, method: str) -> int: f"arg{method} only implemented for pyarrow version >= 6.0" ) - data = self._data + data = self._pa_array if pa.types.is_duration(data.type): data = data.cast(pa.int64()) @@ -551,7 +551,7 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: ------- type(self) """ - return type(self)(self._data) + return type(self)(self._pa_array) def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: """ @@ -565,7 +565,7 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: fallback_performancewarning(version="6") return super().dropna() else: - return type(self)(pc.drop_null(self._data)) + return type(self)(pc.drop_null(self._pa_array)) @doc(ExtensionArray.fillna) def fillna( @@ -609,15 +609,15 @@ def convert_fill_value(value, pa_type, dtype): raise TypeError(msg) from err return value - fill_value = convert_fill_value(value, self._data.type, self.dtype) + fill_value = convert_fill_value(value, self._pa_array.type, self.dtype) try: if method is None: - return type(self)(pc.fill_null(self._data, fill_value=fill_value)) + return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value)) elif method == "pad": - return type(self)(pc.fill_null_forward(self._data)) + return type(self)(pc.fill_null_forward(self._pa_array)) elif method == "backfill": - return type(self)(pc.fill_null_backward(self._data)) + return type(self)(pc.fill_null_backward(self._pa_array)) except pa.ArrowNotImplementedError: # ArrowNotImplementedError: Function 'coalesce' has no kernel # matching input types (duration[ns], duration[ns]) @@ -632,7 +632,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]: if not len(values): return np.zeros(len(self), dtype=bool) - result = pc.is_in(self._data, value_set=pa.array(values, from_pandas=True)) + result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True)) # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls # to False return np.array(result, dtype=np.bool_) @@ -651,7 +651,7 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: The values returned by this method are also used in :func:`pandas.util.hash_pandas_object`. 
""" - values = self._data.to_numpy() + values = self._pa_array.to_numpy() return values, self.dtype.na_value @doc(ExtensionArray.factorize) @@ -661,12 +661,12 @@ def factorize( ) -> tuple[np.ndarray, ExtensionArray]: null_encoding = "mask" if use_na_sentinel else "encode" - pa_type = self._data.type + pa_type = self._pa_array.type if pa.types.is_duration(pa_type): # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = self._data.cast(pa.int64()) + data = self._pa_array.cast(pa.int64()) else: - data = self._data + data = self._pa_array encoded = data.dictionary_encode(null_encoding=null_encoding) if encoded.length() == 0: @@ -715,7 +715,7 @@ def round( DataFrame.round : Round values of a DataFrame. Series.round : Round values of a Series. """ - return type(self)(pc.round(self._data, ndigits=decimals)) + return type(self)(pc.round(self._pa_array, ndigits=decimals)) @doc(ExtensionArray.searchsorted) def searchsorted( @@ -801,18 +801,18 @@ def take( # "Sequence[int]", variable has type "ndarray") indices_array = indices # type: ignore[assignment] - if len(self._data) == 0 and (indices_array >= 0).any(): + if len(self._pa_array) == 0 and (indices_array >= 0).any(): raise IndexError("cannot do a non-empty take") - if indices_array.size > 0 and indices_array.max() >= len(self._data): + if indices_array.size > 0 and indices_array.max() >= len(self._pa_array): raise IndexError("out of bounds value in 'indices'.") if allow_fill: fill_mask = indices_array < 0 if fill_mask.any(): - validate_indices(indices_array, len(self._data)) + validate_indices(indices_array, len(self._pa_array)) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=fill_mask) - result = self._data.take(indices_array) + result = self._pa_array.take(indices_array) if isna(fill_value): return type(self)(result) # TODO: ArrowNotImplementedError: Function fill_null has no @@ -823,14 +823,14 @@ def take( # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: # Nothing to fill - return type(self)(self._data.take(indices)) + return type(self)(self._pa_array.take(indices)) else: # allow_fill=False # TODO(ARROW-9432): Treat negative indices as indices from the right. 
             if (indices_array < 0).any():
                 # Don't modify in-place
                 indices_array = np.copy(indices_array)
-                indices_array[indices_array < 0] += len(self._data)
-            return type(self)(self._data.take(indices_array))
+                indices_array[indices_array < 0] += len(self._pa_array)
+            return type(self)(self._pa_array.take(indices_array))

     @doc(ExtensionArray.to_numpy)
     def to_numpy(
         self,
@@ -844,7 +844,7 @@ def to_numpy(
         if na_value is lib.no_default:
             na_value = self.dtype.na_value

-        pa_type = self._data.type
+        pa_type = self._pa_array.type
         if (
             is_object_dtype(dtype)
             or pa.types.is_timestamp(pa_type)
@@ -852,7 +852,7 @@ def to_numpy(
         ):
             result = np.array(list(self), dtype=dtype)
         else:
-            result = np.asarray(self._data, dtype=dtype)
+            result = np.asarray(self._pa_array, dtype=dtype)
             if copy or self._hasna:
                 result = result.copy()
             if self._hasna:
@@ -867,13 +867,13 @@ def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         -------
         ArrowExtensionArray
         """
-        pa_type = self._data.type
+        pa_type = self._pa_array.type

         if pa.types.is_duration(pa_type):
             # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
-            data = self._data.cast(pa.int64())
+            data = self._pa_array.cast(pa.int64())
         else:
-            data = self._data
+            data = self._pa_array

         pa_result = pc.unique(data)

@@ -899,12 +899,12 @@ def value_counts(self, dropna: bool = True) -> Series:
         --------
         Series.value_counts
         """
-        pa_type = self._data.type
+        pa_type = self._pa_array.type
         if pa.types.is_duration(pa_type):
             # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
-            data = self._data.cast(pa.int64())
+            data = self._pa_array.cast(pa.int64())
         else:
-            data = self._data
+            data = self._pa_array

         from pandas import (
             Index,
@@ -945,7 +945,7 @@ def _concat_same_type(
         -------
         ArrowExtensionArray
         """
-        chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
+        chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
         arr = pa.chunked_array(chunks)
         return cls(arr)

@@ -986,7 +986,7 @@ def _accumulate(
         if pyarrow_meth is None:
             return super()._accumulate(name, skipna=skipna, **kwargs)

-        data_to_accum = self._data
+        data_to_accum = self._pa_array

         pa_dtype = data_to_accum.type
         if pa.types.is_duration(pa_dtype):
@@ -1023,9 +1023,9 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         ------
         TypeError : subclass does not define reductions
         """
-        pa_type = self._data.type
+        pa_type = self._pa_array.type

-        data_to_reduce = self._data
+        data_to_reduce = self._pa_array

         if name in ["any", "all"] and (
             pa.types.is_integer(pa_type)
@@ -1036,9 +1036,9 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
             # for other dtypes, matching our non-pyarrow behavior

             if pa.types.is_duration(pa_type):
-                data_to_cmp = self._data.cast(pa.int64())
+                data_to_cmp = self._pa_array.cast(pa.int64())
             else:
-                data_to_cmp = self._data
+                data_to_cmp = self._pa_array

             not_eq = pc.not_equal(data_to_cmp, 0)
             data_to_reduce = not_eq
@@ -1047,7 +1047,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):

             def pyarrow_meth(data, skip_nulls, **kwargs):
                 numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs)
-                denominator = pc.sqrt_checked(pc.count(self._data))
+                denominator = pc.sqrt_checked(pc.count(self._pa_array))
                 return pc.divide_checked(numerator, denominator)

         else:
@@ -1109,7 +1109,7 @@ def __setitem__(self, key, value) -> None:

         if com.is_null_slice(key):
             # fast path (GH50248)
-            data = self._if_else(True, value, self._data)
+            data = self._if_else(True, value, self._pa_array)

         elif is_integer(key):
             # fast path
@@ -1126,20 +1126,20 @@ def __setitem__(self, key, value) -> None:
         elif isinstance(value, pa.Scalar):
             value = value.as_py()
             chunks = [
-                *self._data[:key].chunks,
-                pa.array([value], type=self._data.type, from_pandas=True),
-                *self._data[key + 1 :].chunks,
+                *self._pa_array[:key].chunks,
+                pa.array([value], type=self._pa_array.type, from_pandas=True),
+                *self._pa_array[key + 1 :].chunks,
             ]
             data = pa.chunked_array(chunks).combine_chunks()

         elif is_bool_dtype(key):
             key = np.asarray(key, dtype=np.bool_)
-            data = self._replace_with_mask(self._data, key, value)
+            data = self._replace_with_mask(self._pa_array, key, value)

         elif is_scalar(value) or isinstance(value, pa.Scalar):
             mask = np.zeros(len(self), dtype=np.bool_)
             mask[key] = True
-            data = self._if_else(mask, value, self._data)
+            data = self._if_else(mask, value, self._pa_array)

         else:
             indices = np.arange(len(self))[key]
@@ -1152,11 +1152,11 @@ def __setitem__(self, key, value) -> None:
                 value = value.take(argsort)
             mask = np.zeros(len(self), dtype=np.bool_)
             mask[indices] = True
-            data = self._replace_with_mask(self._data, mask, value)
+            data = self._replace_with_mask(self._pa_array, mask, value)

         if isinstance(data, pa.Array):
             data = pa.chunked_array([data])
-        self._data = data
+        self._pa_array = data

     def _rank(
         self,
@@ -1186,7 +1186,7 @@ def _rank(
             result = pa.array(ranked, type=pa_type, from_pandas=True)
             return type(self)(result)

-        data = self._data.combine_chunks()
+        data = self._pa_array.combine_chunks()
         sort_keys = "ascending" if ascending else "descending"
         null_placement = "at_start" if na_option == "top" else "at_end"
         tiebreaker = "min" if method == "average" else method
@@ -1199,7 +1199,7 @@ def _rank(
         )

         if na_option == "keep":
-            mask = pc.is_null(self._data)
+            mask = pc.is_null(self._pa_array)
             null = pa.scalar(None, type=result.type)
             result = pc.if_else(mask, null, result)

@@ -1240,9 +1240,9 @@ def _quantile(
         -------
         same type as self
         """
-        pa_dtype = self._data.type
+        pa_dtype = self._pa_array.type

-        data = self._data
+        data = self._pa_array
         if pa.types.is_temporal(pa_dtype) and interpolation in ["lower", "higher"]:
             # https://github.com/apache/arrow/issues/33769 in these cases
             # we can cast to ints and back
@@ -1279,17 +1279,17 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra
         if pa_version_under6p0:
             raise NotImplementedError("mode only supported for pyarrow version >= 6.0")

-        pa_type = self._data.type
+        pa_type = self._pa_array.type
         if pa.types.is_temporal(pa_type):
             nbits = pa_type.bit_width
             if nbits == 32:
-                data = self._data.cast(pa.int32())
+                data = self._pa_array.cast(pa.int32())
             elif nbits == 64:
-                data = self._data.cast(pa.int64())
+                data = self._pa_array.cast(pa.int64())
             else:
                 raise NotImplementedError(pa_type)
         else:
-            data = self._data
+            data = self._pa_array

         modes = pc.mode(data, pc.count_distinct(data).as_py())
         values = modes.field(0)
@@ -1314,7 +1314,7 @@ def _maybe_convert_setitem_value(self, value):
         else:
             pa_box = pa.scalar
         try:
-            value = pa_box(value, type=self._data.type, from_pandas=True)
+            value = pa_box(value, type=self._pa_array.type, from_pandas=True)
         except pa.ArrowTypeError as err:
             msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
             raise TypeError(msg) from err
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 4aebe61412866..073e97a5911fa 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -112,7 +112,7 @@ def __init__(self, values) -> None:
         super().__init__(values)
         self._dtype = StringDtype(storage="pyarrow")

-        if not pa.types.is_string(self._data.type):
+        if not pa.types.is_string(self._pa_array.type):
             raise ValueError(
                 "ArrowStringArray requires a PyArrow (chunked) array of string type"
             )
@@ -183,7 +183,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
         if not len(value_set):
             return np.zeros(len(self), dtype=bool)

-        result = pc.is_in(self._data, value_set=pa.array(value_set))
+        result = pc.is_in(self._pa_array, value_set=pa.array(value_set))
         # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
         # to False
         return np.array(result, dtype=np.bool_)
@@ -196,7 +196,7 @@ def astype(self, dtype, copy: bool = True):
                 return self.copy()
             return self
         elif isinstance(dtype, NumericDtype):
-            data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
+            data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
             return dtype.__from_arrow__(data)
         elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
             return self.to_numpy(dtype=dtype, na_value=np.nan)
@@ -282,12 +282,12 @@ def _str_contains(
                 fallback_performancewarning()
                 return super()._str_contains(pat, case, flags, na, regex)
             else:
-                result = pc.match_substring_regex(self._data, pat)
+                result = pc.match_substring_regex(self._pa_array, pat)
         else:
             if case:
-                result = pc.match_substring(self._data, pat)
+                result = pc.match_substring(self._pa_array, pat)
             else:
-                result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
+                result = pc.match_substring(pc.utf8_upper(self._pa_array), pat.upper())
         result = BooleanDtype().__from_arrow__(result)
         if not isna(na):
             result[isna(result)] = bool(na)
@@ -315,7 +315,7 @@ def _str_replace(
             return super()._str_replace(pat, repl, n, case, flags, regex)

         func = pc.replace_substring_regex if regex else pc.replace_substring
-        result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
+        result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n)
         return type(self)(result)

     def _str_match(
@@ -333,68 +333,68 @@ def _str_fullmatch(
         return self._str_match(pat, case, flags, na)

     def _str_isalnum(self):
-        result = pc.utf8_is_alnum(self._data)
+        result = pc.utf8_is_alnum(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_isalpha(self):
-        result = pc.utf8_is_alpha(self._data)
+        result = pc.utf8_is_alpha(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_isdecimal(self):
-        result = pc.utf8_is_decimal(self._data)
+        result = pc.utf8_is_decimal(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_isdigit(self):
-        result = pc.utf8_is_digit(self._data)
+        result = pc.utf8_is_digit(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_islower(self):
-        result = pc.utf8_is_lower(self._data)
+        result = pc.utf8_is_lower(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_isnumeric(self):
-        result = pc.utf8_is_numeric(self._data)
+        result = pc.utf8_is_numeric(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_isspace(self):
-        result = pc.utf8_is_space(self._data)
+        result = pc.utf8_is_space(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_istitle(self):
-        result = pc.utf8_is_title(self._data)
+        result = pc.utf8_is_title(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_isupper(self):
-        result = pc.utf8_is_upper(self._data)
+        result = pc.utf8_is_upper(self._pa_array)
         return BooleanDtype().__from_arrow__(result)

     def _str_len(self):
-        result = pc.utf8_length(self._data)
+        result = pc.utf8_length(self._pa_array)
         return Int64Dtype().__from_arrow__(result)

     def _str_lower(self):
-        return type(self)(pc.utf8_lower(self._data))
+        return type(self)(pc.utf8_lower(self._pa_array))

     def _str_upper(self):
-        return type(self)(pc.utf8_upper(self._data))
+        return type(self)(pc.utf8_upper(self._pa_array))

     def _str_strip(self, to_strip=None):
         if to_strip is None:
-            result = pc.utf8_trim_whitespace(self._data)
+            result = pc.utf8_trim_whitespace(self._pa_array)
         else:
-            result = pc.utf8_trim(self._data, characters=to_strip)
+            result = pc.utf8_trim(self._pa_array, characters=to_strip)
         return type(self)(result)

     def _str_lstrip(self, to_strip=None):
         if to_strip is None:
-            result = pc.utf8_ltrim_whitespace(self._data)
+            result = pc.utf8_ltrim_whitespace(self._pa_array)
         else:
-            result = pc.utf8_ltrim(self._data, characters=to_strip)
+            result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
         return type(self)(result)

     def _str_rstrip(self, to_strip=None):
         if to_strip is None:
-            result = pc.utf8_rtrim_whitespace(self._data)
+            result = pc.utf8_rtrim_whitespace(self._pa_array)
         else:
-            result = pc.utf8_rtrim(self._data, characters=to_strip)
+            result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
         return type(self)(result)
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index eef77ceabb6fe..fe189e33c64e2 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -264,14 +264,14 @@ def test_from_dtype(self, data, request):

     def test_from_sequence_pa_array(self, data, request):
         # https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784
-        # data._data = pa.ChunkedArray
-        result = type(data)._from_sequence(data._data)
+        # data._pa_array = pa.ChunkedArray
+        result = type(data)._from_sequence(data._pa_array)
         tm.assert_extension_array_equal(result, data)
-        assert isinstance(result._data, pa.ChunkedArray)
+        assert isinstance(result._pa_array, pa.ChunkedArray)

-        result = type(data)._from_sequence(data._data.combine_chunks())
+        result = type(data)._from_sequence(data._pa_array.combine_chunks())
         tm.assert_extension_array_equal(result, data)
-        assert isinstance(result._data, pa.ChunkedArray)
+        assert isinstance(result._pa_array, pa.ChunkedArray)

     def test_from_sequence_pa_array_notimplemented(self, request):
         if pa_version_under6p0:
@@ -326,7 +326,7 @@ def test_from_sequence_of_strings_pa_array(self, data, request):
                 reason=f"pyarrow doesn't support string cast from {pa_dtype}",
             )
         )
-        pa_array = data._data.cast(pa.string())
+        pa_array = data._pa_array.cast(pa.string())
         result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype)
         tm.assert_extension_array_equal(result, data)

@@ -339,7 +339,7 @@ class TestGetitemTests(base.BaseGetitemTests):
     def test_getitem_scalar(self, data):
         # In the base class we expect data.dtype.type; but this (intentionally)
         # returns Python scalars or pd.NA
-        pa_type = data._data.type
+        pa_type = data._pa_array.type
         if pa.types.is_integer(pa_type):
             exp_type = int
         elif pa.types.is_floating(pa_type):
@@ -1300,7 +1300,10 @@ def test_quantile(data, interpolation, quantile, request):

     if pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype):
         pass
-    elif pa.types.is_temporal(data._data.type) and interpolation in ["lower", "higher"]:
+    elif pa.types.is_temporal(data._pa_array.type) and interpolation in [
+        "lower",
+        "higher",
+    ]:
         pass
     else:
         request.node.add_marker(
@@ -1444,7 +1447,7 @@ def test_pickle_roundtrip(data):

 def test_astype_from_non_pyarrow(data):
     # GH49795
pd_array = data._data.to_pandas().array + pd_array = data._pa_array.to_pandas().array result = pd_array.astype(data.dtype) assert not isinstance(pd_array.dtype, ArrowDtype) assert isinstance(result.dtype, ArrowDtype) @@ -1463,11 +1466,11 @@ def test_to_numpy_with_defaults(data): # GH49973 result = data.to_numpy() - pa_type = data._data.type + pa_type = data._pa_array.type if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type): expected = np.array(list(data)) else: - expected = np.array(data._data) + expected = np.array(data._pa_array) if data._hasna: expected = expected.astype(object) @@ -1483,7 +1486,7 @@ def test_setitem_null_slice(data): result = orig.copy() result[:] = data[0] expected = ArrowExtensionArray( - pa.array([data[0]] * len(data), type=data._data.type) + pa.array([data[0]] * len(data), type=data._pa_array.type) ) tm.assert_extension_array_equal(result, expected) @@ -1500,7 +1503,7 @@ def test_setitem_null_slice(data): def test_setitem_invalid_dtype(data): # GH50248 - pa_type = data._data.type + pa_type = data._pa_array.type if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): fill_value = 123 err = TypeError diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 3e865947aa968..8ad7c008b0c9e 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -39,7 +39,7 @@ def split_array(arr): def _split_array(arr): import pyarrow as pa - arrow_array = arr._data + arrow_array = arr._pa_array split = len(arrow_array) // 2 arrow_array = pa.chunked_array( [*arrow_array[:split].chunks, *arrow_array[split:].chunks] diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f194cadbc73d8..35e3878ebd9b0 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -583,7 +583,7 @@ def test_use_nullable_dtypes(self, read_ext, dtype_backend, option): ) # pyarrow by default infers timestamp resolution as us, not ns expected["i"] = ArrowExtensionArray( - expected["i"].array._data.cast(pa.timestamp(unit="us")) + expected["i"].array._pa_array.cast(pa.timestamp(unit="us")) ) # pyarrow supports a null type, so don't have to default to Int64 expected["j"] = ArrowExtensionArray(pa.array([None, None])) From ccad453bd60a5107e29956224997833e7f5f05e0 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 9 Feb 2023 15:52:37 -0800 Subject: [PATCH 2/7] fixup accessor --- pandas/core/indexes/accessors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 788448f2c7be6..9684b3645a399 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -215,7 +215,7 @@ def isocalendar(self): result = ( cast(ArrowExtensionArray, self._parent.array) ._dt_isocalendar() - ._data.combine_chunks() + ._pa_array.combine_chunks() ) iso_calendar_df = DataFrame( { From c2d99b70646b98324fcd7eada3291a1a84f6a786 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 11 Feb 2023 09:03:57 -0800 Subject: [PATCH 3/7] fix copy/view test --- pandas/tests/copy_view/test_astype.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index a485275a28ac4..182ffed61e7bc 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -192,4 +192,6 @@ def test_astype_arrow_timestamp(using_copy_on_write): result = 
     result = df.astype("timestamp[ns][pyarrow]")
     if using_copy_on_write:
         assert not result._mgr._has_no_reference(0)
-    assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data)
+    assert np.shares_memory(
+        get_array(df, "a").asi8, get_array(result, "a")._pa_array
+    )

From 662afac419a42059f054da4b9855ca1bf4e600f1 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 16 Feb 2023 19:30:42 -0800
Subject: [PATCH 4/7] fix ArrowStringArray

---
 pandas/core/arrays/string_arrow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 50de63b7836c7..40c0dfaecddc6 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -125,7 +125,7 @@ def __len__(self) -> int:
         -------
         length : int
         """
-        return len(self._data)
+        return len(self._pa_array)

     @classmethod
     def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):

From 86959c9727b88c67f3ecbdff4711b5e74563033a Mon Sep 17 00:00:00 2001
From: Brock
Date: Fri, 17 Feb 2023 16:46:57 -0800
Subject: [PATCH 5/7] fix pickle tests

---
 pandas/core/arrays/string_arrow.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 40c0dfaecddc6..2086a93ea6e14 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -5,6 +5,7 @@
     Callable,
     Union,
 )
+import warnings

 import numpy as np

@@ -18,6 +19,7 @@
     npt,
 )
 from pandas.compat import pa_version_under7p0
+from pandas.util._exceptions import find_stack_level

 from pandas.core.dtypes.common import (
     is_bool_dtype,
@@ -213,6 +215,17 @@ def astype(self, dtype, copy: bool = True):

         return super().astype(dtype, copy=copy)

+    @property
+    def _data(self):
+        # dask accesses ._data directly
+        warnings.warn(
+            f"{type(self).__name__}._data is deprecated and will be removed "
+            "in a future version, use ._pa_array instead",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+        return self._pa_array
+
     # ------------------------------------------------------------------------
     # String methods interface

From 0f483590f7fde42f1223d8c5b1ca0bb5df7bcfe5 Mon Sep 17 00:00:00 2001
From: Brock
Date: Fri, 17 Feb 2023 18:35:49 -0800
Subject: [PATCH 6/7] update

---
 pandas/core/arrays/arrow/array.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 2a28cf2ae3dd6..54d87f944dd91 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -460,12 +460,12 @@ def _cmp_method(self, other, op):
         return BooleanArray(values, mask)

     def _evaluate_op_method(self, other, op, arrow_funcs):
-        pa_type = self._data.type
+        pa_type = self._pa_array.type
         if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [
             operator.add,
             roperator.radd,
         ]:
-            length = self._data.length()
+            length = self._pa_array.length()

             seps: list[str] | list[bytes]
             if pa.types.is_string(pa_type):
@@ -476,11 +476,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
             if is_scalar(other):
                 other = [other] * length
             elif isinstance(other, type(self)):
-                other = other._data
+                other = other._pa_array
             if op is operator.add:
-                result = pc.binary_join_element_wise(self._data, other, seps)
+                result = pc.binary_join_element_wise(self._pa_array, other, seps)
             else:
-                result = pc.binary_join_element_wise(other, self._data, seps)
+                result = pc.binary_join_element_wise(other, self._pa_array, seps)
             return type(self)(result)

         pc_func = arrow_funcs[op.__name__]

From 60c912fd4374f2c326e16b9d33e57841226f45e7 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 9 Mar 2023 15:08:21 -0800
Subject: [PATCH 7/7] update setstate, _to_pydatetime

---
 pandas/core/arrays/arrow/array.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 83ad775a722a7..2eb723ecb8e37 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -439,7 +439,7 @@ def __getstate__(self):
         return state

     def __setstate__(self, state) -> None:
-        state["_data"] = pa.chunked_array(state["_data"])
+        state["_pa_array"] = pa.chunked_array(state["_data"])
         self.__dict__.update(state)

     def _cmp_method(self, other, op):
@@ -2091,7 +2091,7 @@ def _dt_round(
         return self._round_temporally("round", freq, ambiguous, nonexistent)

     def _dt_to_pydatetime(self):
-        return np.array(self._data.to_pylist(), dtype=object)
+        return np.array(self._pa_array.to_pylist(), dtype=object)

     def _dt_tz_localize(
         self,