diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 85d9acff353be..74a302922c2aa 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -624,6 +624,7 @@ Performance improvements - Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`) - Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`) - Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`) +- Performance improvement in the conversion of a pyarrow boolean array to a pandas nullable boolean array (:issue:`41051`) - Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 0a0bfccc0ea15..0b789693e7f74 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -114,6 +114,9 @@ def __from_arrow__( """ import pyarrow + if array.type != pyarrow.bool_(): + raise TypeError(f"Expected array of boolean type, got {array.type} instead") + if isinstance(array, pyarrow.Array): chunks = [array] else: @@ -122,8 +125,19 @@ def __from_arrow__( results = [] for arr in chunks: - # TODO should optimize this without going through object array - bool_arr = BooleanArray._from_sequence(np.array(arr)) + buflist = arr.buffers() + data = pyarrow.BooleanArray.from_buffers( + arr.type, len(arr), [None, buflist[1]], offset=arr.offset + ).to_numpy(zero_copy_only=False) + if arr.null_count != 0: + mask = pyarrow.BooleanArray.from_buffers( + arr.type, len(arr), [None, buflist[0]], offset=arr.offset + ).to_numpy(zero_copy_only=False) + mask = ~mask + else: + mask = np.zeros(len(arr), dtype=bool) + + bool_arr = BooleanArray(data, mask) results.append(bool_arr) return 
BooleanArray._concat_same_type(results) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 8bb32dec2cc0e..a63f9b195d80c 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -55,12 +55,39 @@ def test_arrow_from_arrow_uint(): @td.skip_if_no("pyarrow", min_version="0.16.0") -def test_arrow_sliced(): +def test_arrow_sliced(data): # https://github.com/pandas-dev/pandas/issues/38525 import pyarrow as pa - df = pd.DataFrame({"a": pd.array([0, None, 2, 3, None], dtype="Int64")}) + df = pd.DataFrame({"a": data}) table = pa.table(df) result = table.slice(2, None).to_pandas() expected = df.iloc[2:].reset_index(drop=True) tm.assert_frame_equal(result, expected) + + # no missing values + df2 = df.fillna(data[0]) + table = pa.table(df2) + result = table.slice(2, None).to_pandas() + expected = df2.iloc[2:].reset_index(drop=True) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow", min_version="0.16.0") +def test_from_arrow_type_error(request, data): + # ensure that __from_arrow__ raises a TypeError when getting a wrong + # array type + import pyarrow as pa + + if data.dtype != "boolean": + # TODO numeric dtypes cast any incoming array to the correct dtype + # instead of erroring + request.node.add_marker( + pytest.mark.xfail(reason="numeric dtypes don't error but cast") + ) + + arr = pa.array(data).cast("string") + with pytest.raises(TypeError, match=None): + # we don't test the exact error message, only the fact that it raises + # a TypeError is relevant + data.dtype.__from_arrow__(arr)