From 7e3ac0d3495d9f0dda8a6e168869b66e6e45262b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 20 Apr 2021 09:11:00 +0200 Subject: [PATCH 1/3] PERF: optimize conversion from boolean Arrow array to masked BooleanArray --- pandas/core/arrays/boolean.py | 15 +++++++++++++-- pandas/tests/arrays/masked/test_arrow_compat.py | 11 +++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 0a0bfccc0ea15..d59f74ffd256f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -122,8 +122,19 @@ def __from_arrow__( results = [] for arr in chunks: - # TODO should optimize this without going through object array - bool_arr = BooleanArray._from_sequence(np.array(arr)) + buflist = arr.buffers() + data = pyarrow.BooleanArray.from_buffers( + arr.type, len(arr), [None, buflist[1]], offset=arr.offset + ).to_numpy(zero_copy_only=False) + if arr.null_count != 0: + mask = pyarrow.BooleanArray.from_buffers( + arr.type, len(arr), [None, buflist[0]], offset=arr.offset + ).to_numpy(zero_copy_only=False) + mask = ~mask + else: + mask = np.zeros(len(arr), dtype=bool) + + bool_arr = BooleanArray(data, mask) results.append(bool_arr) return BooleanArray._concat_same_type(results) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 8bb32dec2cc0e..ae5bec078734c 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -55,12 +55,19 @@ def test_arrow_from_arrow_uint(): @td.skip_if_no("pyarrow", min_version="0.16.0") -def test_arrow_sliced(): +def test_arrow_sliced(data): # https://github.com/pandas-dev/pandas/issues/38525 import pyarrow as pa - df = pd.DataFrame({"a": pd.array([0, None, 2, 3, None], dtype="Int64")}) + df = pd.DataFrame({"a": data}) table = pa.table(df) result = table.slice(2, None).to_pandas() expected = df.iloc[2:].reset_index(drop=True) 
tm.assert_frame_equal(result, expected) + + # no missing values + df2 = df.fillna(data[0]) + table = pa.table(df2) + result = table.slice(2, None).to_pandas() + expected = df2.iloc[2:].reset_index(drop=True) + tm.assert_frame_equal(result, expected) From 4b8bfd3d8a8ae5a38dabcbd483806fa066500288 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 20 Apr 2021 14:12:32 +0200 Subject: [PATCH 2/3] check type and error + test --- pandas/core/arrays/boolean.py | 3 +++ .../tests/arrays/masked/test_arrow_compat.py | 20 +++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index d59f74ffd256f..0b789693e7f74 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -114,6 +114,9 @@ def __from_arrow__( """ import pyarrow + if array.type != pyarrow.bool_(): + raise TypeError(f"Expected array of boolean type, got {array.type} instead") + if isinstance(array, pyarrow.Array): chunks = [array] else: diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index ae5bec078734c..a63f9b195d80c 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -71,3 +71,23 @@ def test_arrow_sliced(data): result = table.slice(2, None).to_pandas() expected = df2.iloc[2:].reset_index(drop=True) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow", min_version="0.16.0") +def test_from_arrow_type_error(request, data): + # ensure that __from_arrow__ raises a TypeError when getting a wrong + # array type + import pyarrow as pa + + if data.dtype != "boolean": + # TODO numeric dtypes cast any incoming array to the correct dtype + # instead of erroring + request.node.add_marker( + pytest.mark.xfail(reason="numeric dtypes don't error but cast") + ) + + arr = pa.array(data).cast("string") + with pytest.raises(TypeError, match=None): + # we don't test the exact error 
message, only the fact that it raises + # a TypeError is relevant + data.dtype.__from_arrow__(arr) From 6c17271a9cde7dfe0194c53e102f7ca1cdf04733 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Apr 2021 15:32:17 +0200 Subject: [PATCH 3/3] add whatsnew note --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 85d9acff353be..74a302922c2aa 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -624,6 +624,7 @@ Performance improvements - Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`) - Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`) - Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`) +- Performance improvement in the conversion of a pyarrow boolean array to a pandas nullable boolean array (:issue:`41051`) - Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`) .. ---------------------------------------------------------------------------