From 3dab96d87f3eeba1b710e09624ff9e3730121f07 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Tue, 20 Apr 2021 09:59:53 +0200 Subject: [PATCH 1/5] BUG: Handle zero-chunked pyarrow.ChunkedArray in StringArray --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/arrays/string_.py | 5 ++++- pandas/tests/arrays/string_/test_string.py | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 85d9acff353be..6a0d046aef779 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -700,7 +700,7 @@ Conversion Strings ^^^^^^^ -- +- Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`StringArray` when the original had zero chunks (:issue:`41040`) - Interval diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 600aacec9c87a..49b5bb137cb9a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -118,7 +118,10 @@ def __from_arrow__( str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) - return StringArray._concat_same_type(results) + if len(results) > 0: + return StringArray._concat_same_type(results) + else: + return StringArray(np.array([], dtype="object")) class StringArray(PandasArray): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 2b2db49c62ba2..e2d8e522abb35 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -476,6 +476,22 @@ def test_arrow_roundtrip(dtype, dtype_object): assert result.loc[2, "a"] is pd.NA +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_load_from_zero_chunks(dtype, dtype_object): + # GH-41040 + import pyarrow as pa + + data = pd.array([], dtype=dtype) + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == "string" + # Instantiate the same table with no chunks at all + table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) + result = table.to_pandas() + assert isinstance(result["a"].dtype, dtype_object) + tm.assert_frame_equal(result, df) + + def test_value_counts_na(dtype): arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False) From e97ad971c691e77e0c08ccfa725f71d19f5c6a14 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Tue, 20 Apr 2021 10:34:47 +0200 Subject: [PATCH 2/5] Apply the same fix for masked types --- pandas/core/arrays/boolean.py | 7 ++++++- pandas/core/arrays/numeric.py | 6 +++++- pandas/tests/arrays/masked/test_arrow_compat.py | 16 ++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 0a0bfccc0ea15..56065c8c8f6f3 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -126,7 +126,12 @@ def __from_arrow__( bool_arr = BooleanArray._from_sequence(np.array(arr)) results.append(bool_arr) - return BooleanArray._concat_same_type(results) + if len(results) == 0: + return BooleanArray( + np.array([], dtype=np.bool_), np.array([], dtype=np.bool_) + ) + else: + return BooleanArray._concat_same_type(results) def coerce_to_array( diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 4908000a68810..89e04fc4098b9 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -66,7 +66,11 @@ def __from_arrow__( num_arr = array_class(data.copy(), ~mask, copy=False) results.append(num_arr) - if len(results) == 1: + if len(results) == 0: + return array_class( + np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_) + ) + elif len(results) == 1: # avoid additional copy in _concat_same_type return results[0] else: diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 8bb32dec2cc0e..ec5794a34ac45 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -41,6 +41,22 @@ def test_arrow_roundtrip(data): tm.assert_frame_equal(result, df) +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_load_from_zero_chunks(data): + # GH-41040 + import pyarrow as pa + + df = pd.DataFrame({"a": data[0:0]}) + table = pa.table(df) + assert table.field("a").type == str(data.dtype.numpy_dtype) + table = pa.table( + [pa.chunked_array([], type=table.field("a").type)], schema=table.schema + ) + result = table.to_pandas() + assert result["a"].dtype == data.dtype + tm.assert_frame_equal(result, df) + + @td.skip_if_no("pyarrow", min_version="0.16.0") def test_arrow_from_arrow_uint(): # https://github.com/pandas-dev/pandas/issues/31896 From f4a2c8c90b5e8c0218869c5eff30354603b3bca5 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Tue, 20 Apr 2021 12:22:58 +0200 Subject: [PATCH 3/5] Use `not results` --- pandas/core/arrays/boolean.py | 2 +- pandas/core/arrays/numeric.py | 2 +- pandas/core/arrays/string_.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 56065c8c8f6f3..6271a13875371 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -126,7 +126,7 @@ def __from_arrow__( bool_arr = BooleanArray._from_sequence(np.array(arr)) results.append(bool_arr) - if len(results) == 0: + if not results: return BooleanArray( np.array([], dtype=np.bool_), np.array([], dtype=np.bool_) ) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 89e04fc4098b9..bc467e93c2c2c 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -66,7 +66,7 @@ def __from_arrow__( num_arr = array_class(data.copy(), ~mask, copy=False) results.append(num_arr) - if len(results) == 0: + if not results: return array_class( np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_) ) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 49b5bb137cb9a..6954b512c7ad0 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -118,7 +118,7 @@ def __from_arrow__( str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) - if len(results) > 0: + if results: return StringArray._concat_same_type(results) else: return StringArray(np.array([], dtype="object")) From 58736a676eb5d0458acd0fbba21ec905fabe7d52 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 21 Apr 2021 15:45:08 +0200 Subject: [PATCH 4/5] Update doc/source/whatsnew/v1.3.0.rst Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 6a0d046aef779..81f5ad34c53a0 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -700,7 +700,7 @@ Conversion Strings ^^^^^^^ -- Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`StringArray` when the original had zero chunks (:issue:`41040`) +- Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`~arrays.StringArray` when the original had zero chunks (:issue:`41040`) - Interval From 5ce19cf1b3c121a6c3f1e2cfe828627051241704 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 21 Apr 2021 16:17:26 +0200 Subject: [PATCH 5/5] Fix IntervalArray and PeriodArray --- pandas/core/dtypes/dtypes.py | 8 ++++++++ pandas/tests/arrays/interval/test_interval.py | 7 +++++++ .../tests/arrays/period/test_arrow_compat.py | 20 +++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 84eede019251b..e09c24c94992d 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1005,6 +1005,8 @@ def __from_arrow__( parr[~mask] = NaT results.append(parr) + if not results: + return PeriodArray(np.array([], dtype="int64"), freq=self.freq, copy=False) return PeriodArray._concat_same_type(results) @@ -1238,6 +1240,12 @@ def __from_arrow__( iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed) results.append(iarr) + if not results: + return IntervalArray.from_arrays( + np.array([], dtype=self.subtype), + np.array([], dtype=self.subtype), + closed=array.type.closed, + ) return IntervalArray._concat_same_type(results) def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index d8fca91c5516a..fde45a1e39bb2 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -271,6 +271,13 @@ def test_arrow_table_roundtrip(breaks): expected = pd.concat([df, df], ignore_index=True) tm.assert_frame_equal(result, expected) + # GH-41040 + table = pa.table( + [pa.chunked_array([], type=table.column(0).type)], schema=table.schema + ) + result = table.to_pandas() + tm.assert_frame_equal(result, expected[0:0]) + @pyarrow_skip @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index f4e803cf4405f..398972a682504 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -100,6 +100,26 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) +@pyarrow_skip +def test_arrow_load_from_zero_chunks(): + # GH-41040 + import pyarrow as pa + + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + arr = PeriodArray([], freq="D") + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowPeriodType) + table = pa.table( + [pa.chunked_array([], type=table.column(0).type)], schema=table.schema + ) + result = table.to_pandas() + assert isinstance(result["a"].dtype, PeriodDtype) + tm.assert_frame_equal(result, df) + + @pyarrow_skip def test_arrow_table_roundtrip_without_metadata(): import pyarrow as pa