Skip to content

Commit 3dab96d

Browse files
committed
BUG: Handle zero-chunked pyarrow.ChunkedArray in StringArray
1 parent ccb90d6 commit 3dab96d

File tree

3 files changed

+21
-2
lines changed

3 files changed

+21
-2
lines changed

doc/source/whatsnew/v1.3.0.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -700,7 +700,7 @@ Conversion
700700
Strings
701701
^^^^^^^
702702

703-
-
703+
- Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`StringArray` when the original had zero chunks (:issue:`41040`)
704704
-
705705

706706
Interval

pandas/core/arrays/string_.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,10 @@ def __from_arrow__(
118118
str_arr = StringArray._from_sequence(np.array(arr))
119119
results.append(str_arr)
120120

121-
return StringArray._concat_same_type(results)
121+
if len(results) > 0:
122+
return StringArray._concat_same_type(results)
123+
else:
124+
return StringArray(np.array([], dtype="object"))
122125

123126

124127
class StringArray(PandasArray):

pandas/tests/arrays/string_/test_string.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -476,6 +476,22 @@ def test_arrow_roundtrip(dtype, dtype_object):
476476
assert result.loc[2, "a"] is pd.NA
477477

478478

479+
@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
480+
def test_arrow_load_from_zero_chunks(dtype, dtype_object):
481+
# GH-41040
482+
import pyarrow as pa
483+
484+
data = pd.array([], dtype=dtype)
485+
df = pd.DataFrame({"a": data})
486+
table = pa.table(df)
487+
assert table.field("a").type == "string"
488+
# Instantiate the same table with no chunks at all
489+
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
490+
result = table.to_pandas()
491+
assert isinstance(result["a"].dtype, dtype_object)
492+
tm.assert_frame_equal(result, df)
493+
494+
479495
def test_value_counts_na(dtype):
480496
arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
481497
result = arr.value_counts(dropna=False)

0 commit comments

Comments
 (0)