Skip to content

Commit 7c89062

Browse files
[ArrowStringArray] use pyarrow.compute.utf8_length if available (#41248)
1 parent 08ce8df commit 7c89062

File tree

2 files changed

+19
-6
lines changed

2 files changed

+19
-6
lines changed

pandas/core/arrays/string_arrow.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@
4242
from pandas.core.arraylike import OpsMixin
4343
from pandas.core.arrays.base import ExtensionArray
4444
from pandas.core.arrays.boolean import BooleanDtype
45+
from pandas.core.arrays.integer import Int64Dtype
4546
from pandas.core.indexers import (
4647
check_array_indexer,
4748
validate_indices,
@@ -878,6 +879,14 @@ def _str_isupper(self):
878879
else:
879880
return super()._str_isupper()
880881

882+
def _str_len(self):
883+
# utf8_length added in pyarrow 4.0.0
884+
if hasattr(pc, "utf8_length"):
885+
result = pc.utf8_length(self._data)
886+
return Int64Dtype().__from_arrow__(result)
887+
else:
888+
return super()._str_len()
889+
881890
def _str_lower(self):
882891
return type(self)(pc.utf8_lower(self._data))
883892

pandas/tests/strings/test_strings.py

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,6 @@
1414
MultiIndex,
1515
Series,
1616
isna,
17-
notna,
1817
)
1918
import pandas._testing as tm
2019

@@ -402,14 +401,19 @@ def test_join():
402401
tm.assert_almost_equal(rs, xp)
403402

404403

405-
def test_len():
406-
values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"])
404+
def test_len(any_string_dtype):
405+
values = Series(
406+
["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"],
407+
dtype=any_string_dtype,
408+
)
407409

408410
result = values.str.len()
409-
exp = values.map(lambda x: len(x) if notna(x) else np.nan)
410-
tm.assert_series_equal(result, exp)
411+
expected_dtype = "float64" if any_string_dtype == "object" else "Int64"
412+
expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype)
413+
tm.assert_series_equal(result, expected)
411414

412-
# mixed
415+
416+
def test_len_mixed():
413417
mixed = Series(
414418
[
415419
"a_b",

0 commit comments

Comments
 (0)