diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index fabb3974728de..4dc291c0fe084 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -41,6 +41,7 @@ from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.integer import Int64Dtype from pandas.core.indexers import ( check_array_indexer, validate_indices, @@ -858,6 +859,14 @@ def _str_isupper(self): else: return super()._str_isupper() + def _str_len(self): + # utf8_length added in pyarrow 4.0.0 + if hasattr(pc, "utf8_length"): + result = pc.utf8_length(self._data) + return Int64Dtype().__from_arrow__(result) + else: + return super()._str_len() + def _str_lower(self): return type(self)(pc.utf8_lower(self._data)) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index f218d5333b415..a013dd6ed4df1 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -14,7 +14,6 @@ MultiIndex, Series, isna, - notna, ) import pandas._testing as tm @@ -402,14 +401,19 @@ def test_join(): tm.assert_almost_equal(rs, xp) -def test_len(): - values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"]) +def test_len(any_string_dtype): + values = Series( + ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"], + dtype=any_string_dtype, + ) result = values.str.len() - exp = values.map(lambda x: len(x) if notna(x) else np.nan) - tm.assert_series_equal(result, exp) + expected_dtype = "float64" if any_string_dtype == "object" else "Int64" + expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) + tm.assert_series_equal(result, expected) - # mixed + +def test_len_mixed(): mixed = Series( [ "a_b",