Skip to content

Commit 533f54c

Browse files
Rohan Jain authored and committed
gh reference
1 parent 6d68683 commit 533f54c

File tree

3 files changed: 25 additions and 22 deletions

doc/source/whatsnew/v2.2.0.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -803,6 +803,7 @@ Strings
803803
- Bug in :meth:`DataFrame.reindex` not matching :class:`Index` with ``string[pyarrow_numpy]`` dtype (:issue:`56106`)
804804
- Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`)
805805
- Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`)
806+
- Bug in :meth:`Series.str.find` when ``start < 0`` and ``end < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56791`)
806807
- Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`)
807808
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`)
808809
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`)

pandas/core/arrays/arrow/array.py

Lines changed: 12 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -2331,23 +2331,20 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None) -> Self:
23312331
if (start == 0 or start is None) and end is None:
23322332
result = pc.find_substring(self._pa_array, sub)
23332333
else:
2334-
result = pc.find_substring(self._pa_array, sub)
2335-
length = pc.utf8_length(self._pa_array)
23362334
if start is None:
2337-
start = pa.scalar(0, result.type)
2335+
start_offset = 0
2336+
start = 0
23382337
elif start < 0:
2339-
start = pc.add(start, length)
2340-
if end is None:
2341-
end = length
2342-
elif end < 0:
2343-
end = pc.add(end, length)
2344-
found = pc.not_equal(pa.scalar(-1, type=result.type), result)
2345-
found_in_bounds = pc.and_(
2346-
pc.greater_equal(result, start), pc.less(result, end)
2347-
)
2348-
result = pc.if_else(
2349-
pc.and_(found, found_in_bounds), result, pa.scalar(-1, type=result.type)
2350-
)
2338+
start_offset = pc.add(start, pc.utf8_length(self._pa_array))
2339+
start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset)
2340+
else:
2341+
start_offset = start
2342+
slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
2343+
result = pc.find_substring(slices, sub)
2344+
not_found = pc.equal(result, pa.scalar(-1, type=result.type))
2345+
2346+
offset_result = pc.add(result, start_offset)
2347+
result = pc.if_else(not_found, result, offset_result)
23512348
return type(self)(result)
23522349

23532350
def _str_join(self, sep: str) -> Self:

pandas/tests/extension/test_arrow.py

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1919,29 +1919,34 @@ def test_str_fullmatch(pat, case, na, exp):
19191919

19201920

19211921
@pytest.mark.parametrize(
1922-
"sub, start, end, exp",
1923-
[["ab", 0, None, [0, None]], ["bc", 1, 3, [1, None]], ["ab", 1, None, [-1, None]]],
1922+
"sub, start, end, exp, exp_type",
1923+
[
1924+
["ab", 0, None, [0, None], pa.int32()],
1925+
["bc", 1, 3, [1, None], pa.int64()],
1926+
["ab", 1, None, [-1, None], pa.int64()],
1927+
["ab", -3, -3, [-1, None], pa.int64()],
1928+
],
19241929
)
1925-
def test_str_find(sub, start, end, exp):
1930+
def test_str_find(sub, start, end, exp, exp_type):
19261931
ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
19271932
result = ser.str.find(sub, start=start, end=end)
1928-
expected = pd.Series(exp, dtype=ArrowDtype(pa.int32()))
1933+
expected = pd.Series(exp, dtype=ArrowDtype(exp_type))
19291934
tm.assert_series_equal(result, expected)
19301935

19311936

19321937
def test_str_find_negative_start():
19331938
# GH 56411
19341939
ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
19351940
result = ser.str.find(sub="b", start=-1000, end=3)
1936-
expected = pd.Series([1, None], dtype=ArrowDtype(pa.int32()))
1941+
expected = pd.Series([1, None], dtype=ArrowDtype(pa.int64()))
19371942
tm.assert_series_equal(result, expected)
19381943

19391944

19401945
def test_str_find_negative_start_negative_end():
1941-
# GH 56411
1946+
# GH 56791
19421947
ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
19431948
result = ser.str.find(sub="d", start=-6, end=-3)
1944-
expected = pd.Series([3, None], dtype=ArrowDtype(pa.int32()))
1949+
expected = pd.Series([3, None], dtype=ArrowDtype(pa.int64()))
19451950
tm.assert_series_equal(result, expected)
19461951

19471952

0 commit comments

Comments
 (0)