-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Series.str.find fix for pd.ArrowDtype(pa.string()) #56792
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6d68683
533f54c
beaf5f0
68763b1
52a6f52
771e2f6
0de96cb
f5416e3
06105ce
7fa21eb
8e27d85
403bc4f
58b4a94
ef9a791
2cf0fbf
38b6055
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2364,20 +2364,26 @@ def _str_fullmatch( | |
return self._str_match(pat, case, flags, na) | ||
|
||
def _str_find(self, sub: str, start: int = 0, end: int | None = None) -> Self: | ||
if start != 0 and end is not None: | ||
if (start == 0 or start is None) and end is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Any chance you've looked at using a slice object here already? It feels like that could help simplify a lot of the branching being done here for things being None / 0. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think a slice object would help here, since I believe that for this method we need a non-negative start index in order to calculate the offset of the substring within the original string. This PR essentially converts start into an equivalent but non-negative index, which can then be added to the index returned by pc.find_substring. For example, if the string is "abc" and start is -1, we calculate start_offset as 2, which is the equivalent positive index. We can then use this offset to find the index of the substring in the original string, by adding it to the position of the substring in the sliced result. |
||
result = pc.find_substring(self._pa_array, sub) | ||
else: | ||
if sub == "": | ||
# GH 56792 | ||
result = self._apply_elementwise(lambda val: val.find(sub, start, end)) | ||
return type(self)(pa.chunked_array(result)) | ||
if start is None: | ||
start_offset = 0 | ||
start = 0 | ||
elif start < 0: | ||
start_offset = pc.add(start, pc.utf8_length(self._pa_array)) | ||
start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset) | ||
else: | ||
start_offset = start | ||
slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) | ||
result = pc.find_substring(slices, sub) | ||
not_found = pc.equal(result, -1) | ||
start_offset = max(0, start) | ||
found = pc.not_equal(result, pa.scalar(-1, type=result.type)) | ||
offset_result = pc.add(result, start_offset) | ||
result = pc.if_else(not_found, result, offset_result) | ||
elif start == 0 and end is None: | ||
slices = self._pa_array | ||
result = pc.find_substring(slices, sub) | ||
else: | ||
raise NotImplementedError( | ||
f"find not implemented with {sub=}, {start=}, {end=}" | ||
) | ||
result = pc.if_else(found, offset_result, -1) | ||
return type(self)(result) | ||
|
||
def _str_join(self, sep: str) -> Self: | ||
|
Uh oh!
There was an error while loading. Please reload this page.