Skip to content

Commit a8f1152

Browse files
author
d.grigonis
authored
gh-119879: str.find(): Utilize last character gap for two-way periodic needles (#119880)
1 parent 8d63c8d commit a8f1152

File tree

2 files changed

+36
-28
lines changed

2 files changed

+36
-28
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
String search is now slightly faster in certain cases: the two-way algorithm now also utilizes the last-character gap (good suffix rule) when the needle is periodic.

Objects/stringlib/fastsearch.h

Lines changed: 35 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle,
256256
257257
The local period of the cut is the minimal length of a string w
258258
such that (left endswith w or w endswith left)
259-
and (right startswith w or w startswith left).
259+
and (right startswith w or w startswith right).
260260
261261
The Critical Factorization Theorem says that this maximal local
262262
period is the global period of the string.
@@ -337,21 +337,20 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle,
337337
if (p->is_periodic) {
338338
assert(p->cut <= len_needle/2);
339339
assert(p->cut < p->period);
340-
p->gap = 0; // unused
341340
}
342341
else {
343342
// A lower bound on the period
344343
p->period = Py_MAX(p->cut, len_needle - p->cut) + 1;
345-
// The gap between the last character and the previous
346-
// occurrence of an equivalent character (modulo TABLE_SIZE)
347-
p->gap = len_needle;
348-
STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK;
349-
for (Py_ssize_t i = len_needle - 2; i >= 0; i--) {
350-
STRINGLIB_CHAR x = needle[i] & TABLE_MASK;
351-
if (x == last) {
352-
p->gap = len_needle - 1 - i;
353-
break;
354-
}
344+
}
345+
// The gap between the last character and the previous
346+
// occurrence of an equivalent character (modulo TABLE_SIZE)
347+
p->gap = len_needle;
348+
STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK;
349+
for (Py_ssize_t i = len_needle - 2; i >= 0; i--) {
350+
STRINGLIB_CHAR x = needle[i] & TABLE_MASK;
351+
if (x == last) {
352+
p->gap = len_needle - 1 - i;
353+
break;
355354
}
356355
}
357356
// Fill up a compressed Boyer-Moore "Bad Character" table
@@ -383,6 +382,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
383382
const STRINGLIB_CHAR *window;
384383
LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack);
385384

385+
Py_ssize_t gap = p->gap;
386+
Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap);
386387
if (p->is_periodic) {
387388
LOG("Needle is periodic.\n");
388389
Py_ssize_t memory = 0;
@@ -408,8 +409,16 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
408409
Py_ssize_t i = Py_MAX(cut, memory);
409410
for (; i < len_needle; i++) {
410411
if (needle[i] != window[i]) {
411-
LOG("Right half does not match.\n");
412-
window_last += i - cut + 1;
412+
if (i < gap_jump_end) {
413+
LOG("Early right half mismatch: jump by gap.\n");
414+
assert(gap >= i - cut + 1);
415+
window_last += gap;
416+
}
417+
else {
418+
LOG("Late right half mismatch: jump by n (>gap)\n");
419+
assert(i - cut + 1 > gap);
420+
window_last += i - cut + 1;
421+
}
413422
memory = 0;
414423
goto periodicwindowloop;
415424
}
@@ -442,10 +451,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
442451
}
443452
}
444453
else {
445-
Py_ssize_t gap = p->gap;
446454
period = Py_MAX(gap, period);
447455
LOG("Needle is not periodic.\n");
448-
Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap);
449456
windowloop:
450457
while (window_last < haystack_end) {
451458
for (;;) {
@@ -463,19 +470,19 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
463470
window = window_last - len_needle + 1;
464471
assert((window[len_needle - 1] & TABLE_MASK) ==
465472
(needle[len_needle - 1] & TABLE_MASK));
466-
for (Py_ssize_t i = cut; i < gap_jump_end; i++) {
467-
if (needle[i] != window[i]) {
468-
LOG("Early right half mismatch: jump by gap.\n");
469-
assert(gap >= i - cut + 1);
470-
window_last += gap;
471-
goto windowloop;
472-
}
473-
}
474-
for (Py_ssize_t i = gap_jump_end; i < len_needle; i++) {
473+
Py_ssize_t i = cut;
474+
for (; i < len_needle; i++) {
475475
if (needle[i] != window[i]) {
476-
LOG("Late right half mismatch.\n");
477-
assert(i - cut + 1 > gap);
478-
window_last += i - cut + 1;
476+
if (i < gap_jump_end) {
477+
LOG("Early right half mismatch: jump by gap.\n");
478+
assert(gap >= i - cut + 1);
479+
window_last += gap;
480+
}
481+
else {
482+
LOG("Late right half mismatch: jump by n (>gap)\n");
483+
assert(i - cut + 1 > gap);
484+
window_last += i - cut + 1;
485+
}
479486
goto windowloop;
480487
}
481488
}

0 commit comments

Comments
 (0)