Closed
Description
The current implement of functions like StrSlice.contains
are naive string searches, not using a more intelligent algorithm like Boyer-Moore (with a comment referring one to #1932).
Unfortunately that means we're trounced when performing operations like that. e.g.
// contains_string.rs
extern crate test;
fn main() {
let haystack = std::io::stdin().read_to_str().unwrap();
let needle = "Anything whatsoever related to the Rust programming language: an open-source systems programming language from Mozilla, emphasizing safety, concurrency, and speed.";
println!("{} {}", haystack.len(), needle.len());
for _ in range(0, 1000) {
test::black_box(haystack.contains(needle))
}
}
// contains_string.c
#include<string.h>
#include<stdio.h>
#include<unistd.h>
#include<stdlib.h>
// 16MB
#define N (1 << 24)
int main() {
char * haystack = calloc(N, 1);
read(0, haystack, N - 1);
const char * needle = "Anything whatsoever related to the Rust programming language: an open-source systems programming language from Mozilla, emphasizing safety, concurrency, and speed.";
printf("%d %d", strlen(haystack), strlen(needle));
int count = 0;
for (int i = 0; i < 1000; i ++) {
haystack[N - 1] = 0;
count += strstr(haystack, needle) == NULL;
}
return count;
}
// contains_string.py
import sys
haystack = sys.stdin.read()
needle = "Anything whatsoever related to the Rust programming language: an open-source systems programming language from Mozilla, emphasizing safety, concurrency, and speed."
print(len(haystack), len(needle))
for _ in range(0, 1000):
needle in haystack
$ wget http://www.gutenberg.org/cache/epub/1342/pg1342.txt # pride and prejudice (700 kb)
[...snip...]
$ rustc -O contains_string.rs
$ gcc -O contains_string.c -o contains_string_c -std=gnu11
$ time ./contains_string < pg1342.txt
717569 163
real 0m1.041s
user 0m1.036s
sys 0m0.000s
$ time ./contains_string_c < pg1342.txt
717569 163
real 0m0.064s
user 0m0.056s
sys 0m0.004s
$ time python ./contains_string.py < pg1342.txt
(717569, 163)
real 0m0.155s
user 0m0.152s
sys 0m0.000s
General notes:
- glibc's
strstr
: strstr.c and str-two-way.h - A cursory search found this article which might be interesting to experiment with.
- This came up on reddit (discussion)