Skip to content

String searching is slow #14107

Closed
Closed
@huonw

Description

@huonw

The current implement of functions like StrSlice.contains are naive string searches, not using a more intelligent algorithm like Boyer-Moore (with a comment referring one to #1932).

Unfortunately that means we're trounced when performing operations like that. e.g.

// contains_string.rs
extern crate test;

fn main() {
    let haystack = std::io::stdin().read_to_str().unwrap();

    let needle = "Anything whatsoever related to the Rust programming language: an open-source systems programming language from Mozilla, emphasizing safety, concurrency, and speed.";

    println!("{} {}", haystack.len(), needle.len());
    for _ in range(0, 1000) {
        test::black_box(haystack.contains(needle))
    }
}
// contains_string.c
#include<string.h>
#include<stdio.h>
#include<unistd.h>
#include<stdlib.h>

// 16MB
#define N (1 << 24)

int main() {
    char * haystack = calloc(N, 1);

    read(0, haystack, N - 1);

    const char * needle = "Anything whatsoever related to the Rust programming language: an open-source systems programming language from Mozilla, emphasizing safety, concurrency, and speed.";

    printf("%d %d", strlen(haystack), strlen(needle));
    int count = 0;
    for (int i = 0; i < 1000; i ++) {
        haystack[N - 1] = 0;
        count += strstr(haystack, needle) == NULL;
    }

    return count;
}
// contains_string.py
import sys

haystack = sys.stdin.read()
needle = "Anything whatsoever related to the Rust programming language: an open-source systems programming language from Mozilla, emphasizing safety, concurrency, and speed."

print(len(haystack), len(needle))
for _ in range(0, 1000):
    needle in haystack
$ wget http://www.gutenberg.org/cache/epub/1342/pg1342.txt # pride and prejudice (700 kb)
[...snip...]
$ rustc -O contains_string.rs

$ gcc -O contains_string.c -o contains_string_c -std=gnu11

$ time ./contains_string < pg1342.txt 
717569 163

real    0m1.041s
user    0m1.036s
sys     0m0.000s

$ time ./contains_string_c < pg1342.txt 
717569 163
real    0m0.064s
user    0m0.056s
sys     0m0.004s

$ time python ./contains_string.py < pg1342.txt 
(717569, 163)

real    0m0.155s
user    0m0.152s
sys     0m0.000s

General notes:

Metadata

Metadata

Assignees

No one assigned

    Labels

    I-slowIssue: Problems and improvements with respect to performance of generated code.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions