Skip to content

Commit 7878ee4

Browse files
sjathin and czgdp1807 authored
Implementation of the Knuth-Morris-Pratt (KMP) string matching algorithm (#403)
Co-authored-by: Gagandeep Singh <gdp.1807@gmail.com>
1 parent 7e501c2 commit 7878ee4

File tree

4 files changed

+185
-2
lines changed

4 files changed

+185
-2
lines changed

pydatastructs/strings/__init__.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,18 @@
11
__all__ = []
22

3-
from . import trie
3+
from . import (
4+
trie,
5+
algorithms
6+
)
7+
48
from .trie import (
59
Trie
610
)
711

812
__all__.extend(trie.__all__)
13+
14+
from .algorithms import (
15+
find
16+
)
17+
18+
__all__.extend(algorithms.__all__)

pydatastructs/strings/algorithms.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
from pydatastructs.linear_data_structures.arrays import (
2+
DynamicOneDimensionalArray, OneDimensionalArray)
3+
4+
__all__ = [
5+
'find'
6+
]
7+
8+
def find(text, query, algorithm):
    """
    Finds all occurrences of a query string within the text string.

    Parameters
    ==========

    text: str
        The string on which query is to be performed.
    query: str
        The string which is to be searched in the text.
    algorithm: str
        The algorithm which should be used for
        searching.
        Currently the following algorithms are
        supported,
        'kmp' -> Knuth-Morris-Pratt as given in [1].

    Returns
    =======

    DynamicOneDimensionalArray
        An array of starting positions of the portions
        in the text which match with the given query.

    Raises
    ======

    NotImplementedError
        If this module has no attribute named
        ``"_" + algorithm``.

    Examples
    ========

    >>> from pydatastructs.strings.algorithms import find
    >>> text = "abcdefabcabe"
    >>> pos = find(text, "ab", algorithm="kmp")
    >>> str(pos)
    "['0', '6', '9']"
    >>> pos = find(text, "abc", algorithm="kmp")
    >>> str(pos)
    "['0', '6']"
    >>> pos = find(text, "abe", algorithm="kmp")
    >>> str(pos)
    "['9']"
    >>> pos = find(text, "abed", algorithm="kmp")
    >>> str(pos)
    '[]'

    References
    ==========

    .. [1] https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm
    """
    # The module imports itself so the implementation can be looked up by
    # name: the algorithm string is prefixed with "_" and resolved as a
    # module attribute (e.g. 'kmp' -> '_kmp').
    import pydatastructs.strings.algorithms as algorithms
    func = "_" + algorithm
    if not hasattr(algorithms, func):
        raise NotImplementedError(
            "Currently %s algorithm for searching strings "
            "inside a text isn't implemented yet."
            %(algorithm))
    return getattr(algorithms, func)(text, query)
64+
65+
66+
def _knuth_morris_pratt(text, query):
    """
    Searches ``text`` for ``query`` using the Knuth-Morris-Pratt
    algorithm.

    Returns a DynamicOneDimensionalArray with the starting index of
    every match. An empty query is reported as having no occurrences.
    """
    # An empty query cannot go through the normal path: building the
    # table writes to index 1 of a table that has only len(query) + 1
    # slots, and matching reads query[0]. Answer it here instead.
    if len(query) == 0:
        return DynamicOneDimensionalArray(int, 0)
    kmp_table = _build_kmp_table(query)
    return _do_match(text, query, kmp_table)

# ``find`` resolves implementations as "_" + algorithm, so this alias is
# what makes algorithm="kmp" reach the function above.
_kmp = _knuth_morris_pratt
71+
72+
def _build_kmp_table(query):
    """
    Builds the fallback table used by the KMP matcher for ``query``.

    The table has ``len(query) + 1`` entries. Entry ``i`` is the index
    in ``query`` at which matching resumes after a mismatch at ``i``
    (-1 means "advance in the text and restart the query"); the last
    entry is the resume index used after a complete match.
    """
    query_len = len(query)
    table = OneDimensionalArray(int, query_len + 1)
    table[0] = -1

    idx = 1
    candidate = 0
    while idx < query_len:
        if query[idx] != query[candidate]:
            table[idx] = candidate
            # Walk back through shorter candidates until one can be
            # extended by query[idx] or none is left (candidate == -1).
            while candidate >= 0 and query[idx] != query[candidate]:
                candidate = table[candidate]
        else:
            # Same character as at the candidate index, so a mismatch
            # here falls back to wherever the candidate would.
            table[idx] = table[candidate]
        idx += 1
        candidate += 1

    # Extra slot, read by the matcher right after a full match.
    table[idx] = candidate
    return table
89+
90+
91+
92+
def _do_match(string, query, kmp_table):
    """
    Scans ``string`` for ``query`` using the precomputed ``kmp_table``.

    Returns a DynamicOneDimensionalArray containing the starting index
    of each match, in the order the matches are found.
    """
    text_len, query_len = len(string), len(query)
    matches = DynamicOneDimensionalArray(int, 0)
    text_idx = query_idx = 0

    while text_idx < text_len:
        if query[query_idx] != string[text_idx]:
            # Mismatch: resume from the table's fallback index.
            query_idx = kmp_table[query_idx]
            if query_idx < 0:
                # No fallback left; move on in the text and restart
                # the query from its first character.
                text_idx += 1
                query_idx += 1
            continue

        text_idx += 1
        query_idx += 1
        if query_idx == query_len:
            # Whole query matched, ending just before text_idx.
            matches.append(text_idx - query_len)
            query_idx = kmp_table[query_idx]

    return matches
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from pydatastructs.strings import find
2+
3+
import random, string
4+
5+
def test_kmp():
    """Runs the shared string-matching checks with the 'kmp' algorithm."""
    _test_common_string_matching(algorithm='kmp')
7+
8+
9+
def _test_common_string_matching(algorithm):
    """
    Runs the shared string-matching checks against ``find`` using the
    given ``algorithm``.

    Covers hand-written texts that contain the query, hand-written
    texts that do not, and seeded random texts built around a random
    query. Reported positions are compared with a brute-force search.
    """
    def expected_positions(text, query):
        # Brute-force reference: every index at which query starts.
        return [start for start in range(len(text) - len(query) + 1)
                if text.startswith(query, start)]

    def found_positions(positions):
        # _num is the number of stored matches (it is compared with the
        # expected match count below), so it bounds the valid indices.
        # NOTE(review): the previous loops used _last_pos_filled as the
        # bound, which presumably is _num - 1 and so skipped the last
        # match -- confirm against DynamicOneDimensionalArray.
        return [positions[i] for i in range(positions._num)]

    true_text_pattern_dictionary = {
        "Knuth-Morris-Pratt": "-Morris-",
        "abcabcabcabdabcabdabcabca": "abcabdabcabca",
        "aefcdfaecdaefaefcdaefeaefcdcdeae": "aefcdaefeaefcd",
        "aaaaaaaa": "aaa",
        "fullstringmatch": "fullstringmatch"
    }
    for text, query in true_text_pattern_dictionary.items():
        positions = find(text, query, algorithm)
        # Every text above contains its query, so an empty result is a
        # failure rather than a vacuous pass.
        assert positions._num > 0
        assert found_positions(positions) == expected_positions(text, query)

    false_text_pattern_dictionary = {
        "Knuth-Morris-Pratt": "-Pratt-",
        "abcabcabcabdabcabdabcabca": "qwertyuiopzxcvbnm",
        "aefcdfaecdaefaefcdaefeaefcdcdeae": "cdaefaefe",
        "fullstringmatch": "fullstrinmatch"
    }

    for text, query in false_text_pattern_dictionary.items():
        positions = find(text, query, algorithm)
        assert positions.size == 0

    # Fixed seed keeps the random cases reproducible.
    random.seed(1000)

    def gen_random_string(length):
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(random.choices(alphabet, k=length))

    for _ in range(100):
        query = gen_random_string(random.randint(3, 10))
        num_times = random.randint(1, 10)
        freq = 0
        text = ""
        # Each accepted filler contributes two copies of the query.
        while freq < num_times:
            rand_str = gen_random_string(random.randint(5, 10))
            if rand_str != query:
                freq += 1
                text += query + rand_str + query
        # Use the algorithm under test instead of a hard-coded "kmp".
        positions = find(text, query, algorithm)
        assert positions._num == num_times * 2
        assert found_positions(positions) == expected_positions(text, query)

        text = gen_random_string(len(query))
        if text != query:
            positions = find(text, query, algorithm)
            assert positions.size == 0

pydatastructs/utils/tests/test_code_quality.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def test_trailing_white_spaces():
2222
while line != "":
2323
if line.endswith(" \n") or line.endswith("\t\n") \
2424
or line.endswith(" ") or line.endswith("\t"):
25-
assert False, "%s contains trailing whitespace at line number %d: %s"\
25+
assert False, "%s:%d : %s"\
2626
%(file_path, line_number, line)
2727
line = file.readline()
2828
line_number += 1

0 commit comments

Comments
 (0)