diff --git a/pydatastructs/strings/__init__.py b/pydatastructs/strings/__init__.py
index 1ee05158f..33930b426 100644
--- a/pydatastructs/strings/__init__.py
+++ b/pydatastructs/strings/__init__.py
@@ -1,8 +1,18 @@
 __all__ = []
 
-from . import trie
+from . import (
+    trie,
+    algorithms
+)
+
 from .trie import (
     Trie
 )
 
 __all__.extend(trie.__all__)
+
+from .algorithms import (
+    find
+)
+
+__all__.extend(algorithms.__all__)
diff --git a/pydatastructs/strings/algorithms.py b/pydatastructs/strings/algorithms.py
new file mode 100644
--- /dev/null
+++ b/pydatastructs/strings/algorithms.py
@@ -0,0 +1,148 @@
+from pydatastructs.linear_data_structures.arrays import (
+    DynamicOneDimensionalArray, OneDimensionalArray)
+
+__all__ = [
+    'find'
+]
+
+def find(text, query, algorithm):
+    """
+    Finds occurrence of a query string within the text string.
+
+    Parameters
+    ==========
+
+    text: str
+        The string on which query is to be performed.
+    query: str
+        The string which is to be searched in the text.
+    algorithm: str
+        The algorithm which should be used for
+        searching.
+        Currently the following algorithms are
+        supported,
+        'kmp' -> Knuth-Morris-Pratt as given in [1].
+
+    Returns
+    =======
+
+    DynamicOneDimensionalArray
+        An array of starting positions of the portions
+        in the text which match with the given query.
+        Overlapping matches are included. An empty
+        query yields an empty array.
+
+    Raises
+    ======
+
+    NotImplementedError
+        If no implementation is available for the
+        requested algorithm.
+
+    Examples
+    ========
+
+    >>> from pydatastructs.strings.algorithms import find
+    >>> text = "abcdefabcabe"
+    >>> pos = find(text, "ab", algorithm="kmp")
+    >>> str(pos)
+    "['0', '6', '9']"
+    >>> pos = find(text, "abc", algorithm="kmp")
+    >>> str(pos)
+    "['0', '6']"
+    >>> pos = find(text, "abe", algorithm="kmp")
+    >>> str(pos)
+    "['9']"
+    >>> pos = find(text, "abed", algorithm="kmp")
+    >>> str(pos)
+    '[]'
+
+    References
+    ==========
+
+    .. [1] https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm
+    """
+    import pydatastructs.strings.algorithms as algorithms
+    # Implementations are module level callables named
+    # "_<algorithm>" (for example, `_kmp`).
+    func = "_" + algorithm
+    if not hasattr(algorithms, func):
+        raise NotImplementedError(
+            "Currently %s algorithm for searching strings "
+            "inside a text isn't implemented yet."
+            %(algorithm))
+    return getattr(algorithms, func)(text, query)
+
+
+def _knuth_morris_pratt(text, query):
+    """
+    Returns the starting positions of all the occurrences
+    of query in text using Knuth-Morris-Pratt matching.
+    """
+    if len(query) == 0:
+        # Both the table construction and the matching loop
+        # index into the query, so an empty query is answered
+        # directly with no matches.
+        return DynamicOneDimensionalArray(int, 0)
+    kmp_table = _build_kmp_table(query)
+    return _do_match(text, query, kmp_table)
+
+_kmp = _knuth_morris_pratt
+
+def _build_kmp_table(query):
+    """
+    Builds the partial match table of a non-empty query.
+
+    Entry i is the position in the query from which matching
+    resumes after a mismatch at position i, -1 meaning that
+    the text should advance and matching restart from the
+    beginning of the query. The extra entry at len(query)
+    is used to continue the search after a complete match.
+    """
+    pos, cnd = 1, 0
+    kmp_table = OneDimensionalArray(int, len(query) + 1)
+
+    kmp_table[0] = -1
+
+    while pos < len(query):
+        if query[pos] == query[cnd]:
+            kmp_table[pos] = kmp_table[cnd]
+        else:
+            kmp_table[pos] = cnd
+            # Fall back to shorter candidate prefixes until
+            # one can be extended by query[pos].
+            while cnd >= 0 and query[pos] != query[cnd]:
+                cnd = kmp_table[cnd]
+        pos, cnd = pos + 1, cnd + 1
+    kmp_table[pos] = cnd
+
+    return kmp_table
+
+
+
+def _do_match(string, query, kmp_table):
+    """
+    Scans string for a non-empty query using the table
+    returned by _build_kmp_table and returns the starting
+    positions of the matches in a DynamicOneDimensionalArray.
+    """
+    # j indexes the string and k indexes the query.
+    j, k = 0, 0
+    positions = DynamicOneDimensionalArray(int, 0)
+
+    while j < len(string):
+        if query[k] == string[j]:
+            j = j + 1
+            k = k + 1
+            if k == len(query):
+                positions.append(j - k)
+                # Resume from the table entry so that
+                # overlapping matches are found as well.
+                k = kmp_table[k]
+        else:
+            k = kmp_table[k]
+            if k < 0:
+                j = j + 1
+                k = k + 1
+
+    return positions
diff --git a/pydatastructs/strings/tests/test_algorithms.py b/pydatastructs/strings/tests/test_algorithms.py
new file mode 100644
--- /dev/null
+++ b/pydatastructs/strings/tests/test_algorithms.py
@@ -0,0 +1,71 @@
+from pydatastructs.strings import find
+
+import random, string
+
+def test_kmp():
+    _test_common_string_matching('kmp')
+
+
+def _test_common_string_matching(algorithm):
+    true_text_pattern_dictionary = {
+        "Knuth-Morris-Pratt": "-Morris-",
+        "abcabcabcabdabcabdabcabca": "abcabdabcabca",
+        "aefcdfaecdaefaefcdaefeaefcdcdeae": "aefcdaefeaefcd",
+        "aaaaaaaa": "aaa",
+        "fullstringmatch": "fullstringmatch"
+    }
+    for test_case_key in true_text_pattern_dictionary:
+        text = test_case_key
+        query = true_text_pattern_dictionary[test_case_key]
+        positions = find(text, query, algorithm)
+        # Every text above contains its query at least once.
+        assert positions._num > 0
+        for i in range(positions._num):
+            p = positions[i]
+            assert text[p:p + len(query)] == query
+
+    false_text_pattern_dictionary = {
+        "Knuth-Morris-Pratt": "-Pratt-",
+        "abcabcabcabdabcabdabcabca": "qwertyuiopzxcvbnm",
+        "aefcdfaecdaefaefcdaefeaefcdcdeae": "cdaefaefe",
+        "fullstringmatch": "fullstrinmatch"
+    }
+
+    for test_case_key in false_text_pattern_dictionary:
+        text = test_case_key
+        query = false_text_pattern_dictionary[test_case_key]
+        positions = find(text, query, algorithm)
+        assert positions.size == 0
+
+    # An empty query matches nothing, whatever the text is.
+    for text in ("", "abc"):
+        positions = find(text, "", algorithm)
+        assert positions.size == 0
+
+    random.seed(1000)
+
+    def gen_random_string(length):
+        ascii = string.ascii_uppercase
+        digits = string.digits
+        return ''.join(random.choices(ascii + digits, k=length))
+
+    for _ in range(100):
+        query = gen_random_string(random.randint(3, 10))
+        num_times = random.randint(1, 10)
+        freq = 0
+        text = ""
+        while freq < num_times:
+            rand_str = gen_random_string(random.randint(5, 10))
+            if rand_str != query:
+                freq += 1
+                text += query + rand_str + query
+        positions = find(text, query, algorithm)
+        assert positions._num == num_times * 2
+        for i in range(positions._num):
+            p = positions[i]
+            assert text[p:p + len(query)] == query
+
+        text = gen_random_string(len(query))
+        if text != query:
+            positions = find(text, query, algorithm)
+            assert positions.size == 0
diff --git a/pydatastructs/utils/tests/test_code_quality.py b/pydatastructs/utils/tests/test_code_quality.py
index a4611bace..b4c1ffa63 100644
--- a/pydatastructs/utils/tests/test_code_quality.py
+++ b/pydatastructs/utils/tests/test_code_quality.py
@@ -22,7 +22,7 @@ def test_trailing_white_spaces():
         while line != "":
             if line.endswith(" \n") or line.endswith("\t\n") \
                 or line.endswith(" ") or line.endswith("\t"):
-                assert False, "%s contains trailing whitespace at line number %d: %s"\
+                assert False, "%s:%d : %s"\
                                 %(file_path, line_number, line)
             line = file.readline()
             line_number += 1