From d855df45a340cccb3fd2bcbbf4a3e13611c247a3 Mon Sep 17 00:00:00 2001 From: Jathin Sreenivas Date: Sat, 5 Jun 2021 22:30:14 +0200 Subject: [PATCH 1/4] Implementation of the Knuth-Morris-Pratt (KMP) string matching algorithm --- pydatastructs/strings/__init__.py | 12 +- .../strings/string_matching_algorithms.py | 136 ++++++++++++++++++ .../tests/test_string_matching_algorithms.py | 26 ++++ 3 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 pydatastructs/strings/string_matching_algorithms.py create mode 100644 pydatastructs/strings/tests/test_string_matching_algorithms.py diff --git a/pydatastructs/strings/__init__.py b/pydatastructs/strings/__init__.py index 1ee05158f..3f637a012 100644 --- a/pydatastructs/strings/__init__.py +++ b/pydatastructs/strings/__init__.py @@ -1,8 +1,18 @@ __all__ = [] -from . import trie +from . import ( + trie, + string_matching_algorithms +) + from .trie import ( Trie ) __all__.extend(trie.__all__) + +from .string_matching_algorithms import ( + find_string +) + +__all__.extend(string_matching_algorithms.__all__) diff --git a/pydatastructs/strings/string_matching_algorithms.py b/pydatastructs/strings/string_matching_algorithms.py new file mode 100644 index 000000000..f38436ec7 --- /dev/null +++ b/pydatastructs/strings/string_matching_algorithms.py @@ -0,0 +1,136 @@ +from pydatastructs.linear_data_structures.arrays import ( + OneDimensionalArray) + +__all__ = [ + 'find_string' +] + +def find_string(text: str, pattern: str, algorithm: str) -> bool: + """API of finding occurrence of a pattern string within another string or body of text. + + Parameters + ---------- + text: str + A text, set of characters can include alphabets, numbers , special characters and blank spaces + pattern: str + A text, set of characters can include alphabets, numbers , special characters and blank spaces + algorithm: str + A valid algorithm name + + Returns + ------- + bool + True if pattern occurs in the string, else False + + Examples + -------- + >>> from pydatastructs.strings.string_matching_algorithms import find_string + >>> find_string("aefoaefcdaefcdaed", "aefcdaed", algorithm = "kmp") + True + >>> find_string("aefoaefcdaefcdaed", "aefcdaedz", algorithm = "kmp") + False + + """ + return eval(algorithm + "('" + text + "','" + pattern + "')") + + +def kmp(string: str, substring: str) -> bool: + """Determine whether the substring appears somewhere in the string using Knuth–Morris–Pratt algorithm + + Parameters + ---------- + string: str + A text, set of characters + substring: str + A pattern/substring that is searched for in the string + + Returns + ------- + bool + Whether substring exists in the string or not + + Examples + -------- + >>> from pydatastructs.strings.string_matching_algorithms import kmp + >>> kmp("aefoaefcdaefcdaed", "aefcdaed") + True + >>> kmp("aefoaefcdaefcdaed", "aefcdaedz") + False + + References + ------- + .. [1] https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/kmpen.htm + .. [2] https://towardsdatascience.com/pattern-search-with-the-knuth-morris-pratt-kmp-algorithm-8562407dba5b + .. [3] https://iopscience.iop.org/article/10.1088/1742-6596/1345/4/042005/pdf + + """ + patternsInSubString = _buildPattern(substring) + return _doMatch(string, substring, patternsInSubString) + + +def _buildPattern(substring: str) -> OneDimensionalArray: + """Check for patterns existing in the substring + + Parameters + ---------- + substring: str + A text, set of characters + + Returns + ------- + patterns: OneDimensionalArray + Returns an array of indicies. For a given index if value > -1 + represents that the suffix found at the index, is also the prefix + at the value index. If value is -1, then there is no prefix that is also + a suffix. + + """ + j = 0 + i = 1 + patterns = OneDimensionalArray(int, len(substring)) + patterns.fill(-1) + while i < len(substring): + if substring[i] is substring[j]: + # A prefix that is also a suffix + patterns[i] = j + i += 1 + j += 1 + elif j > 0: + # Check the previous existing pattern + j = patterns[j - 1] + 1 + else: + i += 1 + return patterns + + +def _doMatch(string: str, substring: str, patterns: OneDimensionalArray) -> bool: + """Check if the string exists in the substring + + Parameters + ---------- + string: str + A text, set of characters + substring: str + A pattern/substring that is searched for in the string + patterns: OneDimensionalArray + An array of integers, each value < len(patterns) + + Returns + ------- + bool + Whether substring exists in the string or not + + """ + i = 0 + j = 0 + while i < len(string): + if string[i] is substring[j]: + i += 1 + j += 1 + elif j > 0: + j = patterns[j - 1] + 1 + else: + i += 1 + if j is len(substring): + return True + return False diff --git a/pydatastructs/strings/tests/test_string_matching_algorithms.py b/pydatastructs/strings/tests/test_string_matching_algorithms.py new file mode 100644 index 000000000..34d9a4b16 --- /dev/null +++ b/pydatastructs/strings/tests/test_string_matching_algorithms.py @@ -0,0 +1,26 @@ +from pydatastructs.strings.string_matching_algorithms import find_string + +def test_kms(): + _test_common_string_matching('kmp') + + +def _test_common_string_matching(algorithm): + true_text_pattern_dictionary = { + "Knuth-Morris-Pratt": "-Morris-", + "abcabcabcabdabcabdabcabca": "abcabdabcabca", + "aefcdfaecdaefaefcdaefeaefcdcdeae": "aefcdaefeaefcd", + "aaaaaaaa": "aaa", + "fullstringmatch": "fullstringmatch" + } + for test_case_key in true_text_pattern_dictionary: + assert find_string(test_case_key, true_text_pattern_dictionary[test_case_key], algorithm) is True + + false_text_pattern_dictionary = { + "Knuth-Morris-Pratt": "-Pratt-", + "abcabcabcabdabcabdabcabca": "qwertyuiopzxcvbnm", + "aefcdfaecdaefaefcdaefeaefcdcdeae": "cdaefaefe", + "fullstringmatch": "fullstrinmatch" + } + + for test_case_key in false_text_pattern_dictionary: + assert find_string(test_case_key, false_text_pattern_dictionary[test_case_key], algorithm) is False From 007a1aa59ef0580e480be92b7e6e16b286a415b5 Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Sun, 10 Oct 2021 19:01:27 +0530 Subject: [PATCH 2/4] API improvements --- pydatastructs/strings/__init__.py | 8 +- pydatastructs/strings/algorithms.py | 109 ++++++++++++++ .../strings/string_matching_algorithms.py | 136 ------------------ 3 files changed, 113 insertions(+), 140 deletions(-) create mode 100644 pydatastructs/strings/algorithms.py delete mode 100644 pydatastructs/strings/string_matching_algorithms.py diff --git a/pydatastructs/strings/__init__.py b/pydatastructs/strings/__init__.py index 3f637a012..33930b426 100644 --- a/pydatastructs/strings/__init__.py +++ b/pydatastructs/strings/__init__.py @@ -2,7 +2,7 @@ from . import ( trie, - string_matching_algorithms + algorithms ) from .trie import ( @@ -11,8 +11,8 @@ __all__.extend(trie.__all__) -from .string_matching_algorithms import ( - find_string +from .algorithms import ( + find ) -__all__.extend(string_matching_algorithms.__all__) +__all__.extend(algorithms.__all__) diff --git a/pydatastructs/strings/algorithms.py b/pydatastructs/strings/algorithms.py new file mode 100644 index 000000000..0e8adf981 --- /dev/null +++ b/pydatastructs/strings/algorithms.py @@ -0,0 +1,109 @@ +from pydatastructs.linear_data_structures.arrays import ( + DynamicOneDimensionalArray, OneDimensionalArray) + +__all__ = [ + 'find' +] + +def find(text, query, algorithm): + """ + Finds occurrence of a query string within the text string. + + Parameters + ========== + + text: str + The string on which query is to be performed. + query: str + The string which is to be searched in the text. + algorithm: str + The algorithm which should be used for + searching. + Currently the following algorithms are + supported, + 'kmp' -> Knuth-Morris-Pratt as given in [1]. + + Returns + ======= + + DynamicOneDimensionalArray + An array of starting positions of the portions + in the text which match with the given query. + + Examples + ======== + + >>> from pydatastructs.strings.algorithms import find + >>> text = "abcdefabcabe" + >>> pos = find(text, "ab", algorithm="kmp") + >>> str(pos) + "['0', '6', '9']" + >>> pos = find(text, "abc", algorithm="kmp") + >>> str(pos) + "['0', '6']" + >>> pos = find(text, "abe", algorithm="kmp") + >>> str(pos) + "['9']" + >>> pos = find(text, "abed", algorithm="kmp") + >>> str(pos) + '[]' + + References + ========== + + .. [1] https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/kmpen.htm + """ + import pydatastructs.strings.algorithms as algorithms + func = "_" + algorithm + if not hasattr(algorithms, func): + raise NotImplementedError( + "Currently %s algoithm for searching strings " + "inside a text isn't implemented yet." + %(algorithm)) + return getattr(algorithms, func)(text, query) + + +def _knuth_morris_pratt(text, query): + kmp_table = _build_kmp_table(query) + return _do_match(text, query, kmp_table) + +_kmp = _knuth_morris_pratt + +def _build_kmp_table(query): + pos, cnd = 1, 0 + kmp_table = OneDimensionalArray(int, len(query) + 1) + + kmp_table[0] = -1 + + while pos < len(query): + if query[pos] == query[cnd]: + kmp_table[pos] = kmp_table[cnd] + else: + kmp_table[pos] = cnd + while cnd >= 0 and query[pos] != query[cnd]: + cnd = kmp_table[cnd] + pos, cnd = pos + 1, cnd + 1 + kmp_table[pos] = cnd + + return kmp_table + + + +def _do_match(string, query, kmp_table): + j, k = 0, 0 + positions = DynamicOneDimensionalArray(int, 0) + + while j < len(string): + if query[k] == string[j]: + j = j + 1 + k = k + 1 + if k == len(query): + positions.append(j - k) + k = kmp_table[k] + else: + k = kmp_table[k] + if k < 0: + j = j + 1 + k = k + 1 + + return positions diff --git a/pydatastructs/strings/string_matching_algorithms.py b/pydatastructs/strings/string_matching_algorithms.py deleted file mode 100644 index f38436ec7..000000000 --- a/pydatastructs/strings/string_matching_algorithms.py +++ /dev/null @@ -1,136 +0,0 @@ -from pydatastructs.linear_data_structures.arrays import ( - OneDimensionalArray) - -__all__ = [ - 'find_string' -] - -def find_string(text: str, pattern: str, algorithm: str) -> bool: - """API of finding occurrence of a pattern string within another string or body of text. - - Parameters - ---------- - text: str - A text, set of characters can include alphabets, numbers , special characters and blank spaces - pattern: str - A text, set of characters can include alphabets, numbers , special characters and blank spaces - algorithm: str - A valid algorithm name - - Returns - ------- - bool - True if pattern occurs in the string, else False - - Examples - -------- - >>> from pydatastructs.strings.string_matching_algorithms import find_string - >>> find_string("aefoaefcdaefcdaed", "aefcdaed", algorithm = "kmp") - True - >>> find_string("aefoaefcdaefcdaed", "aefcdaedz", algorithm = "kmp") - False - - """ - return eval(algorithm + "('" + text + "','" + pattern + "')") - - -def kmp(string: str, substring: str) -> bool: - """Determine whether the substring appears somewhere in the string using Knuth–Morris–Pratt algorithm - - Parameters - ---------- - string: str - A text, set of characters - substring: str - A pattern/substring that is searched for in the string - - Returns - ------- - bool - Whether substring exists in the string or not - - Examples - -------- - >>> from pydatastructs.strings.string_matching_algorithms import kmp - >>> kmp("aefoaefcdaefcdaed", "aefcdaed") - True - >>> kmp("aefoaefcdaefcdaed", "aefcdaedz") - False - - References - ------- - .. [1] https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/kmpen.htm - .. [2] https://towardsdatascience.com/pattern-search-with-the-knuth-morris-pratt-kmp-algorithm-8562407dba5b - .. [3] https://iopscience.iop.org/article/10.1088/1742-6596/1345/4/042005/pdf - - """ - patternsInSubString = _buildPattern(substring) - return _doMatch(string, substring, patternsInSubString) - - -def _buildPattern(substring: str) -> OneDimensionalArray: - """Check for patterns existing in the substring - - Parameters - ---------- - substring: str - A text, set of characters - - Returns - ------- - patterns: OneDimensionalArray - Returns an array of indicies. For a given index if value > -1 - represents that the suffix found at the index, is also the prefix - at the value index. If value is -1, then there is no prefix that is also - a suffix. - - """ - j = 0 - i = 1 - patterns = OneDimensionalArray(int, len(substring)) - patterns.fill(-1) - while i < len(substring): - if substring[i] is substring[j]: - # A prefix that is also a suffix - patterns[i] = j - i += 1 - j += 1 - elif j > 0: - # Check the previous existing pattern - j = patterns[j - 1] + 1 - else: - i += 1 - return patterns - - -def _doMatch(string: str, substring: str, patterns: OneDimensionalArray) -> bool: - """Check if the string exists in the substring - - Parameters - ---------- - string: str - A text, set of characters - substring: str - A pattern/substring that is searched for in the string - patterns: OneDimensionalArray - An array of integers, each value < len(patterns) - - Returns - ------- - bool - Whether substring exists in the string or not - - """ - i = 0 - j = 0 - while i < len(string): - if string[i] is substring[j]: - i += 1 - j += 1 - elif j > 0: - j = patterns[j - 1] + 1 - else: - i += 1 - if j is len(substring): - return True - return False From e588dad31c436ac36e5aa032f1b038b49b7ba985 Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Sun, 10 Oct 2021 19:29:49 +0530 Subject: [PATCH 3/4] Added tests --- pydatastructs/strings/algorithms.py | 8 +-- .../strings/tests/test_algorithms.py | 64 +++++++++++++++++++ .../tests/test_string_matching_algorithms.py | 26 -------- .../utils/tests/test_code_quality.py | 2 +- 4 files changed, 69 insertions(+), 31 deletions(-) create mode 100644 pydatastructs/strings/tests/test_algorithms.py delete mode 100644 pydatastructs/strings/tests/test_string_matching_algorithms.py diff --git a/pydatastructs/strings/algorithms.py b/pydatastructs/strings/algorithms.py index 0e8adf981..506767014 100644 --- a/pydatastructs/strings/algorithms.py +++ b/pydatastructs/strings/algorithms.py @@ -53,13 +53,13 @@ def find(text, query, algorithm): .. [1] https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/kmpen.htm """ - import pydatastructs.strings.algorithms as algorithms + import pydatastructs.strings.algorithms as algorithms func = "_" + algorithm - if not hasattr(algorithms, func): - raise NotImplementedError( + if not hasattr(algorithms, func): + raise NotImplementedError( "Currently %s algoithm for searching strings " "inside a text isn't implemented yet." - %(algorithm)) + %(algorithm)) return getattr(algorithms, func)(text, query) diff --git a/pydatastructs/strings/tests/test_algorithms.py b/pydatastructs/strings/tests/test_algorithms.py new file mode 100644 index 000000000..1453c433f --- /dev/null +++ b/pydatastructs/strings/tests/test_algorithms.py @@ -0,0 +1,64 @@ +from pydatastructs.strings import find + +import random, string + +def test_kmp(): + _test_common_string_matching('kmp') + + +def _test_common_string_matching(algorithm): + true_text_pattern_dictionary = { + "Knuth-Morris-Pratt": "-Morris-", + "abcabcabcabdabcabdabcabca": "abcabdabcabca", + "aefcdfaecdaefaefcdaefeaefcdcdeae": "aefcdaefeaefcd", + "aaaaaaaa": "aaa", + "fullstringmatch": "fullstringmatch" + } + for test_case_key in true_text_pattern_dictionary: + text = test_case_key + query = true_text_pattern_dictionary[test_case_key] + positions = find(text, query, algorithm) + for i in range(positions._last_pos_filled): + p = positions[i] + assert text[p:p + len(query)] == query + + false_text_pattern_dictionary = { + "Knuth-Morris-Pratt": "-Pratt-", + "abcabcabcabdabcabdabcabca": "qwertyuiopzxcvbnm", + "aefcdfaecdaefaefcdaefeaefcdcdeae": "cdaefaefe", + "fullstringmatch": "fullstrinmatch" + } + + for test_case_key in false_text_pattern_dictionary: + text = test_case_key + query = false_text_pattern_dictionary[test_case_key] + positions = find(text, query, algorithm) + assert positions.size == 0 + + random.seed(1000) + + def gen_random_string(length): + ascii = string.ascii_uppercase + digits = string.digits + return ''.join(random.choices(ascii + digits, k=length)) + + for _ in range(100): + query = gen_random_string(random.randint(3, 10)) + num_times = random.randint(1, 10) + freq = 0 + text = "" + while freq < num_times: + rand_str = gen_random_string(random.randint(5, 10)) + if rand_str != query: + freq += 1 + text += query + rand_str + query + positions = find(text, query, algorithm="kmp") + assert positions._num == num_times * 2 + for i in range(positions._last_pos_filled): + p = positions[i] + assert text[p:p + len(query)] == query + + text = gen_random_string(len(query)) + if text != query: + positions = find(text, query, algorithm="kmp") + assert positions.size == 0 diff --git a/pydatastructs/strings/tests/test_string_matching_algorithms.py b/pydatastructs/strings/tests/test_string_matching_algorithms.py deleted file mode 100644 index 34d9a4b16..000000000 --- a/pydatastructs/strings/tests/test_string_matching_algorithms.py +++ /dev/null @@ -1,26 +0,0 @@ -from pydatastructs.strings.string_matching_algorithms import find_string - -def test_kms(): - _test_common_string_matching('kmp') - - -def _test_common_string_matching(algorithm): - true_text_pattern_dictionary = { - "Knuth-Morris-Pratt": "-Morris-", - "abcabcabcabdabcabdabcabca": "abcabdabcabca", - "aefcdfaecdaefaefcdaefeaefcdcdeae": "aefcdaefeaefcd", - "aaaaaaaa": "aaa", - "fullstringmatch": "fullstringmatch" - } - for test_case_key in true_text_pattern_dictionary: - assert find_string(test_case_key, true_text_pattern_dictionary[test_case_key], algorithm) is True - - false_text_pattern_dictionary = { - "Knuth-Morris-Pratt": "-Pratt-", - "abcabcabcabdabcabdabcabca": "qwertyuiopzxcvbnm", - "aefcdfaecdaefaefcdaefeaefcdcdeae": "cdaefaefe", - "fullstringmatch": "fullstrinmatch" - } - - for test_case_key in false_text_pattern_dictionary: - assert find_string(test_case_key, false_text_pattern_dictionary[test_case_key], algorithm) is False diff --git a/pydatastructs/utils/tests/test_code_quality.py b/pydatastructs/utils/tests/test_code_quality.py index a4611bace..b4c1ffa63 100644 --- a/pydatastructs/utils/tests/test_code_quality.py +++ b/pydatastructs/utils/tests/test_code_quality.py @@ -22,7 +22,7 @@ def test_trailing_white_spaces(): while line != "": if line.endswith(" \n") or line.endswith("\t\n") \ or line.endswith(" ") or line.endswith("\t"): - assert False, "%s contains trailing whitespace at line number %d: %s"\ + assert False, "%s:%d : %s"\ %(file_path, line_number, line) line = file.readline() line_number += 1 From fff59fecf3afd2344b4fc0482043a90528f80364 Mon Sep 17 00:00:00 2001 From: Gagandeep Singh Date: Sun, 10 Oct 2021 19:35:40 +0530 Subject: [PATCH 4/4] Updated reference to point to Wikipedia --- pydatastructs/strings/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydatastructs/strings/algorithms.py b/pydatastructs/strings/algorithms.py index 506767014..8d5d4a567 100644 --- a/pydatastructs/strings/algorithms.py +++ b/pydatastructs/strings/algorithms.py @@ -51,7 +51,7 @@ def find(text, query, algorithm): References ========== - .. [1] https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/kmpen.htm + .. [1] https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm """ import pydatastructs.strings.algorithms as algorithms func = "_" + algorithm