From d855df45a340cccb3fd2bcbbf4a3e13611c247a3 Mon Sep 17 00:00:00 2001
From: Jathin Sreenivas <sjathin@gmail.com>
Date: Sat, 5 Jun 2021 22:30:14 +0200
Subject: [PATCH 1/4] Implementation of the Knuth-Morris-Pratt (KMP) string
 matching algorithm

---
 pydatastructs/strings/__init__.py             |  12 +-
 .../strings/string_matching_algorithms.py     | 136 ++++++++++++++++++
 .../tests/test_string_matching_algorithms.py  |  26 ++++
 3 files changed, 173 insertions(+), 1 deletion(-)
 create mode 100644 pydatastructs/strings/string_matching_algorithms.py
 create mode 100644 pydatastructs/strings/tests/test_string_matching_algorithms.py

diff --git a/pydatastructs/strings/__init__.py b/pydatastructs/strings/__init__.py
index 1ee05158f..3f637a012 100644
--- a/pydatastructs/strings/__init__.py
+++ b/pydatastructs/strings/__init__.py
@@ -1,8 +1,18 @@
 __all__ = []
 
-from . import trie
+from . import (
+    trie,
+    string_matching_algorithms
+)
+
 from .trie import (
     Trie
 )
 
 __all__.extend(trie.__all__)
+
+from .string_matching_algorithms import (
+    find_string
+)
+
+__all__.extend(string_matching_algorithms.__all__)
diff --git a/pydatastructs/strings/string_matching_algorithms.py b/pydatastructs/strings/string_matching_algorithms.py
new file mode 100644
index 000000000..f38436ec7
--- /dev/null
+++ b/pydatastructs/strings/string_matching_algorithms.py
@@ -0,0 +1,136 @@
+from pydatastructs.linear_data_structures.arrays import (
+    OneDimensionalArray)
+
+__all__ = [
+    'find_string'
+]
+
+def find_string(text: str, pattern: str, algorithm: str) -> bool:
+    """API of finding occurrence of a pattern string within another string or body of text.
+
+    Parameters
+    ----------
+    text: str
+        A text, set of characters can include alphabets, numbers , special characters and blank spaces
+    pattern: str
+        A text, set of characters can include alphabets, numbers , special characters and blank spaces
+    algorithm: str
+        A valid algorithm name
+
+    Returns
+    -------
+    bool
+        True if pattern occurs in the string, else False
+
+    Examples
+    --------
+    >>> from pydatastructs.strings.string_matching_algorithms import find_string
+    >>> find_string("aefoaefcdaefcdaed", "aefcdaed", algorithm = "kmp")
+    True
+    >>> find_string("aefoaefcdaefcdaed", "aefcdaedz", algorithm = "kmp")
+    False
+
+    """
+    return eval(algorithm + "('" + text + "','" + pattern + "')")
+
+
+def kmp(string: str, substring: str) -> bool:
+    """Determine whether the substring appears somewhere in the string using Knuth–Morris–Pratt algorithm
+
+    Parameters
+    ----------
+    string: str
+        A text, set of characters
+    substring: str
+        A pattern/substring that is searched for in the string
+
+    Returns
+    -------
+    bool
+        Whether substring exists in the string or not
+
+    Examples
+    --------
+    >>> from pydatastructs.strings.string_matching_algorithms import kmp
+    >>> kmp("aefoaefcdaefcdaed", "aefcdaed")
+    True
+    >>> kmp("aefoaefcdaefcdaed", "aefcdaedz")
+    False
+
+    References
+    -------
+    .. [1] https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/kmpen.htm
+    .. [2] https://towardsdatascience.com/pattern-search-with-the-knuth-morris-pratt-kmp-algorithm-8562407dba5b
+    .. [3] https://iopscience.iop.org/article/10.1088/1742-6596/1345/4/042005/pdf
+
+    """
+    patternsInSubString = _buildPattern(substring)
+    return _doMatch(string, substring, patternsInSubString)
+
+
+def _buildPattern(substring: str) -> OneDimensionalArray:
+    """Check for patterns existing in the substring
+
+    Parameters
+    ----------
+    substring: str
+        A text, set of characters
+
+    Returns
+    -------
+    patterns: OneDimensionalArray
+        Returns an array of indicies. For a given index if value > -1
+        represents that the suffix found at the index, is also the prefix
+        at the value index. If value is -1, then there is no prefix that is also
+        a suffix.
+
+    """
+    j = 0
+    i = 1
+    patterns = OneDimensionalArray(int, len(substring))
+    patterns.fill(-1)
+    while i < len(substring):
+        if substring[i] is substring[j]:
+            # A prefix that is also a suffix
+            patterns[i] = j
+            i += 1
+            j += 1
+        elif j > 0:
+            # Check the previous existing pattern
+            j = patterns[j - 1] + 1
+        else:
+            i += 1
+    return patterns
+
+
+def _doMatch(string: str, substring: str, patterns: OneDimensionalArray) -> bool:
+    """Check if the string exists in the substring
+
+    Parameters
+    ----------
+    string: str
+        A text, set of characters
+    substring: str
+        A pattern/substring that is searched for in the string
+    patterns: OneDimensionalArray
+        An array of integers, each value < len(patterns)
+
+    Returns
+    -------
+    bool
+        Whether substring exists in the string or not
+
+    """
+    i = 0
+    j = 0
+    while i < len(string):
+        if string[i] is substring[j]:
+            i += 1
+            j += 1
+        elif j > 0:
+            j = patterns[j - 1] + 1
+        else:
+            i += 1
+        if j is len(substring):
+            return True
+    return False
diff --git a/pydatastructs/strings/tests/test_string_matching_algorithms.py b/pydatastructs/strings/tests/test_string_matching_algorithms.py
new file mode 100644
index 000000000..34d9a4b16
--- /dev/null
+++ b/pydatastructs/strings/tests/test_string_matching_algorithms.py
@@ -0,0 +1,26 @@
+from pydatastructs.strings.string_matching_algorithms import find_string
+
+def test_kms():
+    _test_common_string_matching('kmp')
+
+
+def _test_common_string_matching(algorithm):
+    true_text_pattern_dictionary = {
+        "Knuth-Morris-Pratt": "-Morris-",
+        "abcabcabcabdabcabdabcabca": "abcabdabcabca",
+        "aefcdfaecdaefaefcdaefeaefcdcdeae": "aefcdaefeaefcd",
+        "aaaaaaaa": "aaa",
+        "fullstringmatch": "fullstringmatch"
+    }
+    for test_case_key in true_text_pattern_dictionary:
+        assert find_string(test_case_key, true_text_pattern_dictionary[test_case_key], algorithm) is True
+
+    false_text_pattern_dictionary = {
+        "Knuth-Morris-Pratt": "-Pratt-",
+        "abcabcabcabdabcabdabcabca": "qwertyuiopzxcvbnm",
+        "aefcdfaecdaefaefcdaefeaefcdcdeae": "cdaefaefe",
+        "fullstringmatch": "fullstrinmatch"
+    }
+
+    for test_case_key in false_text_pattern_dictionary:
+        assert find_string(test_case_key, false_text_pattern_dictionary[test_case_key], algorithm) is False

From 007a1aa59ef0580e480be92b7e6e16b286a415b5 Mon Sep 17 00:00:00 2001
From: Gagandeep Singh <gdp.1807@gmail.com>
Date: Sun, 10 Oct 2021 19:01:27 +0530
Subject: [PATCH 2/4] API improvements

---
 pydatastructs/strings/__init__.py             |   8 +-
 pydatastructs/strings/algorithms.py           | 109 ++++++++++++++
 .../strings/string_matching_algorithms.py     | 136 ------------------
 3 files changed, 113 insertions(+), 140 deletions(-)
 create mode 100644 pydatastructs/strings/algorithms.py
 delete mode 100644 pydatastructs/strings/string_matching_algorithms.py

diff --git a/pydatastructs/strings/__init__.py b/pydatastructs/strings/__init__.py
index 3f637a012..33930b426 100644
--- a/pydatastructs/strings/__init__.py
+++ b/pydatastructs/strings/__init__.py
@@ -2,7 +2,7 @@
 
 from . import (
     trie,
-    string_matching_algorithms
+    algorithms
 )
 
 from .trie import (
@@ -11,8 +11,8 @@
 
 __all__.extend(trie.__all__)
 
-from .string_matching_algorithms import (
-    find_string
+from .algorithms import (
+    find
 )
 
-__all__.extend(string_matching_algorithms.__all__)
+__all__.extend(algorithms.__all__)
diff --git a/pydatastructs/strings/algorithms.py b/pydatastructs/strings/algorithms.py
new file mode 100644
index 000000000..0e8adf981
--- /dev/null
+++ b/pydatastructs/strings/algorithms.py
@@ -0,0 +1,109 @@
+from pydatastructs.linear_data_structures.arrays import (
+    DynamicOneDimensionalArray, OneDimensionalArray)
+
+__all__ = [
+    'find'
+]
+
+def find(text, query, algorithm):
+    """
+    Finds occurrence of a query string within the text string.
+
+    Parameters
+    ==========
+
+    text: str
+        The string on which query is to be performed.
+    query: str
+        The string which is to be searched in the text.
+    algorithm: str
+        The algorithm which should be used for
+        searching.
+        Currently the following algorithms are
+        supported,
+        'kmp' -> Knuth-Morris-Pratt as given in [1].
+
+    Returns
+    =======
+
+    DynamicOneDimensionalArray
+        An array of starting positions of the portions
+        in the text which match with the given query.
+
+    Examples
+    ========
+
+    >>> from pydatastructs.strings.algorithms import find
+    >>> text = "abcdefabcabe"
+    >>> pos = find(text, "ab", algorithm="kmp")
+    >>> str(pos)
+    "['0', '6', '9']"
+    >>> pos = find(text, "abc", algorithm="kmp")
+    >>> str(pos)
+    "['0', '6']"
+    >>> pos = find(text, "abe", algorithm="kmp")
+    >>> str(pos)
+    "['9']"
+    >>> pos = find(text, "abed", algorithm="kmp")
+    >>> str(pos)
+    '[]'
+
+    References
+    ==========
+
+    .. [1] https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/kmpen.htm
+    """
+    import pydatastructs.strings.algorithms as algorithms 
+    func = "_" + algorithm
+    if not hasattr(algorithms, func): 
+        raise NotImplementedError( 
+        "Currently %s algoithm for searching strings "
+        "inside a text isn't implemented yet."
+        %(algorithm)) 
+    return getattr(algorithms, func)(text, query)
+
+
+def _knuth_morris_pratt(text, query):
+    kmp_table = _build_kmp_table(query)  # the table is computed from the query alone
+    return _do_match(text, query, kmp_table)  # array of start indices of the matches found in text
+
+_kmp = _knuth_morris_pratt  # alias: find() looks up "_" + algorithm, so algorithm="kmp" resolves to this name
+
+def _build_kmp_table(query):
+    pos, cnd = 1, 0  # pos: table slot being filled; cnd: index in query compared against query[pos]
+    kmp_table = OneDimensionalArray(int, len(query) + 1)  # one slot more than len(query); the extra one is written after the loop
+
+    kmp_table[0] = -1  # negative entry: _do_match reacts to k < 0 by advancing in the text
+
+    while pos < len(query):
+        if query[pos] == query[cnd]:
+            kmp_table[pos] = kmp_table[cnd]  # characters are equal, so pos inherits cnd's fallback
+        else:
+            kmp_table[pos] = cnd
+            while cnd >= 0 and query[pos] != query[cnd]:  # follow fallbacks until the characters agree or cnd turns negative
+                cnd = kmp_table[cnd]
+        pos, cnd = pos + 1, cnd + 1
+    kmp_table[pos] = cnd  # read by _do_match after a full match; NOTE(review): for an empty query pos is still 1 here, past the single allocated slot - confirm
+
+    return kmp_table
+
+
+
+def _do_match(string, query, kmp_table):
+    j, k = 0, 0  # j: index into string (the text being searched); k: index into query
+    positions = DynamicOneDimensionalArray(int, 0)  # start index of each match, in the order found
+
+    while j < len(string):
+        if query[k] == string[j]:  # NOTE(review): reads query[0] on the first pass, so an empty query looks unsupported - confirm
+            j = j + 1
+            k = k + 1
+            if k == len(query):
+                positions.append(j - k)  # whole query matched; it started k characters before j
+                k = kmp_table[k]  # continue from the table's final slot to look for further matches
+        else:
+            k = kmp_table[k]  # mismatch: fall back within the query, j is not advanced here
+            if k < 0:  # table gave -1: no fallback left, so step forward in the text
+                j = j + 1
+                k = k + 1
+
+    return positions
diff --git a/pydatastructs/strings/string_matching_algorithms.py b/pydatastructs/strings/string_matching_algorithms.py
deleted file mode 100644
index f38436ec7..000000000
--- a/pydatastructs/strings/string_matching_algorithms.py
+++ /dev/null
@@ -1,136 +0,0 @@
-from pydatastructs.linear_data_structures.arrays import (
-    OneDimensionalArray)
-
-__all__ = [
-    'find_string'
-]
-
-def find_string(text: str, pattern: str, algorithm: str) -> bool:
-    """API of finding occurrence of a pattern string within another string or body of text.
-
-    Parameters
-    ----------
-    text: str
-        A text, set of characters can include alphabets, numbers , special characters and blank spaces
-    pattern: str
-        A text, set of characters can include alphabets, numbers , special characters and blank spaces
-    algorithm: str
-        A valid algorithm name
-
-    Returns
-    -------
-    bool
-        True if pattern occurs in the string, else False
-
-    Examples
-    --------
-    >>> from pydatastructs.strings.string_matching_algorithms import find_string
-    >>> find_string("aefoaefcdaefcdaed", "aefcdaed", algorithm = "kmp")
-    True
-    >>> find_string("aefoaefcdaefcdaed", "aefcdaedz", algorithm = "kmp")
-    False
-
-    """
-    return eval(algorithm + "('" + text + "','" + pattern + "')")
-
-
-def kmp(string: str, substring: str) -> bool:
-    """Determine whether the substring appears somewhere in the string using Knuth–Morris–Pratt algorithm
-
-    Parameters
-    ----------
-    string: str
-        A text, set of characters
-    substring: str
-        A pattern/substring that is searched for in the string
-
-    Returns
-    -------
-    bool
-        Whether substring exists in the string or not
-
-    Examples
-    --------
-    >>> from pydatastructs.strings.string_matching_algorithms import kmp
-    >>> kmp("aefoaefcdaefcdaed", "aefcdaed")
-    True
-    >>> kmp("aefoaefcdaefcdaed", "aefcdaedz")
-    False
-
-    References
-    -------
-    .. [1] https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/kmpen.htm
-    .. [2] https://towardsdatascience.com/pattern-search-with-the-knuth-morris-pratt-kmp-algorithm-8562407dba5b
-    .. [3] https://iopscience.iop.org/article/10.1088/1742-6596/1345/4/042005/pdf
-
-    """
-    patternsInSubString = _buildPattern(substring)
-    return _doMatch(string, substring, patternsInSubString)
-
-
-def _buildPattern(substring: str) -> OneDimensionalArray:
-    """Check for patterns existing in the substring
-
-    Parameters
-    ----------
-    substring: str
-        A text, set of characters
-
-    Returns
-    -------
-    patterns: OneDimensionalArray
-        Returns an array of indicies. For a given index if value > -1
-        represents that the suffix found at the index, is also the prefix
-        at the value index. If value is -1, then there is no prefix that is also
-        a suffix.
-
-    """
-    j = 0
-    i = 1
-    patterns = OneDimensionalArray(int, len(substring))
-    patterns.fill(-1)
-    while i < len(substring):
-        if substring[i] is substring[j]:
-            # A prefix that is also a suffix
-            patterns[i] = j
-            i += 1
-            j += 1
-        elif j > 0:
-            # Check the previous existing pattern
-            j = patterns[j - 1] + 1
-        else:
-            i += 1
-    return patterns
-
-
-def _doMatch(string: str, substring: str, patterns: OneDimensionalArray) -> bool:
-    """Check if the string exists in the substring
-
-    Parameters
-    ----------
-    string: str
-        A text, set of characters
-    substring: str
-        A pattern/substring that is searched for in the string
-    patterns: OneDimensionalArray
-        An array of integers, each value < len(patterns)
-
-    Returns
-    -------
-    bool
-        Whether substring exists in the string or not
-
-    """
-    i = 0
-    j = 0
-    while i < len(string):
-        if string[i] is substring[j]:
-            i += 1
-            j += 1
-        elif j > 0:
-            j = patterns[j - 1] + 1
-        else:
-            i += 1
-        if j is len(substring):
-            return True
-    return False

From e588dad31c436ac36e5aa032f1b038b49b7ba985 Mon Sep 17 00:00:00 2001
From: Gagandeep Singh <gdp.1807@gmail.com>
Date: Sun, 10 Oct 2021 19:29:49 +0530
Subject: [PATCH 3/4] Added tests

---
 pydatastructs/strings/algorithms.py           |  8 +--
 .../strings/tests/test_algorithms.py          | 64 +++++++++++++++++++
 .../tests/test_string_matching_algorithms.py  | 26 --------
 .../utils/tests/test_code_quality.py          |  2 +-
 4 files changed, 69 insertions(+), 31 deletions(-)
 create mode 100644 pydatastructs/strings/tests/test_algorithms.py
 delete mode 100644 pydatastructs/strings/tests/test_string_matching_algorithms.py

diff --git a/pydatastructs/strings/algorithms.py b/pydatastructs/strings/algorithms.py
index 0e8adf981..506767014 100644
--- a/pydatastructs/strings/algorithms.py
+++ b/pydatastructs/strings/algorithms.py
@@ -53,13 +53,13 @@ def find(text, query, algorithm):
 
     .. [1] https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/kmpen.htm
     """
-    import pydatastructs.strings.algorithms as algorithms 
+    import pydatastructs.strings.algorithms as algorithms
     func = "_" + algorithm
-    if not hasattr(algorithms, func): 
-        raise NotImplementedError( 
+    if not hasattr(algorithms, func):
+        raise NotImplementedError(
         "Currently %s algoithm for searching strings "
         "inside a text isn't implemented yet."
-        %(algorithm)) 
+        %(algorithm))
     return getattr(algorithms, func)(text, query)
 
 
diff --git a/pydatastructs/strings/tests/test_algorithms.py b/pydatastructs/strings/tests/test_algorithms.py
new file mode 100644
index 000000000..1453c433f
--- /dev/null
+++ b/pydatastructs/strings/tests/test_algorithms.py
@@ -0,0 +1,64 @@
+from pydatastructs.strings import find
+
+import random, string
+
+def test_kmp():
+    _test_common_string_matching('kmp')  # run the shared matching checks with algorithm='kmp'
+
+
+def _test_common_string_matching(algorithm):
+    """Run the shared string-matching checks against ``find`` using ``algorithm``."""
+    true_text_pattern_dictionary = {
+        "Knuth-Morris-Pratt": "-Morris-",
+        "abcabcabcabdabcabdabcabca": "abcabdabcabca",
+        "aefcdfaecdaefaefcdaefeaefcdcdeae": "aefcdaefeaefcd",
+        "aaaaaaaa": "aaa",
+        "fullstringmatch": "fullstringmatch"
+    }
+    for text, query in true_text_pattern_dictionary.items():
+        positions = find(text, query, algorithm)
+        # Each query above does occur in its text; an empty result is a failure.
+        assert positions._num > 0
+        for i in range(positions._num):
+            p = positions[i]
+            assert text[p:p + len(query)] == query
+
+    false_text_pattern_dictionary = {
+        "Knuth-Morris-Pratt": "-Pratt-",
+        "abcabcabcabdabcabdabcabca": "qwertyuiopzxcvbnm",
+        "aefcdfaecdaefaefcdaefeaefcdcdeae": "cdaefaefe",
+        "fullstringmatch": "fullstrinmatch"
+    }
+
+    # Negative cases: the query never occurs in the text, so no position may
+    # be reported.
+    for text, query in false_text_pattern_dictionary.items():
+        positions = find(text, query, algorithm)
+        assert positions.size == 0
+
+    random.seed(1000)
+
+    def gen_random_string(length):
+        alphabet = string.ascii_uppercase + string.digits
+        return ''.join(random.choices(alphabet, k=length))
+
+    for _ in range(100):
+        query = gen_random_string(random.randint(3, 10))
+        num_times = random.randint(1, 10)
+        freq = 0
+        text = ""
+        while freq < num_times:
+            rand_str = gen_random_string(random.randint(5, 10))
+            if rand_str != query:
+                freq += 1
+                text += query + rand_str + query
+        positions = find(text, query, algorithm=algorithm)
+        assert positions._num == num_times * 2
+        for i in range(positions._num):
+            p = positions[i]
+            assert text[p:p + len(query)] == query
+
+        text = gen_random_string(len(query))
+        if text != query:
+            positions = find(text, query, algorithm=algorithm)
+            assert positions.size == 0
diff --git a/pydatastructs/strings/tests/test_string_matching_algorithms.py b/pydatastructs/strings/tests/test_string_matching_algorithms.py
deleted file mode 100644
index 34d9a4b16..000000000
--- a/pydatastructs/strings/tests/test_string_matching_algorithms.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from pydatastructs.strings.string_matching_algorithms import find_string
-
-def test_kms():
-    _test_common_string_matching('kmp')
-
-
-def _test_common_string_matching(algorithm):
-    true_text_pattern_dictionary = {
-        "Knuth-Morris-Pratt": "-Morris-",
-        "abcabcabcabdabcabdabcabca": "abcabdabcabca",
-        "aefcdfaecdaefaefcdaefeaefcdcdeae": "aefcdaefeaefcd",
-        "aaaaaaaa": "aaa",
-        "fullstringmatch": "fullstringmatch"
-    }
-    for test_case_key in true_text_pattern_dictionary:
-        assert find_string(test_case_key, true_text_pattern_dictionary[test_case_key], algorithm) is True
-
-    false_text_pattern_dictionary = {
-        "Knuth-Morris-Pratt": "-Pratt-",
-        "abcabcabcabdabcabdabcabca": "qwertyuiopzxcvbnm",
-        "aefcdfaecdaefaefcdaefeaefcdcdeae": "cdaefaefe",
-        "fullstringmatch": "fullstrinmatch"
-    }
-
-    for test_case_key in false_text_pattern_dictionary:
-        assert find_string(test_case_key, false_text_pattern_dictionary[test_case_key], algorithm) is False
diff --git a/pydatastructs/utils/tests/test_code_quality.py b/pydatastructs/utils/tests/test_code_quality.py
index a4611bace..b4c1ffa63 100644
--- a/pydatastructs/utils/tests/test_code_quality.py
+++ b/pydatastructs/utils/tests/test_code_quality.py
@@ -22,7 +22,7 @@ def test_trailing_white_spaces():
         while line != "":
             if line.endswith(" \n") or line.endswith("\t\n") \
                 or line.endswith(" ") or line.endswith("\t"):
-                assert False, "%s contains trailing whitespace at line number %d: %s"\
+                assert False, "%s:%d : %s"\
                                %(file_path, line_number, line)
             line = file.readline()
             line_number += 1

From fff59fecf3afd2344b4fc0482043a90528f80364 Mon Sep 17 00:00:00 2001
From: Gagandeep Singh <gdp.1807@gmail.com>
Date: Sun, 10 Oct 2021 19:35:40 +0530
Subject: [PATCH 4/4] Updated reference to point to Wikipedia

---
 pydatastructs/strings/algorithms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydatastructs/strings/algorithms.py b/pydatastructs/strings/algorithms.py
index 506767014..8d5d4a567 100644
--- a/pydatastructs/strings/algorithms.py
+++ b/pydatastructs/strings/algorithms.py
@@ -51,7 +51,7 @@ def find(text, query, algorithm):
     References
     ==========
 
-    .. [1] https://www.inf.hs-flensburg.de/lang/algorithmen/pattern/kmpen.htm
+    .. [1] https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm
     """
     import pydatastructs.strings.algorithms as algorithms
     func = "_" + algorithm