fixed failure function and cleaned up code in kmp + added rabin-karp

2023-10-11 13:06:12 +08:00 · 2018-01-07 12:49:51 +00:00 · 2018-01-07 12:49:51 +00:00 · 0d36dc60c5
commit 0d36dc60c5
parent 495fdc1ff9
2 changed files with 88 additions and 15 deletions
--- a/strings/knuth-morris-pratt.py
+++ b/strings/knuth-morris-pratt.py
@ -1,4 +1,4 @@
-def kmp(pattern, text, len_p=None, len_t=None):
+def kmp(pattern, text):
    """
    The Knuth-Morris-Pratt Algorithm for finding a pattern within a piece of text
    with complexity O(n + m)
@ -14,14 +14,7 @@ def kmp(pattern, text, len_p=None, len_t=None):
    """

    # 1) Construct the failure array
-    failure = [0]
-    i = 0
-    for index, char in enumerate(pattern[1:]):
-        if pattern[i] == char:
-            i += 1
-        else:
-            i = 0
-        failure.append(i)
+    failure = get_failure_array(pattern)

    # 2) Step through text searching for pattern
    i, j = 0, 0  # index into text, pattern
@ -29,20 +22,38 @@ def kmp(pattern, text, len_p=None, len_t=None):
        if pattern[j] == text[i]:
            if j == (len(pattern) - 1):
                return True
-            i += 1
            j += 1

        # if this is a prefix in our pattern
        # just go back far enough to continue
-        elif failure[j] > 0:
-            j = failure[j] - 1
-        else:
-            i += 1
+        elif j > 0:
+            j = failure[j - 1]
+            continue
+        i += 1
    return False


-if __name__ == '__main__':
+def get_failure_array(pattern):
+    """
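+    Calculates, for each position in the pattern, the index we should fall back to if we fail a comparison
+    :param pattern: the pattern string to build the failure array for
+    :return: list where entry k is the length of the longest proper prefix of pattern[:k + 1] that is also its suffix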
+    """
+    failure = [0]
+    i = 0
+    j = 1
+    while j < len(pattern):
+        if pattern[i] == pattern[j]:
+            i += 1
+        elif i > 0:
+            i = failure[i-1]
+            continue
+        j += 1
+        failure.append(i)
+    return failure

+
+if __name__ == '__main__':
    # Test 1)
    pattern = "abc1abc12"
    text1 = "alskfjaldsabc1abc1abc12k23adsfabcabc"
@ -54,4 +65,16 @@ if __name__ == '__main__':
    text = "ABABZABABYABABX"
    assert kmp(pattern, text)

+    # Test 3)
+    pattern = "AAAB"
+    text = "ABAAAAAB"
+    assert kmp(pattern, text)

+    # Test 4)
+    pattern = "abcdabcy"
+    text = "abcxabcdabxabcdabcdabcy"
+    assert kmp(pattern, text)
+
+    # Test 5)
+    pattern = "aabaabaaa"
+    assert get_failure_array(pattern) == [0, 1, 0, 1, 2, 3, 4, 5, 2]
--- a/strings/rabin-karp.py
+++ b/strings/rabin-karp.py
@ -0,0 +1,50 @@
+def rabin_karp(pattern, text):
+    """
+
+    The Rabin-Karp Algorithm for finding a pattern within a piece of text
+    with complexity O(nm), most efficient when it is used with multiple patterns
+    as it is able to check if any of a set of patterns matches a section of text in O(1) given the precomputed hashes.
+
+    This will be the simple version which only assumes one pattern is being searched for but it's not hard to modify
+
+    1) Calculate pattern hash
+
+    2) Step through the text one character at a time, sliding a window with the same length as the pattern,
+        calculating the hash of the text within the window and comparing it with the hash of the pattern. The
+        strings themselves are only tested for equality if the hashes match
+
+    """
+    p_len = len(pattern)
+    p_hash = hash(pattern)
+
+    for i in range(0, len(text) - (p_len - 1)):
+
+        # hash the current window; the slice is only compared with the pattern when the hashes match
+        text_hash = hash(text[i:i + p_len])
+        if text_hash == p_hash and \
+                text[i:i + p_len] == pattern:
+            return True
+    return False
+
+
+if __name__ == '__main__':
+    # Test 1)
+    pattern = "abc1abc12"
+    text1 = "alskfjaldsabc1abc1abc12k23adsfabcabc"
+    text2 = "alskfjaldsk23adsfabcabc"
+    assert rabin_karp(pattern, text1) and not rabin_karp(pattern, text2)
+
+    # Test 2)
+    pattern = "ABABX"
+    text = "ABABZABABYABABX"
+    assert rabin_karp(pattern, text)
+
+    # Test 3)
+    pattern = "AAAB"
+    text = "ABAAAAAB"
+    assert rabin_karp(pattern, text)
+
+    # Test 4)
+    pattern = "abcdabcy"
+    text = "abcxabcdabxabcdabcdabcy"
+    assert rabin_karp(pattern, text)