2020-05-29 20:27:33 +08:00
|
|
|
|
/**
 * \file
 * \brief The [Rabin-Karp
 * Algorithm](https://en.wikipedia.org/wiki/Rabin–Karp_algorithm) for finding a
 * pattern within a piece of text with expected complexity O(n + m), where n is
 * the length of the text and m is the length of the pattern
 */
|
2020-05-29 20:27:33 +08:00
|
|
|
|
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <string>
#ifdef _MSC_VER
#include <string> // use this for MS Visucal C++
#else
#include <cstring>
#endif
|
2020-05-22 18:15:29 +08:00
|
|
|
|
|
2020-05-29 20:27:33 +08:00
|
|
|
|
#define PRIME 5 ///< Prime modulus for hash functions
|
2020-05-22 18:15:29 +08:00
|
|
|
|
|
2020-05-29 20:27:33 +08:00
|
|
|
|
/**
|
|
|
|
|
* convert a string to an intger - called as hashing function
|
|
|
|
|
* \param[in] s source of string to hash
|
|
|
|
|
* \param[in] n length of substring to hash
|
|
|
|
|
* \returns hash integer
|
|
|
|
|
*/
|
2020-05-30 07:26:30 +08:00
|
|
|
|
int64_t create_hash(const std::string& s, int n)
|
|
|
|
|
{
|
2020-05-22 18:15:29 +08:00
|
|
|
|
int64_t result = 0;
|
2020-05-30 07:26:30 +08:00
|
|
|
|
for (int i = 0; i < n; ++i)
|
|
|
|
|
{
|
2020-05-29 20:27:33 +08:00
|
|
|
|
result += (int64_t)(s[i] * (int64_t)pow(PRIME, i));
|
2020-05-22 18:15:29 +08:00
|
|
|
|
}
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-29 20:27:33 +08:00
|
|
|
|
/**
|
|
|
|
|
* re-hash a string using known existing hash
|
|
|
|
|
* \param[in] s source of string to hash
|
|
|
|
|
* \param[in] old_index previous index of string
|
|
|
|
|
* \param[in] new_index new index of string
|
|
|
|
|
* \param[in] old_hash previous hash of substring
|
|
|
|
|
* \param[in] patLength length of substring to hash
|
|
|
|
|
* \returns new hash integer
|
|
|
|
|
*/
|
|
|
|
|
int64_t recalculate_hash(const std::string& s, int old_index, int new_index,
|
2020-05-30 07:26:30 +08:00
|
|
|
|
int64_t old_hash, int patLength)
|
|
|
|
|
{
|
2020-05-22 18:15:29 +08:00
|
|
|
|
int64_t new_hash = old_hash - s[old_index];
|
|
|
|
|
new_hash /= PRIME;
|
2020-05-29 20:27:33 +08:00
|
|
|
|
new_hash += (int64_t)(s[new_index] * (int64_t)pow(PRIME, patLength - 1));
|
2020-05-22 18:15:29 +08:00
|
|
|
|
return new_hash;
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-29 20:27:33 +08:00
|
|
|
|
/**
|
|
|
|
|
* compare if two sub-strings are equal
|
|
|
|
|
* \param[in] str1 string pattern to search
|
|
|
|
|
* \param[in] str2 text in which to search
|
|
|
|
|
* \param[in] start1,end1 start and end indices for substring in str1
|
|
|
|
|
* \param[in] start2,end2 start and end indices for substring in str2
|
|
|
|
|
* \returns `true` if pattern was found
|
|
|
|
|
* \returns `false` if pattern was not found
|
|
|
|
|
* @note can this be replaced by std::string::compare?
|
|
|
|
|
*/
|
|
|
|
|
bool check_if_equal(const std::string& str1, const std::string& str2,
|
2020-05-30 07:26:30 +08:00
|
|
|
|
int start1, int end1, int start2, int end2)
|
|
|
|
|
{
|
|
|
|
|
if (end1 - start1 != end2 - start2)
|
|
|
|
|
{
|
2020-05-22 18:15:29 +08:00
|
|
|
|
return false;
|
|
|
|
|
}
|
2020-05-30 07:26:30 +08:00
|
|
|
|
while (start1 <= end1 && start2 <= end2)
|
|
|
|
|
{
|
|
|
|
|
if (str1[start1] != str2[start2])
|
|
|
|
|
{
|
2020-05-22 18:15:29 +08:00
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
start1++;
|
|
|
|
|
start2++;
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-29 20:27:33 +08:00
|
|
|
|
/**
|
|
|
|
|
* Perform string pattern search using Rabin-Karp algorithm
|
|
|
|
|
* @param[in] str string to search in
|
|
|
|
|
* @param[in] pat pattern to search for
|
|
|
|
|
* @return index of first occurrence of pattern
|
|
|
|
|
* @return -1 if pattern not found
|
2020-05-22 18:15:29 +08:00
|
|
|
|
*/
|
|
|
|
|
|
2020-05-30 07:26:30 +08:00
|
|
|
|
int rabin_karp(const std::string& str, const std::string& pat)
|
|
|
|
|
{
|
2020-05-29 20:27:33 +08:00
|
|
|
|
int64_t pat_hash = create_hash(pat, pat.size());
|
|
|
|
|
int64_t str_hash = create_hash(str, pat.size());
|
2020-05-30 07:26:30 +08:00
|
|
|
|
for (int i = 0; i <= str.size() - pat.size(); ++i)
|
|
|
|
|
{
|
2020-05-22 18:15:29 +08:00
|
|
|
|
if (pat_hash == str_hash &&
|
2020-05-30 07:26:30 +08:00
|
|
|
|
check_if_equal(str, pat, i, i + pat.size() - 1, 0, pat.size() - 1))
|
|
|
|
|
{
|
2020-05-29 20:27:33 +08:00
|
|
|
|
return i;
|
2020-05-22 18:15:29 +08:00
|
|
|
|
}
|
2020-05-30 07:26:30 +08:00
|
|
|
|
if (i < str.size() - pat.size())
|
|
|
|
|
{
|
2020-05-22 18:15:29 +08:00
|
|
|
|
str_hash =
|
2020-05-29 20:27:33 +08:00
|
|
|
|
recalculate_hash(str, i, i + pat.size(), str_hash, pat.size());
|
2020-05-22 18:15:29 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return -1; // return -1 if given pattern not found
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-29 20:27:33 +08:00
|
|
|
|
/** Main function */
|
2020-05-30 07:26:30 +08:00
|
|
|
|
int main(void)
|
|
|
|
|
{
|
2020-05-29 20:27:33 +08:00
|
|
|
|
assert(rabin_karp("helloWorld", "world") == -1);
|
|
|
|
|
assert(rabin_karp("helloWorld", "World") == 5);
|
|
|
|
|
assert(rabin_karp("this_is_c++", "c++") == 8);
|
|
|
|
|
assert(rabin_karp("happy_coding", "happy") == 0);
|
2020-05-22 18:15:29 +08:00
|
|
|
|
return 0;
|
|
|
|
|
}
|