feat: add Boyer Moore algorithm implementation (#2441)

* add boyer moore algorithm implementation

* add a one-line description of what the library/header is for

* fix comments pattern and make tests static

* add documentation

* add namespaces

* fix all warnings for clang-tydy.exe <filename>

* Change lib from limits to climits (CHAR_MAX macro)

Co-authored-by: David Leal <halfpacho@gmail.com>

* Add breif description of boyer-moore algorithm

* Fix styling

* Add needed documentation

* my commit

* fix type of index_pattern

* Fix clang-warnings

* chore: apply suggestions from code review

* chore: add print message after tests

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* fix: variable name

* Update strings/boyer_moore.cpp

Co-authored-by: David Leal <halfpacho@gmail.com>

---------

Co-authored-by: David Leal <halfpacho@gmail.com>
Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>
This commit is contained in:
Stoycho Kyosev 2023-06-16 23:38:38 +02:00 committed by GitHub
parent 5ef7ad5cfe
commit d7a9869dce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 298 additions and 1 deletions

29
.vscode/settings.json vendored
View File

@ -59,6 +59,33 @@
"stdexcept": "cpp",
"streambuf": "cpp",
"typeinfo": "cpp",
"valarray": "cpp"
"valarray": "cpp",
"bit": "cpp",
"charconv": "cpp",
"compare": "cpp",
"concepts": "cpp",
"format": "cpp",
"forward_list": "cpp",
"ios": "cpp",
"locale": "cpp",
"queue": "cpp",
"stack": "cpp",
"xfacet": "cpp",
"xhash": "cpp",
"xiosbase": "cpp",
"xlocale": "cpp",
"xlocbuf": "cpp",
"xlocinfo": "cpp",
"xlocmes": "cpp",
"xlocmon": "cpp",
"xlocnum": "cpp",
"xloctime": "cpp",
"xmemory": "cpp",
"xstddef": "cpp",
"xstring": "cpp",
"xtr1common": "cpp",
"xtree": "cpp",
"xutility": "cpp",
"climits": "cpp"
}
}

270
strings/boyer_moore.cpp Normal file
View File

@ -0,0 +1,270 @@
/**
* @file
* @brief
* The [BoyerMoore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm) algorithm searches for occurrences of pattern P in text T by
* performing explicit character comparisons at different alignments. Instead of
* a brute-force search of all alignments (of which there are n - m + 1),
* BoyerMoore uses information gained by preprocessing P to skip as many
* alignments as possible.
*
* @details
* The key insight in this algorithm is that if the end of the pattern is
* compared to the text, then jumps along the text can be made rather than
* checking every character of the text. The reason that this works is that in
* lining up the pattern against the text, the last character of the pattern is
* compared to the character in the text.
*
* If the characters do not match, there is no need to continue searching
* backwards along the text. This leaves us with two cases.
*
* Case 1:
* If the character in the text does not match any of the characters in the
* pattern, then the next character in the text to check is located m characters
* farther along the text, where m is the length of the pattern.
*
* Case 2:
* If the character in the text is in the pattern, then a partial shift of the
* pattern along the text is done to line up along the matching character and
* the process is repeated.
*
* There are two shift rules:
*
* [The bad character rule]
* (https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_bad_character_rule)
*
* [The good suffix rule]
* (https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_good_suffix_rule)
*
* The shift rules are implemented as constant-time table lookups, using tables
* generated during the preprocessing of P.
* @author [Stoycho Kyosev](https://github.com/stoychoX)
*/
#include <cassert> /// for assert
#include <climits> /// for CHAR_MAX macro
#include <cstring> /// for strlen
#include <iostream> /// for IO operations
#include <string> /// for std::string
#include <vector> /// for std::vector
#define APLHABET_SIZE CHAR_MAX ///< number of symbols in the alphabet we use
/**
* @namespace
* @brief String algorithms
*/
namespace strings {
/**
* @namespace
* @brief Functions for the [Boyer
* Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm)
* algorithm implementation
*/
namespace boyer_moore {
/**
* @brief A structure representing all the data we need to search the
* preprocessed pattern in text.
*/
struct pattern {
std::string pat;
std::vector<size_t>
bad_char; ///< bad char table used in [Bad Character
///< Heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-for-pattern-searching/)
std::vector<size_t>
good_suffix; ///< good suffix table used for [Good Suffix
///< heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-good-suffix-heuristic/?ref=rp)
};
/**
* @brief A function that preprocess the good suffix thable
*
* @param str The string being preprocessed
* @param arg The good suffix table
* @returns void
*/
void init_good_suffix(const std::string& str, std::vector<size_t>& arg) {
arg.resize(str.size() + 1, 0);
// border_pos[i] - the index of the longest proper suffix of str[i..] which
// is also a proper prefix.
std::vector<size_t> border_pos(str.size() + 1, 0);
size_t current_char = str.length();
size_t border_index = str.length() + 1;
border_pos[current_char] = border_index;
while (current_char > 0) {
while (border_index <= str.length() &&
str[current_char - 1] != str[border_index - 1]) {
if (arg[border_index] == 0) {
arg[border_index] = border_index - current_char;
}
border_index = border_pos[border_index];
}
current_char--;
border_index--;
border_pos[current_char] = border_index;
}
size_t largest_border_index = border_pos[0];
for (size_t i = 0; i < str.size(); i++) {
if (arg[i] == 0) {
arg[i] = largest_border_index;
}
// If we go pass the largest border we find the next one as we iterate
if (i == largest_border_index) {
largest_border_index = border_pos[largest_border_index];
}
}
}
/**
* @brief A function that preprocess the bad char table
*
* @param str The string being preprocessed
* @param arg The bad char table
* @returns void
*/
void init_bad_char(const std::string& str, std::vector<size_t>& arg) {
arg.resize(APLHABET_SIZE, str.length());
for (size_t i = 0; i < str.length(); i++) {
arg[str[i]] = str.length() - i - 1;
}
}
/**
* @brief A function that initializes pattern
*
* @param str Text used for initialization
* @param arg Initialized structure
* @returns void
*/
void init_pattern(const std::string& str, pattern& arg) {
arg.pat = str;
init_bad_char(str, arg.bad_char);
init_good_suffix(str, arg.good_suffix);
}
/**
* @brief A function that implements Boyer-Moore's algorithm.
*
* @param str Text we are seatching in.
* @param arg pattern structure containing the preprocessed pattern
* @return Vector of indexes of the occurrences of pattern in text
*/
std::vector<size_t> search(const std::string& str, const pattern& arg) {
size_t index_position = arg.pat.size() - 1;
std::vector<size_t> index_storage;
while (index_position < str.length()) {
size_t index_string = index_position;
int index_pattern = static_cast<int>(arg.pat.size()) - 1;
while (index_pattern >= 0 &&
str[index_string] == arg.pat[index_pattern]) {
--index_pattern;
--index_string;
}
if (index_pattern < 0) {
index_storage.push_back(index_position - arg.pat.length() + 1);
index_position += arg.good_suffix[0];
} else {
index_position += std::max(arg.bad_char[str[index_string]],
arg.good_suffix[index_pattern + 1]);
}
}
return index_storage;
}
/**
* @brief Check if pat is prefix of str.
*
* @param str pointer to some part of the input text.
* @param pat the searched pattern.
* @param len length of the searched pattern
* @returns `true` if pat IS prefix of str.
* @returns `false` if pat is NOT a prefix of str.
*/
bool is_prefix(const char* str, const char* pat, size_t len) {
if (strlen(str) < len) {
return false;
}
for (size_t i = 0; i < len; i++) {
if (str[i] != pat[i]) {
return false;
}
}
return true;
}
} // namespace boyer_moore
} // namespace strings
/**
* @brief A test case in which we search for every appearance of the word 'and'
* @param text The text in which we search for appearance of the word 'and'
* @returns void
*/
void and_test(const char* text) {
strings::boyer_moore::pattern ands;
strings::boyer_moore::init_pattern("and", ands);
std::vector<size_t> indexes = strings::boyer_moore::search(text, ands);
assert(indexes.size() == 2);
assert(strings::boyer_moore::is_prefix(text + indexes[0], "and", 3));
assert(strings::boyer_moore::is_prefix(text + indexes[1], "and", 3));
}
/**
* @brief A test case in which we search for every appearance of the word 'pat'
* @param text The text in which we search for appearance of the word 'pat'
* @returns void
*/
void pat_test(const char* text) {
strings::boyer_moore::pattern pat;
strings::boyer_moore::init_pattern("pat", pat);
std::vector<size_t> indexes = strings::boyer_moore::search(text, pat);
assert(indexes.size() == 6);
for (const auto& currentIndex : indexes) {
assert(strings::boyer_moore::is_prefix(text + currentIndex, "pat", 3));
}
}
/**
* @brief Self-test implementations
* @returns void
*/
static void tests() {
const char* text =
"When pat Mr. and Mrs. pat Dursley woke up on the dull, gray \
Tuesday our story starts, \
there was nothing about pat the cloudy sky outside to pat suggest that\
strange and \
mysterious things would pat soon be happening all pat over the \
country.";
and_test(text);
pat_test(text);
std::cout << "All tests have successfully passed!\n";
}
/**
* @brief Main function
* @returns 0 on exit
*/
int main() {
tests(); // run self-test implementations
return 0;
}