From d7a9869dce772b980c28150013625396a1160388 Mon Sep 17 00:00:00 2001 From: Stoycho Kyosev <68247567+stoychoX@users.noreply.github.com> Date: Fri, 16 Jun 2023 23:38:38 +0200 Subject: [PATCH] feat: add Boyer Moore algorithm implementation (#2441) * add boyer moore algorithm implementation * add a one-line description of what the library/header is for * fix comments pattern and make tests static * add documentation * add namespaces * fix all warnings for clang-tydy.exe * Change lib from limits to climits (CHAR_MAX macro) Co-authored-by: David Leal * Add breif description of boyer-moore algorithm * Fix styling * Add needed documentation * my commit * fix type of index_pattern * Fix clang-warnings * chore: apply suggestions from code review * chore: add print message after tests * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * fix: variable name * Update strings/boyer_moore.cpp Co-authored-by: David Leal --------- Co-authored-by: David Leal Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> --- .vscode/settings.json | 29 ++++- strings/boyer_moore.cpp | 270 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 298 insertions(+), 1 deletion(-) create mode 100644 strings/boyer_moore.cpp diff --git a/.vscode/settings.json b/.vscode/settings.json index 67fe06477..f6d76514f 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -59,6 +59,33 @@ "stdexcept": "cpp", "streambuf": "cpp", "typeinfo": "cpp", - "valarray": "cpp" + "valarray": "cpp", + "bit": "cpp", + "charconv": "cpp", + "compare": "cpp", + "concepts": "cpp", + "format": "cpp", + "forward_list": "cpp", + "ios": "cpp", + "locale": "cpp", + "queue": "cpp", + "stack": "cpp", + "xfacet": "cpp", + "xhash": "cpp", + "xiosbase": "cpp", + "xlocale": "cpp", + "xlocbuf": "cpp", + "xlocinfo": "cpp", + "xlocmes": "cpp", + "xlocmon": "cpp", + "xlocnum": "cpp", + "xloctime": "cpp", + "xmemory": "cpp", + "xstddef": "cpp", + "xstring": "cpp", + "xtr1common": "cpp", + "xtree": "cpp", + "xutility": "cpp", + "climits": "cpp" } } diff --git a/strings/boyer_moore.cpp b/strings/boyer_moore.cpp new file mode 100644 index 000000000..a8c4cbf8d --- /dev/null +++ b/strings/boyer_moore.cpp @@ -0,0 +1,270 @@ +/** + * @file + * @brief + * The [Boyer–Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm) algorithm searches for occurrences of pattern P in text T by + * performing explicit character comparisons at different alignments. Instead of + * a brute-force search of all alignments (of which there are n - m + 1), + * Boyer–Moore uses information gained by preprocessing P to skip as many + * alignments as possible. + * + * @details + * The key insight in this algorithm is that if the end of the pattern is + * compared to the text, then jumps along the text can be made rather than + * checking every character of the text. The reason that this works is that in + * lining up the pattern against the text, the last character of the pattern is + * compared to the character in the text. + * + * If the characters do not match, there is no need to continue searching + * backwards along the text. This leaves us with two cases. + * + * Case 1: + * If the character in the text does not match any of the characters in the + * pattern, then the next character in the text to check is located m characters + * farther along the text, where m is the length of the pattern. + * + * Case 2: + * If the character in the text is in the pattern, then a partial shift of the + * pattern along the text is done to line up along the matching character and + * the process is repeated. + * + * There are two shift rules: + * + * [The bad character rule] + * (https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_bad_character_rule) + * + * [The good suffix rule] + * (https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_good_suffix_rule) + * + * The shift rules are implemented as constant-time table lookups, using tables + * generated during the preprocessing of P. + * @author [Stoycho Kyosev](https://github.com/stoychoX) + */ + +#include /// for assert +#include /// for CHAR_MAX macro +#include /// for strlen +#include /// for IO operations +#include /// for std::string +#include /// for std::vector + +#define APLHABET_SIZE CHAR_MAX ///< number of symbols in the alphabet we use + +/** + * @namespace + * @brief String algorithms + */ +namespace strings { +/** + * @namespace + * @brief Functions for the [Boyer + * Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm) + * algorithm implementation + */ +namespace boyer_moore { +/** + * @brief A structure representing all the data we need to search the + * preprocessed pattern in text. + */ +struct pattern { + std::string pat; + + std::vector + bad_char; ///< bad char table used in [Bad Character + ///< Heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-for-pattern-searching/) + + std::vector + good_suffix; ///< good suffix table used for [Good Suffix + ///< heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-good-suffix-heuristic/?ref=rp) +}; + +/** + * @brief A function that preprocess the good suffix thable + * + * @param str The string being preprocessed + * @param arg The good suffix table + * @returns void + */ +void init_good_suffix(const std::string& str, std::vector& arg) { + arg.resize(str.size() + 1, 0); + + // border_pos[i] - the index of the longest proper suffix of str[i..] which + // is also a proper prefix. + std::vector border_pos(str.size() + 1, 0); + + size_t current_char = str.length(); + + size_t border_index = str.length() + 1; + + border_pos[current_char] = border_index; + + while (current_char > 0) { + while (border_index <= str.length() && + str[current_char - 1] != str[border_index - 1]) { + if (arg[border_index] == 0) { + arg[border_index] = border_index - current_char; + } + + border_index = border_pos[border_index]; + } + + current_char--; + border_index--; + border_pos[current_char] = border_index; + } + + size_t largest_border_index = border_pos[0]; + + for (size_t i = 0; i < str.size(); i++) { + if (arg[i] == 0) { + arg[i] = largest_border_index; + } + + // If we go pass the largest border we find the next one as we iterate + if (i == largest_border_index) { + largest_border_index = border_pos[largest_border_index]; + } + } +} + +/** + * @brief A function that preprocess the bad char table + * + * @param str The string being preprocessed + * @param arg The bad char table + * @returns void + */ +void init_bad_char(const std::string& str, std::vector& arg) { + arg.resize(APLHABET_SIZE, str.length()); + + for (size_t i = 0; i < str.length(); i++) { + arg[str[i]] = str.length() - i - 1; + } +} + +/** + * @brief A function that initializes pattern + * + * @param str Text used for initialization + * @param arg Initialized structure + * @returns void + */ +void init_pattern(const std::string& str, pattern& arg) { + arg.pat = str; + init_bad_char(str, arg.bad_char); + init_good_suffix(str, arg.good_suffix); +} +/** + * @brief A function that implements Boyer-Moore's algorithm. + * + * @param str Text we are seatching in. + * @param arg pattern structure containing the preprocessed pattern + * @return Vector of indexes of the occurrences of pattern in text + */ +std::vector search(const std::string& str, const pattern& arg) { + size_t index_position = arg.pat.size() - 1; + std::vector index_storage; + + while (index_position < str.length()) { + size_t index_string = index_position; + int index_pattern = static_cast(arg.pat.size()) - 1; + + while (index_pattern >= 0 && + str[index_string] == arg.pat[index_pattern]) { + --index_pattern; + --index_string; + } + + if (index_pattern < 0) { + index_storage.push_back(index_position - arg.pat.length() + 1); + index_position += arg.good_suffix[0]; + } else { + index_position += std::max(arg.bad_char[str[index_string]], + arg.good_suffix[index_pattern + 1]); + } + } + + return index_storage; +} + +/** + * @brief Check if pat is prefix of str. + * + * @param str pointer to some part of the input text. + * @param pat the searched pattern. + * @param len length of the searched pattern + * @returns `true` if pat IS prefix of str. + * @returns `false` if pat is NOT a prefix of str. + */ +bool is_prefix(const char* str, const char* pat, size_t len) { + if (strlen(str) < len) { + return false; + } + + for (size_t i = 0; i < len; i++) { + if (str[i] != pat[i]) { + return false; + } + } + + return true; +} +} // namespace boyer_moore +} // namespace strings +/** + * @brief A test case in which we search for every appearance of the word 'and' + * @param text The text in which we search for appearance of the word 'and' + * @returns void + */ +void and_test(const char* text) { + strings::boyer_moore::pattern ands; + strings::boyer_moore::init_pattern("and", ands); + std::vector indexes = strings::boyer_moore::search(text, ands); + + assert(indexes.size() == 2); + assert(strings::boyer_moore::is_prefix(text + indexes[0], "and", 3)); + assert(strings::boyer_moore::is_prefix(text + indexes[1], "and", 3)); +} + +/** + * @brief A test case in which we search for every appearance of the word 'pat' + * @param text The text in which we search for appearance of the word 'pat' + * @returns void + */ +void pat_test(const char* text) { + strings::boyer_moore::pattern pat; + strings::boyer_moore::init_pattern("pat", pat); + std::vector indexes = strings::boyer_moore::search(text, pat); + + assert(indexes.size() == 6); + + for (const auto& currentIndex : indexes) { + assert(strings::boyer_moore::is_prefix(text + currentIndex, "pat", 3)); + } +} +/** + * @brief Self-test implementations + * @returns void + */ +static void tests() { + const char* text = + "When pat Mr. and Mrs. pat Dursley woke up on the dull, gray \ + Tuesday our story starts, \ + there was nothing about pat the cloudy sky outside to pat suggest that\ + strange and \ + mysterious things would pat soon be happening all pat over the \ + country."; + + and_test(text); + pat_test(text); + + std::cout << "All tests have successfully passed!\n"; +} + +/** + * @brief Main function + * @returns 0 on exit + */ +int main() { + tests(); // run self-test implementations + return 0; +}