mirror of
https://hub.njuu.cf/TheAlgorithms/C-Plus-Plus.git
synced 2023-10-11 13:05:55 +08:00
feat: add Boyer Moore algorithm implementation (#2441)
* add boyer moore algorithm implementation * add a one-line description of what the library/header is for * fix comments pattern and make tests static * add documentation * add namespaces * fix all warnings for clang-tydy.exe <filename> * Change lib from limits to climits (CHAR_MAX macro) Co-authored-by: David Leal <halfpacho@gmail.com> * Add breif description of boyer-moore algorithm * Fix styling * Add needed documentation * my commit * fix type of index_pattern * Fix clang-warnings * chore: apply suggestions from code review * chore: add print message after tests * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * Update strings/boyer_moore.cpp Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com> * fix: variable name * Update strings/boyer_moore.cpp Co-authored-by: David Leal <halfpacho@gmail.com> --------- Co-authored-by: David Leal <halfpacho@gmail.com> Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>
This commit is contained in:
parent
5ef7ad5cfe
commit
d7a9869dce
29
.vscode/settings.json
vendored
29
.vscode/settings.json
vendored
@ -59,6 +59,33 @@
|
||||
"stdexcept": "cpp",
|
||||
"streambuf": "cpp",
|
||||
"typeinfo": "cpp",
|
||||
"valarray": "cpp"
|
||||
"valarray": "cpp",
|
||||
"bit": "cpp",
|
||||
"charconv": "cpp",
|
||||
"compare": "cpp",
|
||||
"concepts": "cpp",
|
||||
"format": "cpp",
|
||||
"forward_list": "cpp",
|
||||
"ios": "cpp",
|
||||
"locale": "cpp",
|
||||
"queue": "cpp",
|
||||
"stack": "cpp",
|
||||
"xfacet": "cpp",
|
||||
"xhash": "cpp",
|
||||
"xiosbase": "cpp",
|
||||
"xlocale": "cpp",
|
||||
"xlocbuf": "cpp",
|
||||
"xlocinfo": "cpp",
|
||||
"xlocmes": "cpp",
|
||||
"xlocmon": "cpp",
|
||||
"xlocnum": "cpp",
|
||||
"xloctime": "cpp",
|
||||
"xmemory": "cpp",
|
||||
"xstddef": "cpp",
|
||||
"xstring": "cpp",
|
||||
"xtr1common": "cpp",
|
||||
"xtree": "cpp",
|
||||
"xutility": "cpp",
|
||||
"climits": "cpp"
|
||||
}
|
||||
}
|
||||
|
270
strings/boyer_moore.cpp
Normal file
270
strings/boyer_moore.cpp
Normal file
@ -0,0 +1,270 @@
|
||||
/**
|
||||
* @file
|
||||
* @brief
|
||||
* The [Boyer–Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm) algorithm searches for occurrences of pattern P in text T by
|
||||
* performing explicit character comparisons at different alignments. Instead of
|
||||
* a brute-force search of all alignments (of which there are n - m + 1),
|
||||
* Boyer–Moore uses information gained by preprocessing P to skip as many
|
||||
* alignments as possible.
|
||||
*
|
||||
* @details
|
||||
* The key insight in this algorithm is that if the end of the pattern is
|
||||
* compared to the text, then jumps along the text can be made rather than
|
||||
* checking every character of the text. The reason that this works is that in
|
||||
* lining up the pattern against the text, the last character of the pattern is
|
||||
* compared to the character in the text.
|
||||
*
|
||||
* If the characters do not match, there is no need to continue searching
|
||||
* backwards along the text. This leaves us with two cases.
|
||||
*
|
||||
* Case 1:
|
||||
* If the character in the text does not match any of the characters in the
|
||||
* pattern, then the next character in the text to check is located m characters
|
||||
* farther along the text, where m is the length of the pattern.
|
||||
*
|
||||
* Case 2:
|
||||
* If the character in the text is in the pattern, then a partial shift of the
|
||||
* pattern along the text is done to line up along the matching character and
|
||||
* the process is repeated.
|
||||
*
|
||||
* There are two shift rules:
|
||||
*
|
||||
* [The bad character rule]
|
||||
* (https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_bad_character_rule)
|
||||
*
|
||||
* [The good suffix rule]
|
||||
* (https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_good_suffix_rule)
|
||||
*
|
||||
* The shift rules are implemented as constant-time table lookups, using tables
|
||||
* generated during the preprocessing of P.
|
||||
* @author [Stoycho Kyosev](https://github.com/stoychoX)
|
||||
*/
|
||||
|
||||
#include <cassert> /// for assert
|
||||
#include <climits> /// for CHAR_MAX macro
|
||||
#include <cstring> /// for strlen
|
||||
#include <iostream> /// for IO operations
|
||||
#include <string> /// for std::string
|
||||
#include <vector> /// for std::vector
|
||||
|
||||
#define APLHABET_SIZE CHAR_MAX ///< number of symbols in the alphabet we use
|
||||
|
||||
/**
|
||||
* @namespace
|
||||
* @brief String algorithms
|
||||
*/
|
||||
namespace strings {
|
||||
/**
|
||||
* @namespace
|
||||
* @brief Functions for the [Boyer
|
||||
* Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm)
|
||||
* algorithm implementation
|
||||
*/
|
||||
namespace boyer_moore {
|
||||
/**
|
||||
* @brief A structure representing all the data we need to search the
|
||||
* preprocessed pattern in text.
|
||||
*/
|
||||
struct pattern {
|
||||
std::string pat;
|
||||
|
||||
std::vector<size_t>
|
||||
bad_char; ///< bad char table used in [Bad Character
|
||||
///< Heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-for-pattern-searching/)
|
||||
|
||||
std::vector<size_t>
|
||||
good_suffix; ///< good suffix table used for [Good Suffix
|
||||
///< heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-good-suffix-heuristic/?ref=rp)
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A function that preprocess the good suffix thable
|
||||
*
|
||||
* @param str The string being preprocessed
|
||||
* @param arg The good suffix table
|
||||
* @returns void
|
||||
*/
|
||||
void init_good_suffix(const std::string& str, std::vector<size_t>& arg) {
|
||||
arg.resize(str.size() + 1, 0);
|
||||
|
||||
// border_pos[i] - the index of the longest proper suffix of str[i..] which
|
||||
// is also a proper prefix.
|
||||
std::vector<size_t> border_pos(str.size() + 1, 0);
|
||||
|
||||
size_t current_char = str.length();
|
||||
|
||||
size_t border_index = str.length() + 1;
|
||||
|
||||
border_pos[current_char] = border_index;
|
||||
|
||||
while (current_char > 0) {
|
||||
while (border_index <= str.length() &&
|
||||
str[current_char - 1] != str[border_index - 1]) {
|
||||
if (arg[border_index] == 0) {
|
||||
arg[border_index] = border_index - current_char;
|
||||
}
|
||||
|
||||
border_index = border_pos[border_index];
|
||||
}
|
||||
|
||||
current_char--;
|
||||
border_index--;
|
||||
border_pos[current_char] = border_index;
|
||||
}
|
||||
|
||||
size_t largest_border_index = border_pos[0];
|
||||
|
||||
for (size_t i = 0; i < str.size(); i++) {
|
||||
if (arg[i] == 0) {
|
||||
arg[i] = largest_border_index;
|
||||
}
|
||||
|
||||
// If we go pass the largest border we find the next one as we iterate
|
||||
if (i == largest_border_index) {
|
||||
largest_border_index = border_pos[largest_border_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief A function that preprocess the bad char table
|
||||
*
|
||||
* @param str The string being preprocessed
|
||||
* @param arg The bad char table
|
||||
* @returns void
|
||||
*/
|
||||
void init_bad_char(const std::string& str, std::vector<size_t>& arg) {
|
||||
arg.resize(APLHABET_SIZE, str.length());
|
||||
|
||||
for (size_t i = 0; i < str.length(); i++) {
|
||||
arg[str[i]] = str.length() - i - 1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief A function that initializes pattern
|
||||
*
|
||||
* @param str Text used for initialization
|
||||
* @param arg Initialized structure
|
||||
* @returns void
|
||||
*/
|
||||
void init_pattern(const std::string& str, pattern& arg) {
|
||||
arg.pat = str;
|
||||
init_bad_char(str, arg.bad_char);
|
||||
init_good_suffix(str, arg.good_suffix);
|
||||
}
|
||||
/**
|
||||
* @brief A function that implements Boyer-Moore's algorithm.
|
||||
*
|
||||
* @param str Text we are seatching in.
|
||||
* @param arg pattern structure containing the preprocessed pattern
|
||||
* @return Vector of indexes of the occurrences of pattern in text
|
||||
*/
|
||||
std::vector<size_t> search(const std::string& str, const pattern& arg) {
|
||||
size_t index_position = arg.pat.size() - 1;
|
||||
std::vector<size_t> index_storage;
|
||||
|
||||
while (index_position < str.length()) {
|
||||
size_t index_string = index_position;
|
||||
int index_pattern = static_cast<int>(arg.pat.size()) - 1;
|
||||
|
||||
while (index_pattern >= 0 &&
|
||||
str[index_string] == arg.pat[index_pattern]) {
|
||||
--index_pattern;
|
||||
--index_string;
|
||||
}
|
||||
|
||||
if (index_pattern < 0) {
|
||||
index_storage.push_back(index_position - arg.pat.length() + 1);
|
||||
index_position += arg.good_suffix[0];
|
||||
} else {
|
||||
index_position += std::max(arg.bad_char[str[index_string]],
|
||||
arg.good_suffix[index_pattern + 1]);
|
||||
}
|
||||
}
|
||||
|
||||
return index_storage;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check if pat is prefix of str.
|
||||
*
|
||||
* @param str pointer to some part of the input text.
|
||||
* @param pat the searched pattern.
|
||||
* @param len length of the searched pattern
|
||||
* @returns `true` if pat IS prefix of str.
|
||||
* @returns `false` if pat is NOT a prefix of str.
|
||||
*/
|
||||
bool is_prefix(const char* str, const char* pat, size_t len) {
|
||||
if (strlen(str) < len) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
if (str[i] != pat[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
} // namespace boyer_moore
|
||||
} // namespace strings
|
||||
/**
|
||||
* @brief A test case in which we search for every appearance of the word 'and'
|
||||
* @param text The text in which we search for appearance of the word 'and'
|
||||
* @returns void
|
||||
*/
|
||||
void and_test(const char* text) {
|
||||
strings::boyer_moore::pattern ands;
|
||||
strings::boyer_moore::init_pattern("and", ands);
|
||||
std::vector<size_t> indexes = strings::boyer_moore::search(text, ands);
|
||||
|
||||
assert(indexes.size() == 2);
|
||||
assert(strings::boyer_moore::is_prefix(text + indexes[0], "and", 3));
|
||||
assert(strings::boyer_moore::is_prefix(text + indexes[1], "and", 3));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief A test case in which we search for every appearance of the word 'pat'
|
||||
* @param text The text in which we search for appearance of the word 'pat'
|
||||
* @returns void
|
||||
*/
|
||||
void pat_test(const char* text) {
|
||||
strings::boyer_moore::pattern pat;
|
||||
strings::boyer_moore::init_pattern("pat", pat);
|
||||
std::vector<size_t> indexes = strings::boyer_moore::search(text, pat);
|
||||
|
||||
assert(indexes.size() == 6);
|
||||
|
||||
for (const auto& currentIndex : indexes) {
|
||||
assert(strings::boyer_moore::is_prefix(text + currentIndex, "pat", 3));
|
||||
}
|
||||
}
|
||||
/**
|
||||
* @brief Self-test implementations
|
||||
* @returns void
|
||||
*/
|
||||
static void tests() {
|
||||
const char* text =
|
||||
"When pat Mr. and Mrs. pat Dursley woke up on the dull, gray \
|
||||
Tuesday our story starts, \
|
||||
there was nothing about pat the cloudy sky outside to pat suggest that\
|
||||
strange and \
|
||||
mysterious things would pat soon be happening all pat over the \
|
||||
country.";
|
||||
|
||||
and_test(text);
|
||||
pat_test(text);
|
||||
|
||||
std::cout << "All tests have successfully passed!\n";
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Main function
|
||||
* @returns 0 on exit
|
||||
*/
|
||||
int main() {
|
||||
tests(); // run self-test implementations
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user