TheAlgorithms-C-Plus-Plus/strings/boyer_moore.cpp

/**
 * @file
 * @brief
 * The
 * [Boyer–Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm)
 * algorithm searches for occurrences of pattern P in text T by performing
 * explicit character comparisons at different alignments. Instead of a
 * brute-force search of all alignments (of which there are n - m + 1),
 * Boyer–Moore uses information gained by preprocessing P to skip as many
 * alignments as possible.
 *
 * @details
 * The key insight in this algorithm is that if the end of the pattern is
 * compared to the text, then jumps along the text can be made rather than
 * checking every character of the text. The reason that this works is that in
 * lining up the pattern against the text, the last character of the pattern is
 * compared to the character in the text.
 *
 * If the characters do not match, there is no need to continue searching
 * backwards along the text. This leaves us with two cases.
 *
 * Case 1:
 * If the character in the text does not match any of the characters in the
 * pattern, then the next character in the text to check is located m characters
 * farther along the text, where m is the length of the pattern.
 *
 * Case 2:
 * If the character in the text is in the pattern, then a partial shift of the
 * pattern along the text is done to line up along the matching character and
 * the process is repeated.
 *
 * There are two shift rules:
 *
 * [The bad character rule](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_bad_character_rule)
 *
 * [The good suffix rule](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_good_suffix_rule)
 *
 * The shift rules are implemented as constant-time table lookups, using tables
 * generated during the preprocessing of P.
 * @author [Stoycho Kyosev](https://github.com/stoychoX)
 */

#include <cassert>   /// for assert
#include <climits>   /// for CHAR_MAX macro
#include <cstring>   /// for strlen
#include <iostream>  /// for IO operations
#include <string>    /// for std::string
#include <vector>    /// for std::vector

#define APLHABET_SIZE CHAR_MAX  ///< number of symbols in the alphabet we use

/**
 * @namespace
 * @brief String algorithms
 */
namespace strings {
/**
 * @namespace
 * @brief Functions for the [Boyer
 * Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm)
 * algorithm implementation
 */
namespace boyer_moore {
/**
 * @brief Number of entries in the bad character table: one slot for every
 * value an `unsigned char` can take, so that any byte of the text is a valid
 * index into the table.
 */
constexpr size_t bad_char_table_size = static_cast<size_t>(UCHAR_MAX) + 1;

/**
 * @brief A structure representing all the data we need to search the
 * preprocessed pattern in text.
 */
struct pattern {
    std::string pat;  ///< the pattern itself

    std::vector<size_t>
        bad_char;  ///< bad char table used in [Bad Character
                   ///< Heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-for-pattern-searching/);
                   ///< indexed by the character converted to `unsigned char`

    std::vector<size_t>
        good_suffix;  ///< good suffix table used for [Good Suffix
                      ///< heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-good-suffix-heuristic/?ref=rp);
                      ///< holds `pat.size() + 1` shifts
};

/**
 * @brief A function that preprocesses the good suffix table
 *
 * @details After the call `arg[j]` (for `1 <= j <= str.size()`) is the shift
 * to apply when the suffix `str[j..]` matched and `str[j - 1]` did not, and
 * `arg[0]` is the shift to apply after a complete match.
 *
 * @param str The string being preprocessed
 * @param arg The good suffix table; any previous content is discarded
 * @returns void
 */
void init_good_suffix(const std::string& str, std::vector<size_t>& arg) {
    const size_t length = str.size();

    // assign() instead of resize(): a table that is reused for another
    // pattern must not keep the shifts computed for the previous one.
    arg.assign(length + 1, 0);

    // border_pos[i] - start index of the widest border of the suffix
    // str[i..]; length + 1 is stored for the empty suffix.
    std::vector<size_t> border_pos(length + 1, 0);

    size_t current_char = length;
    size_t border_index = length + 1;

    border_pos[current_char] = border_index;

    while (current_char > 0) {
        // The border cannot be extended to the left: record the shift for
        // the mismatch position and fall back to the next narrower border.
        while (border_index <= length &&
               str[current_char - 1] != str[border_index - 1]) {
            if (arg[border_index] == 0) {
                arg[border_index] = border_index - current_char;
            }

            border_index = border_pos[border_index];
        }

        current_char--;
        border_index--;
        border_pos[current_char] = border_index;
    }

    size_t largest_border_index = border_pos[0];

    // Fill every slot the first pass left empty. The loop includes the index
    // `length` so that the entry used for a mismatch on the last pattern
    // character is always a positive shift.
    for (size_t i = 0; i <= length; i++) {
        if (arg[i] == 0) {
            arg[i] = largest_border_index;
        }

        // If we go past the largest border we find the next one as we iterate
        if (i == largest_border_index) {
            largest_border_index = border_pos[largest_border_index];
        }
    }
}

/**
 * @brief A function that preprocesses the bad char table
 *
 * @details After the call `arg[c]` is the distance from the last occurrence
 * of the character `c` in `str` to the end of `str`, or `str.length()` when
 * `c` does not occur in `str`.
 *
 * @param str The string being preprocessed
 * @param arg The bad char table; any previous content is discarded
 * @returns void
 */
void init_bad_char(const std::string& str, std::vector<size_t>& arg) {
    arg.assign(bad_char_table_size, str.length());

    for (size_t i = 0; i < str.length(); i++) {
        // Plain char may be signed; go through unsigned char so that bytes
        // above 0x7F do not become negative indexes.
        arg[static_cast<unsigned char>(str[i])] = str.length() - i - 1;
    }
}

/**
 * @brief A function that initializes pattern
 *
 * @param str Text used for initialization
 * @param arg Initialized structure
 * @returns void
 */
void init_pattern(const std::string& str, pattern& arg) {
    arg.pat = str;
    init_bad_char(str, arg.bad_char);
    init_good_suffix(str, arg.good_suffix);
}

/**
 * @brief A function that implements Boyer-Moore's algorithm.
 *
 * @param str Text we are searching in.
 * @param arg pattern structure containing the preprocessed pattern; it has to
 * be prepared with init_pattern() before the call
 * @return Vector of indexes of the occurrences of pattern in text, in
 * increasing order. Empty when the pattern is empty or longer than the text.
 */
std::vector<size_t> search(const std::string& str, const pattern& arg) {
    std::vector<size_t> index_storage;

    const size_t pattern_length = arg.pat.size();
    if (pattern_length == 0 || pattern_length > str.size()) {
        return index_storage;
    }

    // index_position is the text index aligned with the last pattern char.
    size_t index_position = pattern_length - 1;

    while (index_position < str.size()) {
        const size_t window_start = index_position - (pattern_length - 1);

        // Compare right to left; `unmatched` counts the pattern characters
        // that have not been confirmed yet.
        size_t unmatched = pattern_length;
        while (unmatched > 0 &&
               str[window_start + unmatched - 1] == arg.pat[unmatched - 1]) {
            --unmatched;
        }

        if (unmatched == 0) {
            index_storage.push_back(window_start);
            index_position += arg.good_suffix[0];
            continue;
        }

        // The bad char table stores distances measured from the END of the
        // pattern, while the mismatch happened `matched` characters before
        // the end, so that part has to be subtracted. Using the raw table
        // value would jump over valid alignments (e.g. "abb" in "xabb").
        const size_t matched = pattern_length - unmatched;
        const unsigned char mismatched_char =
            static_cast<unsigned char>(str[window_start + unmatched - 1]);
        const size_t bad_char_distance = arg.bad_char[mismatched_char];
        const size_t bad_char_shift =
            bad_char_distance > matched ? bad_char_distance - matched : 1;

        const size_t good_suffix_shift = arg.good_suffix[unmatched];

        index_position += bad_char_shift > good_suffix_shift
                              ? bad_char_shift
                              : good_suffix_shift;
    }

    return index_storage;
}

/**
 * @brief Check if pat is prefix of str.
 *
 * @param str pointer to some part of the input text (null-terminated).
 * @param pat the searched pattern.
 * @param len length of the searched pattern
 * @returns `true` if pat IS prefix of str.
 * @returns `false` if pat is NOT a prefix of str.
 */
bool is_prefix(const char* str, const char* pat, size_t len) {
    for (size_t i = 0; i < len; i++) {
        // A terminator within the first len characters means str is shorter
        // than the pattern; checking it here avoids a strlen() over the whole
        // remaining text.
        if (str[i] == '\0' || str[i] != pat[i]) {
            return false;
        }
    }

    return true;
}
}  // namespace boyer_moore
}  // namespace strings
/**
 * @brief A test case in which we search for every appearance of the word 'and'
 * @param text The text in which we search for appearance of the word 'and'
 * @returns void
 */
void and_test(const char* text) {
    strings::boyer_moore::pattern ands;
    strings::boyer_moore::init_pattern("and", ands);
    std::vector<size_t> indexes = strings::boyer_moore::search(text, ands);

    assert(indexes.size() == 2);
    assert(strings::boyer_moore::is_prefix(text + indexes[0], "and", 3));
    assert(strings::boyer_moore::is_prefix(text + indexes[1], "and", 3));
}

/**
 * @brief  A test case in which we search for every appearance of the word 'pat'
 * @param text The text in which we search for appearance of the word 'pat'
 * @returns void
 */
void pat_test(const char* text) {
    strings::boyer_moore::pattern pat;
    strings::boyer_moore::init_pattern("pat", pat);
    std::vector<size_t> indexes = strings::boyer_moore::search(text, pat);

    assert(indexes.size() == 6);

    for (const auto& currentIndex : indexes) {
        assert(strings::boyer_moore::is_prefix(text + currentIndex, "pat", 3));
    }
}
/**
 * @brief Self-test implementations: runs both word searches over one sample
 * passage and prints a confirmation once they have passed.
 * @returns void
 */
static void tests() {
    // The backslash continuations keep the leading whitespace of the
    // following lines inside the literal.
    const char* passage =
        "When pat Mr. and Mrs. pat Dursley woke up on the dull, gray  \
                            Tuesday our story starts, \
                there was nothing about pat the cloudy sky outside to pat suggest that\
                        strange and \
                    mysterious things would pat soon be happening all pat over the \
                        country.";

    and_test(passage);
    pat_test(passage);

    std::cout << "All tests have successfully passed!\n";
}

/**
 * @brief Program entry point; only runs the self-tests.
 * @returns 0 on exit
 */
int main() {
    // A failed assertion inside tests() aborts before the return is reached.
    tests();
    return 0;
}
-												feat: add Boyer Moore algorithm implementation (#2441)

* add boyer moore algorithm implementation

* add a one-line description of what the library/header is for

* fix comments pattern and make tests static

* add documentation

* add namespaces

* fix all warnings for clang-tydy.exe <filename>

* Change lib from limits to climits (CHAR_MAX macro)

Co-authored-by: David Leal <halfpacho@gmail.com>

* Add breif description of boyer-moore algorithm

* Fix styling

* Add needed documentation

* my commit

* fix type of index_pattern

* Fix clang-warnings

* chore: apply suggestions from code review

* chore: add print message after tests

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* fix: variable name

* Update strings/boyer_moore.cpp

Co-authored-by: David Leal <halfpacho@gmail.com>

---------

Co-authored-by: David Leal <halfpacho@gmail.com>
Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

											
										
										
											2023-06-17 05:38:38 +08:00
+								/**
 								 * @file
 								 * @brief
-												feat: update CMake version to 3.26.4 (#2486)

* update cmake version

* clang-format and clang-tidy fixes for 402c5627

---------

Co-authored-by: David Leal <halfpacho@gmail.com>
Co-authored-by: github-actions[bot] <github-actions@users.noreply.github.com>

											
										
										
											2023-06-17 06:08:45 +08:00
+								 * The
 								 * [Boyer–Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm)
 								 * algorithm searches for occurrences of pattern P in text T by performing
 								 * explicit character comparisons at different alignments. Instead of a
 								 * brute-force search of all alignments (of which there are n - m + 1),
-												feat: add Boyer Moore algorithm implementation (#2441)

* add boyer moore algorithm implementation

* add a one-line description of what the library/header is for

* fix comments pattern and make tests static

* add documentation

* add namespaces

* fix all warnings for clang-tydy.exe <filename>

* Change lib from limits to climits (CHAR_MAX macro)

Co-authored-by: David Leal <halfpacho@gmail.com>

* Add breif description of boyer-moore algorithm

* Fix styling

* Add needed documentation

* my commit

* fix type of index_pattern

* Fix clang-warnings

* chore: apply suggestions from code review

* chore: add print message after tests

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* Update strings/boyer_moore.cpp

Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

* fix: variable name

* Update strings/boyer_moore.cpp

Co-authored-by: David Leal <halfpacho@gmail.com>

---------

Co-authored-by: David Leal <halfpacho@gmail.com>
Co-authored-by: realstealthninja <68815218+realstealthninja@users.noreply.github.com>

											
										
										
											2023-06-17 05:38:38 +08:00
+								 * Boyer–Moore uses information gained by preprocessing P to skip as many
 								 * alignments as possible.
 								 *
 								 * @details
 								 * The key insight in this algorithm is that if the end of the pattern is
 								 * compared to the text, then jumps along the text can be made rather than
 								 * checking every character of the text. The reason that this works is that in
 								 * lining up the pattern against the text, the last character of the pattern is
 								 * compared to the character in the text.
 								 *
 								 * If the characters do not match, there is no need to continue searching
 								 * backwards along the text. This leaves us with two cases.
 								 *
 								 * Case 1:
 								 * If the character in the text does not match any of the characters in the
 								 * pattern, then the next character in the text to check is located m characters
 								 * farther along the text, where m is the length of the pattern.
 								 *
 								 * Case 2:
 								 * If the character in the text is in the pattern, then a partial shift of the
 								 * pattern along the text is done to line up along the matching character and
 								 * the process is repeated.
 								 *
 								 * There are two shift rules:
 								 *
 								 * [The bad character rule](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_bad_character_rule)
 								 *
 								 * [The good suffix rule](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_good_suffix_rule)
 								 *
 								 * The shift rules are implemented as constant-time table lookups, using tables
 								 * generated during the preprocessing of P.
 								 * @author [Stoycho Kyosev](https://github.com/stoychoX)
 								 */
 								#include <cassert>   /// for assert
 								#include <climits>   /// for CHAR_MAX macro
 								#include <cstring>   /// for strlen
 								#include <iostream>  /// for IO operations
 								#include <string>    /// for std::string
 								#include <vector>    /// for std::vector
 								#define APLHABET_SIZE CHAR_MAX  ///< number of symbols in the alphabet we use
 								/**
 								 * @namespace
 								 * @brief String algorithms
 								 */
 								namespace strings {
 								/**
 								 * @namespace
 								 * @brief Functions for the [Boyer
 								 * Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm)
 								 * algorithm implementation
 								 */
 								namespace boyer_moore {
 								/**
 								 * @brief Number of entries in the bad character table: one slot for every
 								 * value an `unsigned char` can take, so that any byte of the text is a valid
 								 * index into the table.
 								 */
 								constexpr size_t bad_char_table_size = static_cast<size_t>(UCHAR_MAX) + 1;
 								/**
 								 * @brief A structure representing all the data we need to search the
 								 * preprocessed pattern in text.
 								 */
 								struct pattern {
 								    std::string pat;  ///< the pattern itself
 								    std::vector<size_t>
 								        bad_char;  ///< bad char table used in [Bad Character
 								                   ///< Heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-for-pattern-searching/);
 								                   ///< indexed by the character converted to `unsigned char`
 								    std::vector<size_t>
 								        good_suffix;  ///< good suffix table used for [Good Suffix
 								                      ///< heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-good-suffix-heuristic/?ref=rp);
 								                      ///< holds `pat.size() + 1` shifts
 								};
 								/**
 								 * @brief A function that preprocesses the good suffix table
 								 *
 								 * @details After the call `arg[j]` (for `1 <= j <= str.size()`) is the shift
 								 * to apply when the suffix `str[j..]` matched and `str[j - 1]` did not, and
 								 * `arg[0]` is the shift to apply after a complete match.
 								 *
 								 * @param str The string being preprocessed
 								 * @param arg The good suffix table; any previous content is discarded
 								 * @returns void
 								 */
 								void init_good_suffix(const std::string& str, std::vector<size_t>& arg) {
 								    const size_t length = str.size();
 								    // assign() instead of resize(): a table that is reused for another
 								    // pattern must not keep the shifts computed for the previous one.
 								    arg.assign(length + 1, 0);
 								    // border_pos[i] - start index of the widest border of the suffix
 								    // str[i..]; length + 1 is stored for the empty suffix.
 								    std::vector<size_t> border_pos(length + 1, 0);
 								    size_t current_char = length;
 								    size_t border_index = length + 1;
 								    border_pos[current_char] = border_index;
 								    while (current_char > 0) {
 								        // The border cannot be extended to the left: record the shift for
 								        // the mismatch position and fall back to the next narrower border.
 								        while (border_index <= length &&
 								               str[current_char - 1] != str[border_index - 1]) {
 								            if (arg[border_index] == 0) {
 								                arg[border_index] = border_index - current_char;
 								            }
 								            border_index = border_pos[border_index];
 								        }
 								        current_char--;
 								        border_index--;
 								        border_pos[current_char] = border_index;
 								    }
 								    size_t largest_border_index = border_pos[0];
 								    // Fill every slot the first pass left empty. The loop includes the index
 								    // `length` so that the entry used for a mismatch on the last pattern
 								    // character is always a positive shift.
 								    for (size_t i = 0; i <= length; i++) {
 								        if (arg[i] == 0) {
 								            arg[i] = largest_border_index;
 								        }
 								        // If we go past the largest border we find the next one as we iterate
 								        if (i == largest_border_index) {
 								            largest_border_index = border_pos[largest_border_index];
 								        }
 								    }
 								}
 								/**
 								 * @brief A function that preprocesses the bad char table
 								 *
 								 * @details After the call `arg[c]` is the distance from the last occurrence
 								 * of the character `c` in `str` to the end of `str`, or `str.length()` when
 								 * `c` does not occur in `str`.
 								 *
 								 * @param str The string being preprocessed
 								 * @param arg The bad char table; any previous content is discarded
 								 * @returns void
 								 */
 								void init_bad_char(const std::string& str, std::vector<size_t>& arg) {
 								    arg.assign(bad_char_table_size, str.length());
 								    for (size_t i = 0; i < str.length(); i++) {
 								        // Plain char may be signed; go through unsigned char so that bytes
 								        // above 0x7F do not become negative indexes.
 								        arg[static_cast<unsigned char>(str[i])] = str.length() - i - 1;
 								    }
 								}
 								/**
 								 * @brief A function that initializes pattern
 								 *
 								 * @param str Text used for initialization
 								 * @param arg Initialized structure
 								 * @returns void
 								 */
 								void init_pattern(const std::string& str, pattern& arg) {
 								    arg.pat = str;
 								    init_bad_char(str, arg.bad_char);
 								    init_good_suffix(str, arg.good_suffix);
 								}
 								/**
 								 * @brief A function that implements Boyer-Moore's algorithm.
 								 *
 								 * @param str Text we are searching in.
 								 * @param arg pattern structure containing the preprocessed pattern; it has to
 								 * be prepared with init_pattern() before the call
 								 * @return Vector of indexes of the occurrences of pattern in text, in
 								 * increasing order. Empty when the pattern is empty or longer than the text.
 								 */
 								std::vector<size_t> search(const std::string& str, const pattern& arg) {
 								    std::vector<size_t> index_storage;
 								    const size_t pattern_length = arg.pat.size();
 								    if (pattern_length == 0 || pattern_length > str.size()) {
 								        return index_storage;
 								    }
 								    // index_position is the text index aligned with the last pattern char.
 								    size_t index_position = pattern_length - 1;
 								    while (index_position < str.size()) {
 								        const size_t window_start = index_position - (pattern_length - 1);
 								        // Compare right to left; `unmatched` counts the pattern characters
 								        // that have not been confirmed yet.
 								        size_t unmatched = pattern_length;
 								        while (unmatched > 0 &&
 								               str[window_start + unmatched - 1] == arg.pat[unmatched - 1]) {
 								            --unmatched;
 								        }
 								        if (unmatched == 0) {
 								            index_storage.push_back(window_start);
 								            index_position += arg.good_suffix[0];
 								            continue;
 								        }
 								        // The bad char table stores distances measured from the END of the
 								        // pattern, while the mismatch happened `matched` characters before
 								        // the end, so that part has to be subtracted. Using the raw table
 								        // value would jump over valid alignments (e.g. "abb" in "xabb").
 								        const size_t matched = pattern_length - unmatched;
 								        const unsigned char mismatched_char =
 								            static_cast<unsigned char>(str[window_start + unmatched - 1]);
 								        const size_t bad_char_distance = arg.bad_char[mismatched_char];
 								        const size_t bad_char_shift =
 								            bad_char_distance > matched ? bad_char_distance - matched : 1;
 								        const size_t good_suffix_shift = arg.good_suffix[unmatched];
 								        index_position += bad_char_shift > good_suffix_shift
 								                              ? bad_char_shift
 								                              : good_suffix_shift;
 								    }
 								    return index_storage;
 								}
 								/**
 								 * @brief Check if pat is prefix of str.
 								 *
 								 * @param str pointer to some part of the input text (null-terminated).
 								 * @param pat the searched pattern.
 								 * @param len length of the searched pattern
 								 * @returns `true` if pat IS prefix of str.
 								 * @returns `false` if pat is NOT a prefix of str.
 								 */
 								bool is_prefix(const char* str, const char* pat, size_t len) {
 								    for (size_t i = 0; i < len; i++) {
 								        // A terminator within the first len characters means str is shorter
 								        // than the pattern; checking it here avoids a strlen() over the whole
 								        // remaining text.
 								        if (str[i] == '\0' || str[i] != pat[i]) {
 								            return false;
 								        }
 								    }
 								    return true;
 								}
 								}  // namespace boyer_moore
 								}  // namespace strings
 								/**
 								 * @brief A test case in which we search for every appearance of the word 'and'
 								 * @param text The text in which we search for appearance of the word 'and'
 								 * @returns void
 								 */
 								void and_test(const char* text) {
 								    strings::boyer_moore::pattern ands;
 								    strings::boyer_moore::init_pattern("and", ands);
 								    std::vector<size_t> indexes = strings::boyer_moore::search(text, ands);
 								    assert(indexes.size() == 2);
 								    assert(strings::boyer_moore::is_prefix(text + indexes[0], "and", 3));
 								    assert(strings::boyer_moore::is_prefix(text + indexes[1], "and", 3));
 								}
 								/**
 								 * @brief  A test case in which we search for every appearance of the word 'pat'
 								 * @param text The text in which we search for appearance of the word 'pat'
 								 * @returns void
 								 */
 								void pat_test(const char* text) {
 								    strings::boyer_moore::pattern pat;
 								    strings::boyer_moore::init_pattern("pat", pat);
 								    std::vector<size_t> indexes = strings::boyer_moore::search(text, pat);
 								    assert(indexes.size() == 6);
 								    for (const auto& currentIndex : indexes) {
 								        assert(strings::boyer_moore::is_prefix(text + currentIndex, "pat", 3));
 								    }
 								}
 								/**
 								 * @brief Self-test implementations: runs both word searches over one sample
 								 * passage and prints a confirmation once they have passed.
 								 * @returns void
 								 */
 								static void tests() {
 								    // The backslash continuations keep the leading whitespace of the
 								    // following lines inside the literal.
 								    const char* passage =
 								        "When pat Mr. and Mrs. pat Dursley woke up on the dull, gray  \
 								                            Tuesday our story starts, \
 								                there was nothing about pat the cloudy sky outside to pat suggest that\
 								                        strange and \
 								                    mysterious things would pat soon be happening all pat over the \
 								                        country.";
 								    and_test(passage);
 								    pat_test(passage);
 								    std::cout << "All tests have successfully passed!\n";
 								}
 								/**
 								 * @brief Program entry point; only runs the self-tests.
 								 * @returns 0 on exit
 								 */
 								int main() {
 								    // A failed assertion inside tests() aborts before the return is reached.
 								    tests();
 								    return 0;
 								}