From 7309ef13707b8f2e03d894d2b457740013d13cc5 Mon Sep 17 00:00:00 2001 From: Yaniv Hollander Date: Wed, 6 Oct 2021 19:55:45 -0400 Subject: [PATCH] feat: Add the Windowed Median algorithm (#1649) * Create windowed_median.cpp * Update windowed_median.cpp * Update windowed_median.cpp * updating DIRECTORY.md * Update probability/windowed_median.cpp Co-authored-by: David Leal * Update windowed_median.cpp * Update windowed_median.cpp * Update windowed_median.cpp * Update windowed_median.cpp * Update windowed_median.cpp * Update windowed_median.cpp * Update probability/windowed_median.cpp Co-authored-by: David Leal * clang-format and clang-tidy fixes for 7634ece0 * Update probability/windowed_median.cpp Co-authored-by: David Leal * Update probability/windowed_median.cpp Co-authored-by: David Leal * Update probability/windowed_median.cpp Co-authored-by: David Leal * Update probability/windowed_median.cpp Co-authored-by: David Leal * Update probability/windowed_median.cpp Co-authored-by: David Leal * Update probability/windowed_median.cpp Co-authored-by: David Leal * Update probability/windowed_median.cpp Co-authored-by: David Leal * Update probability/windowed_median.cpp Co-authored-by: David Leal * Update windowed_median.cpp * Update windowed_median.cpp Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> Co-authored-by: David Leal --- DIRECTORY.md | 1 + probability/windowed_median.cpp | 226 ++++++++++++++++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 probability/windowed_median.cpp diff --git a/DIRECTORY.md b/DIRECTORY.md index 528fb0add..7d206c91c 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -267,6 +267,7 @@ * [Bayes Theorem](https://github.com/TheAlgorithms/C-Plus-Plus/blob/master/probability/bayes_theorem.cpp) * [Binomial Dist](https://github.com/TheAlgorithms/C-Plus-Plus/blob/master/probability/binomial_dist.cpp) * [Poisson Dist](https://github.com/TheAlgorithms/C-Plus-Plus/blob/master/probability/poisson_dist.cpp) + 
* [Windowed Median](https://github.com/TheAlgorithms/C-Plus-Plus/blob/master/probability/windowed_median.cpp) ## Range Queries * [Fenwick Tree](https://github.com/TheAlgorithms/C-Plus-Plus/blob/master/range_queries/fenwick_tree.cpp) diff --git a/probability/windowed_median.cpp b/probability/windowed_median.cpp new file mode 100644 index 000000000..8945b063e --- /dev/null +++ b/probability/windowed_median.cpp @@ -0,0 +1,226 @@ +/** + * @file + * @brief An implementation of a median calculation of a sliding window along a + * data stream + * + * @details + * Given a stream of integers, the algorithm calculates the median of a fixed size + * window at the back of the stream. The leading time complexity of this + * algorithm is O(log(N)), and it is inspired by the known algorithm to [find + * median from (infinite) data + * stream](https://www.tutorialcup.com/interview/algorithm/find-median-from-data-stream.htm), + * with the proper modifications to account for the finite window size for which + * the median is requested + * + * ### Algorithm + * The sliding window is managed by a list, which guarantees O(1) for both + * pushing and popping. Each new value is pushed to the window back, while a + * value from the front of the window is popped. In addition, the algorithm + * manages a multi-value binary search tree (BST), implemented by std::multiset. + * For each new value that is inserted into the window, it is also inserted to the + * BST. When a value is popped from the window, it is also erased from the BST. + * Both insertion and erasure to/from the BST are O(logN) in time, with N the + * size of the window. Finally, the algorithm keeps a pointer to the root of the + * BST, and updates its position whenever values are inserted or erased to/from + * BST. The root of the tree is the median! Hence, median retrieval is always + * O(1) + * + * Time complexity: O(logN). Space complexity: O(N). 
/**
 * @file
 * @brief Median of a fixed-size sliding window over a stream of integers
 *
 * @details
 * The window is stored twice: in arrival order in a std::list, so the oldest
 * value can be dropped in O(1), and in sorted order in a std::multiset, where
 * insertion and erasure are O(logN). An iterator into the multiset is kept on
 * the lower-middle element, so reading the median is O(1).
 *
 * Time complexity: O(logN) per inserted value. Space complexity: O(N).
 * N - size of window
 * @author [Yaniv Hollander](https://github.com/YanivHollander)
 */
#include <cassert>   /// for assert
#include <cmath>     /// for std::isnan - needed in testing
#include <cstdlib>   /// for std::rand - needed in testing
#include <ctime>     /// for std::time - needed in testing
#include <iterator>  /// for std::next, std::prev
#include <limits>    /// for std::numeric_limits
#include <list>      /// for std::list - used to manage sliding window
#include <set>       /// for std::multiset - used to manage multi-value sorted sliding window values
#include <vector>    /// for std::vector - needed in testing

/**
 * @namespace probability
 * @brief Probability algorithms
 */
namespace probability {
/**
 * @namespace windowed_median
 * @brief Functions for the Windowed Median algorithm implementation
 */
namespace windowed_median {
using Window = std::list<int>;
using size_type = Window::size_type;

/**
 * @class WindowedMedian
 * @brief A class to calculate the median of a leading sliding window at the
 * back of a stream of integer values.
 *
 * @details
 * Invariant maintained by every mutating member: when the window is not
 * empty, `_itMedian` designates the element at sorted index `(size - 1) / 2`
 * (the middle element for an odd size, the lower of the two middle elements
 * for an even size). When the window is empty it equals
 * `_sortedValues.cend()`.
 */
class WindowedMedian {
    using SortedValues = std::multiset<int>;

    const size_type _windowSize;  ///< sliding window size
    Window _window;               ///< values of the window in arrival order
    SortedValues _sortedValues;   ///< the same values kept in sorted order
    SortedValues::const_iterator
        _itMedian;  ///< lower-middle element of _sortedValues (see invariant)

    /**
     * @brief Locates the lower-middle element of a sorted container
     * @param values Sorted values to inspect
     * @return Iterator to sorted index `(size - 1) / 2`, or `cend()` when
     * `values` is empty
     */
    static SortedValues::const_iterator lowerMiddle(
        const SortedValues &values) {
        if (values.empty()) {
            return values.cend();
        }
        return std::next(values.cbegin(),
                         static_cast<SortedValues::difference_type>(
                             (values.size() - 1) / 2));
    }

    /**
     * @brief Averages the two middle values of an even-sized window
     * @details The sum is formed in double so that two large ints neither
     * overflow nor get rounded before being added; only the final result is
     * narrowed to float.
     * @param lower Lower of the two middle values
     * @param upper Upper of the two middle values
     * @return Arithmetic mean of both values
     */
    static float average(int lower, int upper) {
        return static_cast<float>(
            (static_cast<double>(lower) + static_cast<double>(upper)) / 2.0);
    }

    /**
     * @brief Inserts a value into the sorted values and re-centres the median
     * @param value Value to insert
     */
    void insertToSorted(int value) {
        /// O(logN). std::multiset places the new element after all elements
        /// comparing equal to it, so `value >= *_itMedian` means "inserted
        /// to the right of the median element".
        _sortedValues.insert(value);
        const auto sz = _sortedValues.size();
        if (sz == 1) {  /// First value is trivially the median
            _itMedian = _sortedValues.cbegin();
            return;
        }

        if (sz % 2 == 0) {
            /// Size became even: the lower-middle index is unchanged, so the
            /// iterator only has to step back if the new element pushed the
            /// old median one slot to the right
            if (value < *_itMedian) {
                --_itMedian;  /// O(1)
            }
        } else if (value >= *_itMedian) {
            /// Size became odd: the middle index grew by one; step forward
            /// unless the new element already shifted the old median there
            ++_itMedian;  /// O(1)
        }
    }

    /**
     * @brief Erases one occurrence of a value from the sorted values and
     * re-centres the median
     * @details The occurrence to erase is chosen explicitly so that its
     * position relative to the median element is known: the median element
     * itself when the size is odd and the value equals the median, otherwise
     * the first occurrence of the value. Does nothing if the value is not
     * stored.
     * @param value Value to erase
     */
    void eraseFromSorted(int value) {
        const auto sz = _sortedValues.size();
        if (sz == 0) {
            return;
        }
        const bool isOdd = sz % 2 != 0;

        SortedValues::const_iterator victim = _itMedian;
        if (!isOdd || value != *_itMedian) {
            victim = _sortedValues.lower_bound(value);  /// O(logN)
            if (victim == _sortedValues.cend() || *victim != value) {
                return;  /// Value is not part of the window
            }
        }

        if (sz == 1) {  /// Window becomes empty: there is no median anymore
            _sortedValues.erase(victim);
            _itMedian = _sortedValues.cend();
            return;
        }

        /// Move the median iterator BEFORE erasing, so that it never refers
        /// to the element that is about to be removed
        if (isOdd) {
            /// Victim is the median element or lies to its right: the new
            /// lower-middle element is the left neighbour
            if (value >= *_itMedian) {
                --_itMedian;  /// O(1)
            }
        } else if (value <= *_itMedian) {
            /// Victim is the first occurrence of a value not greater than the
            /// median, i.e. the median element or something to its left: the
            /// new middle element is the right neighbour
            ++_itMedian;  /// O(1)
        }
        _sortedValues.erase(victim);  /// Only invalidates `victim`
    }

 public:
    /**
     * @brief Constructs a WindowedMedian object
     * @param windowSize Sliding window size
     */
    explicit WindowedMedian(size_type windowSize)
        : _windowSize(windowSize), _itMedian(_sortedValues.cend()) {}

    /**
     * @brief Copy constructor
     * @details The median iterator is recomputed for the copied container;
     * copying it member-wise would leave it pointing into `other`.
     * @param other Object to copy
     */
    WindowedMedian(const WindowedMedian &other)
        : _windowSize(other._windowSize),
          _window(other._window),
          _sortedValues(other._sortedValues),
          _itMedian(lowerMiddle(_sortedValues)) {}

    /// Assignment is not available because the window size is constant
    WindowedMedian &operator=(const WindowedMedian &) = delete;

    /**
     * @brief Insert a new value to the stream
     * @param value New value to insert
     */
    void insert(int value) {
        _window.push_back(value);  /// O(1)
        insertToSorted(value);     /// O(logN)
        if (_window.size() > _windowSize) {
            /// Window overflowed: drop the oldest value from both containers
            eraseFromSorted(_window.front());
            _window.pop_front();  /// O(1)
        }
    }

    /**
     * @brief Gets the median of the values in the sliding window
     * @return Median of sliding window. For even window size return the
     * average between the two values in the middle. Returns NaN while the
     * window holds no value.
     */
    float getMedian() const {
        if (_sortedValues.empty()) {
            return std::numeric_limits<float>::quiet_NaN();
        }
        if (_sortedValues.size() % 2 != 0) {
            return static_cast<float>(*_itMedian);  /// O(1)
        }
        return average(*_itMedian, *std::next(_itMedian));  /// O(1)
    }

    /**
     * @brief A naive and inefficient method to obtain the median of the
     * sliding window. Used for testing!
     * @return Median of sliding window. For even window size return the
     * average between the two values in the middle. Returns NaN while the
     * window holds no value.
     */
    float getMedianNaive() const {
        if (_window.empty()) {
            return std::numeric_limits<float>::quiet_NaN();
        }
        Window sorted = _window;
        sorted.sort();  /// O(NlogN)
        const auto upperMiddle =
            std::next(sorted.cbegin(), static_cast<Window::difference_type>(
                                           sorted.size() / 2));  /// O(N)
        if (sorted.size() % 2 != 0) {
            return static_cast<float>(*upperMiddle);
        }
        return average(*std::prev(upperMiddle), *upperMiddle);
    }
};
}  // namespace windowed_median
}  // namespace probability

/**
 * @brief Self-test implementations
 * @details After every inserted value the O(1) median is compared with the
 * naive one. Two NaN results (empty window) are treated as equal.
 * @param vals Stream of values
 * @param windowSize Size of sliding window
 */
static void test(const std::vector<int> &vals, int windowSize) {
    probability::windowed_median::WindowedMedian windowedMedian(
        static_cast<probability::windowed_median::size_type>(windowSize));
    for (const auto val : vals) {
        windowedMedian.insert(val);

        /// Comparing medians: efficient function vs. naive one
        const float fast = windowedMedian.getMedian();
        const float naive = windowedMedian.getMedianNaive();
        assert(fast == naive || (std::isnan(fast) && std::isnan(naive)));
    }
}
Naive one + assert(windowedMedian.getMedian() == windowedMedian.getMedianNaive()); + } +} + +/** + * @brief Main function + * @param argc command line argument count (ignored) + * @param argv command line array of arguments (ignored) + * @returns 0 on exit + */ +int main(int argc, const char *argv[]) { + + /// A few fixed test cases + test({1, 2, 3, 4, 5, 6, 7, 8, 9}, 3); /// Array of sorted values; odd window size + test({9, 8, 7, 6, 5, 4, 3, 2, 1}, 3); /// Array of sorted values - decreasing; odd window size + test({9, 8, 7, 6, 5, 4, 5, 6}, 4); /// Even window size + test({3, 3, 3, 3, 3, 3, 3, 3, 3}, 3); /// Array with repeating values + test({3, 3, 3, 3, 7, 3, 3, 3, 3}, 3); /// Array with same values except one + test({4, 3, 3, -5, -5, 1, 3, 4, 5}, 5); /// Array that includes repeating values including negatives + + /// Array with large values - sum of few pairs exceeds MAX_INT. Window size is even - testing calculation of + /// average median between two middle values + test({470211272, 101027544, 1457850878, 1458777923, 2007237709, 823564440, + 1115438165, 1784484492, 74243042, 114807987}, 6); + + /// Random test cases + std::srand(static_cast(std::time(nullptr))); + std::vector vals; + for (int i = 8; i < 100; i++) { + const auto n = 1 + std::rand() / ((RAND_MAX + 5u) / 20); /// Array size in the range [5, 20] + auto windowSize = 1 + std::rand() / ((RAND_MAX + 3u) / 10); /// Window size in the range [3, 10] + vals.clear(); + vals.reserve(n); + for (int i = 0; i < n; i++) { + vals.push_back(rand() - RAND_MAX); /// Random array values (positive/negative) + } + test(vals, windowSize); /// Testing randomized test + } + return 0; +}