TheAlgorithms-C-Plus-Plus/hashing/md5.cpp
tGautot 404e2c15b3
feat: Add MD5 Hashing Algorithm (#1595)
* clang-format and clang-tidy fixes for 89d118bb

* feat: add md5 hashing algorithm, namespace md5

* updating DIRECTORY.md

* docs: added missing docs and cleared syntax

* Added newline at the end of file

* clang-format and clang-tidy fixes for b7b32ac7

* Comments for libs, put global funcs as static

* clang-format and clang-tidy fixes for d5da4807

* docs: added additional comments

* docs: Fixed some newlines in doxygen docs

* Changed incorrect comment

* Fixed docs & cleared potential endianness problems

* Removed useless line

* Clarified how to exit interactive mode

* Better wording

* Improved interactive mode

Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
Co-authored-by: David Leal <halfpacho@gmail.com>
2021-09-24 11:35:31 -05:00

384 lines
14 KiB
C++

/**
* @file
* @author [tGautot](https://github.com/tGautot)
* @brief Simple C++ implementation of the [MD5 Hashing
* Algorithm](https://en.wikipedia.org/wiki/MD5)
* @details
* The [MD5 Algorithm](https://en.wikipedia.org/wiki/MD5) is a
* hashing algorithm which was designed in 1991 by [Ronal
* Rivest](https://en.wikipedia.org/wiki/Ron_Rivest).
*
* MD5 is one of the most used hashing algorithm there is. Some of its
* use cases are:
* 1. Providing checksum for downloaded software
* 2. Store salted password
*
* However MD5 has be know to be cryptographically weak for quite some
* time, yet it is still widely used. This weakness was exploited by the
* [Flame Malware](https://en.wikipedia.org/wiki/Flame_(malware)) in 2012
*
* ### Algorithm
* First of all, all values are expected to be in [little endian]
* (https://en.wikipedia.org/wiki/Endianness). This is especially important
* when using part of the bytestring as an integer.
*
* The first step of the algorithm is to pad the message for its length to
* be a multiple of 64 (bytes). This is done by first adding 0x80 (10000000)
* and then only zeroes until the last 8 bytes must be filled, where then the
* 64 bit size of the input will be added
*
* Once this is done, the algo breaks down this padded message
* into 64 bytes chunks. Each chunk is used for one *round*, a round
* breaks the chunk into 16 blocks of 4 bytes. During these rounds
* the algorithm will update its 128 bit state (represented by 4 ints: A,B,C,D)
* For more precisions on these operations please see the [Wikipedia
* aritcle](https://en.wikipedia.org/wiki/MD5#Algorithm).
* The signature given by MD5 is its 128 bit state once all rounds are done.
* @note This is a simple implementation for a byte string but
* some implmenetations can work on bytestream, messages of unknown length.
*/
#include <algorithm> /// Used for std::copy
#include <array> /// Used for std::array
#include <cassert> /// Used for assert
#include <cstring> /// Used for std::memcopy
#include <iostream> /// Used for IO operations
#include <string> /// Used for strings
#include <vector> /// Used for std::vector
/**
* @namespace hashing
* @brief Hashing algorithms
*/
namespace hashing {
/**
* @namespace MD5
* @brief Functions for the [MD5](https://en.wikipedia.org/wiki/MD5) algorithm
* implementation
*/
namespace md5 {
/**
* @brief Rotates the bits of a 32-bit unsigned integer
* @param n Integer to rotate
* @param rotate How many bits for the rotation
* @return uint32_t The rotated integer
*/
uint32_t leftRotate32bits(uint32_t n, std::size_t rotate) {
return (n << rotate) | (n >> (32 - rotate));
}
/**
* @brief Checks whether integers are stored as big endian or not
* @note Taken from [this](https://stackoverflow.com/a/1001373) StackOverflow
* post
* @return true IF integers are detected to work as big-endian
* @return false IF integers are detected to work as little-endian
*/
bool isBigEndian() {
union {
uint32_t i;
std::array<char, 4> c;
} bint = {0x01020304};
return bint.c[0] == 1;
}
/**
* @brief Sets 32-bit integer to little-endian if needed
* @param n Number to set to little-endian (uint32_t)
* @return uint32_t param n with binary representation as little-endian
*/
uint32_t toLittleEndian32(uint32_t n) {
if (!isBigEndian()) {
return ((n << 24) & 0xFF000000) | ((n << 8) & 0x00FF0000) |
((n >> 8) & 0x0000FF00) | ((n >> 24) & 0x000000FF);
}
// Machine works on little endian, no need to change anything
return n;
}
/**
* @brief Sets 64-bit integer to little-endian if needed
* @param n Number to set to little-endian (uint64_t)
* @return uint64_t param n with binary representation as little-endian
*/
uint64_t toLittleEndian64(uint64_t n) {
if (!isBigEndian()) {
return ((n << 56) & 0xFF00000000000000) |
((n << 40) & 0x00FF000000000000) |
((n << 24) & 0x0000FF0000000000) |
((n << 8) & 0x000000FF00000000) |
((n >> 8) & 0x00000000FF000000) |
((n >> 24) & 0x0000000000FF0000) |
((n >> 40) & 0x000000000000FF00) |
((n >> 56) & 0x00000000000000FF);
;
}
// Machine works on little endian, no need to change anything
return n;
}
/**
* @brief Transforms the 128-bit MD5 signature into a 32 char hex string
* @param sig The MD5 signature (Expected 16 bytes)
* @return std::string The hex signature
*/
std::string sig2hex(void* sig) {
const char* hexChars = "0123456789abcdef";
auto* intsig = static_cast<uint8_t*>(sig);
std::string hex = "";
for (uint8_t i = 0; i < 16; i++) {
hex.push_back(hexChars[(intsig[i] >> 4) & 0xF]);
hex.push_back(hexChars[(intsig[i]) & 0xF]);
}
return hex;
}
/**
* @brief The MD5 algorithm itself, taking in a bytestring
* @param input_bs The bytestring to hash
* @param input_size The size (in BYTES) of the input
* @return void* Pointer to the 128-bit signature
*/
void* hash_bs(const void* input_bs, uint64_t input_size) {
auto* input = static_cast<const uint8_t*>(input_bs);
// Step 0: Initial Data (Those are decided in the MD5 protocol)
// s is the shift used in the leftrotate each round
std::array<uint32_t, 64> s = {
7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20,
4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};
// K is pseudo-random values used each round
// The values can be obtained by the following python code:
/**
* @brief Values of K are pseudo-random and used to "salt" each round
* The values can be obtained by the following python code
* @code{.py}
* from math import floor, sin
*
* for i in range(64):
* print(floor(2**32 * abs(sin(i+1))))
* @endcode
*/
std::array<uint32_t, 64> K = {
3614090360, 3905402710, 606105819, 3250441966, 4118548399, 1200080426,
2821735955, 4249261313, 1770035416, 2336552879, 4294925233, 2304563134,
1804603682, 4254626195, 2792965006, 1236535329, 4129170786, 3225465664,
643717713, 3921069994, 3593408605, 38016083, 3634488961, 3889429448,
568446438, 3275163606, 4107603335, 1163531501, 2850285829, 4243563512,
1735328473, 2368359562, 4294588738, 2272392833, 1839030562, 4259657740,
2763975236, 1272893353, 4139469664, 3200236656, 681279174, 3936430074,
3572445317, 76029189, 3654602809, 3873151461, 530742520, 3299628645,
4096336452, 1126891415, 2878612391, 4237533241, 1700485571, 2399980690,
4293915773, 2240044497, 1873313359, 4264355552, 2734768916, 1309151649,
4149444226, 3174756917, 718787259, 3951481745};
// The initial 128-bit state
uint32_t a0 = 0x67452301, A = 0;
uint32_t b0 = 0xefcdab89, B = 0;
uint32_t c0 = 0x98badcfe, C = 0;
uint32_t d0 = 0x10325476, D = 0;
// Step 1: Processing the bytestring
// First compute the size the padded message will have
// so it is possible to allocate the right amount of memory
uint64_t padded_message_size = 0;
if (input_size % 64 < 56) {
padded_message_size = input_size + 64 - (input_size % 64);
} else {
padded_message_size = input_size + 128 - (input_size % 64);
}
std::vector<uint8_t> padded_message(padded_message_size);
// Beginning of the padded message is the original message
std::copy(input, input + input_size, padded_message.begin());
// Afterwards comes a single 1 bit and then only zeroes
padded_message[input_size] = 1 << 7; // 10000000
for (uint64_t i = input_size; i % 64 != 56; i++) {
if (i == input_size) {
continue; // pass first iteration
}
padded_message[i] = 0;
}
// We then have to add the 64-bit size of the message at the end
// When there is a conversion from int to bytestring or vice-versa
// We always need to make sure it is little endian
uint64_t input_bitsize_le = toLittleEndian64(input_size * 8);
for (uint8_t i = 0; i < 8; i++) {
padded_message[padded_message_size - 8 + i] =
(input_bitsize_le >> (56 - 8 * i)) & 0xFF;
}
// Already allocate memory for blocks
std::array<uint32_t, 16> blocks{};
// Rounds
for (uint64_t chunk = 0; chunk * 64 < padded_message_size; chunk++) {
// First, build the 16 32-bits blocks from the chunk
for (uint8_t bid = 0; bid < 16; bid++) {
blocks[bid] = 0;
// Having to build a 32-bit word from 4-bit words
// Add each and shift them to the left
for (uint8_t cid = 0; cid < 4; cid++) {
blocks[bid] = (blocks[bid] << 8) +
padded_message[chunk * 64 + bid * 4 + cid];
}
}
A = a0;
B = b0;
C = c0;
D = d0;
// Main "hashing" loop
for (uint8_t i = 0; i < 64; i++) {
uint32_t F = 0, g = 0;
if (i < 16) {
F = (B & C) | ((~B) & D);
g = i;
} else if (i < 32) {
F = (D & B) | ((~D) & C);
g = (5 * i + 1) % 16;
} else if (i < 48) {
F = B ^ C ^ D;
g = (3 * i + 5) % 16;
} else {
F = C ^ (B | (~D));
g = (7 * i) % 16;
}
// Update the accumulators
F += A + K[i] + toLittleEndian32(blocks[g]);
A = D;
D = C;
C = B;
B += leftRotate32bits(F, s[i]);
}
// Update the state with this chunk's hash
a0 += A;
b0 += B;
c0 += C;
d0 += D;
}
// Build signature from state
// Note, any type could be used for the signature
// uint8_t was used to make the 16 bytes obvious
// The sig needs to be little endian
auto* sig = new uint8_t[16];
for (uint8_t i = 0; i < 4; i++) {
sig[i] = (a0 >> (8 * i)) & 0xFF;
sig[i + 4] = (b0 >> (8 * i)) & 0xFF;
sig[i + 8] = (c0 >> (8 * i)) & 0xFF;
sig[i + 12] = (d0 >> (8 * i)) & 0xFF;
}
return sig;
}
/**
* @brief Converts the string to bytestring and calls the main algorithm
* @param message Plain character message to hash
* @return void* Pointer to the MD5 signature
*/
void* hash(const std::string& message) {
return hash_bs(&message[0], message.size());
}
} // namespace md5
} // namespace hashing
/**
* @brief Self-test implementations of well-known MD5 hashes
* @returns void
*/
static void test() {
// Hashes empty string and stores signature
void* sig = hashing::md5::hash("");
std::cout << "Hashing empty string" << std::endl;
// Prints signature hex representation
std::cout << hashing::md5::sig2hex(sig) << std::endl << std::endl;
// Test with cassert whether sig is correct from the expected value
assert(hashing::md5::sig2hex(sig).compare(
"d41d8cd98f00b204e9800998ecf8427e") == 0);
// Hashes "The quick brown fox jumps over the lazy dog" and stores signature
void* sig2 =
hashing::md5::hash("The quick brown fox jumps over the lazy dog");
std::cout << "Hashing The quick brown fox jumps over the lazy dog"
<< std::endl;
// Prints signature hex representation
std::cout << hashing::md5::sig2hex(sig2) << std::endl << std::endl;
// Test with cassert whether sig is correct from the expected value
assert(hashing::md5::sig2hex(sig2).compare(
"9e107d9d372bb6826bd81d3542a419d6") == 0);
// Hashes "The quick brown fox jumps over the lazy dog." (notice the
// additional period) and stores signature
void* sig3 =
hashing::md5::hash("The quick brown fox jumps over the lazy dog.");
std::cout << "Hashing "
"The quick brown fox jumps over the lazy dog."
<< std::endl;
// Prints signature hex representation
std::cout << hashing::md5::sig2hex(sig3) << std::endl << std::endl;
// Test with cassert whether sig is correct from the expected value
assert(hashing::md5::sig2hex(sig3).compare(
"e4d909c290d0fb1ca068ffaddf22cbd0") == 0);
// Hashes "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
// and stores signature
void* sig4 = hashing::md5::hash(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789");
std::cout
<< "Hashing "
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
<< std::endl;
// Prints signature hex representation
std::cout << hashing::md5::sig2hex(sig4) << std::endl << std::endl;
// Test with cassert whether sig is correct from the expected value
assert(hashing::md5::sig2hex(sig4).compare(
"d174ab98d277d9f5a5611c2c9f419d9f") == 0);
}
/**
* @brief Puts user in a loop where inputs can be given and MD5 hash will be
* computed and printed
* @returns void
*/
static void interactive() {
while (true) {
std::string input;
std::cout << "Enter a message to be hashed (Ctrl-C to exit): "
<< std::endl;
std::getline(std::cin, input);
void* sig = hashing::md5::hash(input);
std::cout << "Hash is: " << hashing::md5::sig2hex(sig) << std::endl;
while (true) {
std::cout << "Want to enter another message? (y/n) ";
std::getline(std::cin, input);
if (input.compare("y") == 0) {
break;
} else if (input.compare("n") == 0) {
return;
}
}
}
}
/**
* @brief Main function
* @returns 0 on exit
*/
int main() {
test(); // run self-test implementations
// Launch interactive mode where user can input messages and see
// their hash
interactive();
return 0;
}