From 4acc28ba55d6bd084dfefbb51bd8de6ac7e894ce Mon Sep 17 00:00:00 2001 From: Maxim R <49735721+mrmaxguns@users.noreply.github.com> Date: Wed, 6 May 2020 12:42:18 -0500 Subject: [PATCH] Added new algorithm: cracking caesar cipher with the chi-squared test (#1950) * added decrypt_caesar_with_chi_squared.py and ran all checks * Updated default parameters Removed mistake with mutable default arguments Co-authored-by: Christian Clauss * Updated handling for optional arguments Co-authored-by: Christian Clauss * Changed return statement to tuple Made function return a tuple instead of a list * Added more doctests * Fixed spelling mistakes * black . - reformatted decrypt_caesar_with_chi_squared.py * Updated if statements to fit the updated code * Minimized amount of lines in the code. Co-authored-by: Christian Clauss --- ciphers/decrypt_caesar_with_chi_squared.py | 228 +++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 ciphers/decrypt_caesar_with_chi_squared.py diff --git a/ciphers/decrypt_caesar_with_chi_squared.py b/ciphers/decrypt_caesar_with_chi_squared.py new file mode 100644 index 000000000..3c37631c7 --- /dev/null +++ b/ciphers/decrypt_caesar_with_chi_squared.py @@ -0,0 +1,228 @@ +def decrypt_caesar_with_chi_squared( + ciphertext: str, + cipher_alphabet=None, + frequencies_dict=None, + case_sensetive: bool = False, +) -> list: + """ + Basic Usage + =========== + Arguments: + * ciphertext (str): the text to decode (encoded with the caesar cipher) + + Optional Arguments: + * cipher_alphabet (list): the alphabet used for the cipher (each letter is + a string separated by commas) + * frequencies_dict (dict): a dictionary of word frequencies where keys are + the letters and values are a percentage representation of the frequency as + a decimal/float + * case_sensetive (bool): a boolean value: True if the case matters during + decryption, False if it doesn't + + Returns: + * A tuple in the form of: + ( + most_likely_cipher, + most_likely_cipher_chi_squared_value, + decoded_most_likely_cipher + ) + + where... + - most_likely_cipher is an integer representing the shift of the smallest + chi-squared statistic (most likely key) + - most_likely_cipher_chi_squared_value is a float representing the + chi-squared statistic of the most likely shift + - decoded_most_likely_cipher is a string with the decoded cipher + (decoded by the most_likely_cipher key) + + + The Chi-squared test + ==================== + + The caesar cipher + ----------------- + The caesar cipher is a very insecure encryption algorithm, however it has + been used since Julius Caesar. The cipher is a simple substitution cipher + where each character in the plain text is replaced by a character in the + alphabet a certain number of characters after the original character. The + number of characters away is called the shift or key. For example: + + Plain text: hello + Key: 1 + Cipher text: ifmmp + (each letter in hello has been shifted one to the right in the eng. alphabet) + + As you can imagine, this doesn't provide lots of security. In fact + decrypting ciphertext by brute-force is extremely easy even by hand. However + one way to do that is the chi-squared test. + + The chi-squared test + ------------------- + Each letter in the english alphabet has a frequency, or the amount of times + it shows up compared to other letters (usually expressed as a decimal + representing the percentage likelihood). The most common letter in the + english language is "e" with a frequency of 0.11162 or 11.162%. The test is + completed in the following fashion. + + 1. The ciphertext is decoded in a brute force way (every combination of the + 26 possible combinations) + 2. For every combination, for each letter in the combination, the average + amount of times the letter should appear the message is calculated by + multiplying the total number of characters by the frequency of the letter + + For example: + In a message of 100 characters, e should appear around 11.162 times. + + 3. Then, to calculate the margin of error (the amount of times the letter + SHOULD appear with the amount of times the letter DOES appear), we use + the chi-squared test. The following formula is used: + + Let: + - n be the number of times the letter actually appears + - p be the predicted value of the number of times the letter should + appear (see #2) + - let v be the chi-squared test result (referred to here as chi-squared + value/statistic) + + (n - p)^2 + --------- = v + p + + 4. Each chi squared value for each letter is then added up to the total. + The total is the chi-squared statistic for that encryption key. + 5. The encryption key with the lowest chi-squared value is the most likely + to be the decoded answer. + + Further Reading + ================ + + * http://practicalcryptography.com/cryptanalysis/text-characterisation/chi-squared-statistic/ + * https://en.wikipedia.org/wiki/Letter_frequency + * https://en.wikipedia.org/wiki/Chi-squared_test + * https://en.m.wikipedia.org/wiki/Caesar_cipher + + Doctests + ======== + >>> decrypt_caesar_with_chi_squared('dof pz aol jhlzhy jpwoly zv wvwbshy? pa pz avv lhzf av jyhjr!') + (7, 3129.228005747531, 'why is the caesar cipher so popular? it is too easy to crack!') + + >>> decrypt_caesar_with_chi_squared('crybd cdbsxq') + (10, 233.35343938980898, 'short string') + + >>> decrypt_caesar_with_chi_squared(12) + Traceback (most recent call last): + AttributeError: 'int' object has no attribute 'lower' + """ + alphabet_letters = cipher_alphabet or [chr(i) for i in range(97, 123)] + frequencies_dict = frequencies_dict or {} + + if frequencies_dict == {}: + # Frequencies of letters in the english language (how much they show up) + frequencies = { + "a": 0.08497, + "b": 0.01492, + "c": 0.02202, + "d": 0.04253, + "e": 0.11162, + "f": 0.02228, + "g": 0.02015, + "h": 0.06094, + "i": 0.07546, + "j": 0.00153, + "k": 0.01292, + "l": 0.04025, + "m": 0.02406, + "n": 0.06749, + "o": 0.07507, + "p": 0.01929, + "q": 0.00095, + "r": 0.07587, + "s": 0.06327, + "t": 0.09356, + "u": 0.02758, + "v": 0.00978, + "w": 0.02560, + "x": 0.00150, + "y": 0.01994, + "z": 0.00077, + } + else: + # Custom frequencies dictionary + frequencies = frequencies_dict + + if not case_sensetive: + ciphertext = ciphertext.lower() + + # Chi squared statistic values + chi_squared_statistic_values = {} + + # cycle through all of the shifts + for shift in range(len(alphabet_letters)): + decrypted_with_shift = "" + + # decrypt the message with the shift + for letter in ciphertext: + try: + # Try to index the letter in the alphabet + new_key = (alphabet_letters.index(letter) - shift) % len( + alphabet_letters + ) + decrypted_with_shift += alphabet_letters[new_key] + except ValueError: + # Append the character if it isn't in the alphabet + decrypted_with_shift += letter + + chi_squared_statistic = 0 + + # Loop through each letter in the decoded message with the shift + for letter in decrypted_with_shift: + if case_sensetive: + if letter in frequencies: + # Get the amount of times the letter occurs in the message + occurrences = decrypted_with_shift.count(letter) + + # Get the excepcted amount of times the letter should appear based on letter frequencies + expected = frequencies[letter] * occurrences + + # Complete the chi squared statistic formula + chi_letter_value = ((occurrences - expected) ** 2) / expected + + # Add the margin of error to the total chi squared statistic + chi_squared_statistic += chi_letter_value + else: + if letter.lower() in frequencies: + # Get the amount of times the letter occurs in the message + occurrences = decrypted_with_shift.count(letter) + + # Get the excepcted amount of times the letter should appear based on letter frequencies + expected = frequencies[letter] * occurrences + + # Complete the chi squared statistic formula + chi_letter_value = ((occurrences - expected) ** 2) / expected + + # Add the margin of error to the total chi squared statistic + chi_squared_statistic += chi_letter_value + + # Add the data to the chi_squared_statistic_values dictionary + chi_squared_statistic_values[shift] = [ + chi_squared_statistic, + decrypted_with_shift, + ] + + # Get the most likely cipher by finding the cipher with the smallest chi squared statistic + most_likely_cipher = min( + chi_squared_statistic_values, key=chi_squared_statistic_values.get + ) + + # Get all the data from the most likely cipher (key, decoded message) + most_likely_cipher_chi_squared_value = chi_squared_statistic_values[ + most_likely_cipher + ][0] + decoded_most_likely_cipher = chi_squared_statistic_values[most_likely_cipher][1] + + # Return the data on the most likely shift + return ( + most_likely_cipher, + most_likely_cipher_chi_squared_value, + decoded_most_likely_cipher, + )