mirror of
https://hub.njuu.cf/TheAlgorithms/Python.git
synced 2023-10-11 13:06:12 +08:00
Added new algorithm: cracking caesar cipher with the chi-squared test (#1950)
* added decrypt_caesar_with_chi_squared.py and ran all checks * Updated default parameters Removed mistake with mutable default arguments Co-authored-by: Christian Clauss <cclauss@me.com> * Updated handling for optional arguments Co-authored-by: Christian Clauss <cclauss@me.com> * Changed return statement to tuple Made function return a tuple instead of a list * Added more doctests * Fixed spelling mistakes * black . - reformatted decrypt_caesar_with_chi_squared.py * Updated if statements to fit the updated code * Minimized amount of lines in the code. Co-authored-by: Christian Clauss <cclauss@me.com>
This commit is contained in:
parent
3d4ccc383a
commit
4acc28ba55
228
ciphers/decrypt_caesar_with_chi_squared.py
Normal file
228
ciphers/decrypt_caesar_with_chi_squared.py
Normal file
@ -0,0 +1,228 @@
|
||||
def decrypt_caesar_with_chi_squared(
|
||||
ciphertext: str,
|
||||
cipher_alphabet=None,
|
||||
frequencies_dict=None,
|
||||
case_sensetive: bool = False,
|
||||
) -> list:
|
||||
"""
|
||||
Basic Usage
|
||||
===========
|
||||
Arguments:
|
||||
* ciphertext (str): the text to decode (encoded with the caesar cipher)
|
||||
|
||||
Optional Arguments:
|
||||
* cipher_alphabet (list): the alphabet used for the cipher (each letter is
|
||||
a string separated by commas)
|
||||
* frequencies_dict (dict): a dictionary of word frequencies where keys are
|
||||
the letters and values are a percentage representation of the frequency as
|
||||
a decimal/float
|
||||
* case_sensetive (bool): a boolean value: True if the case matters during
|
||||
decryption, False if it doesn't
|
||||
|
||||
Returns:
|
||||
* A tuple in the form of:
|
||||
(
|
||||
most_likely_cipher,
|
||||
most_likely_cipher_chi_squared_value,
|
||||
decoded_most_likely_cipher
|
||||
)
|
||||
|
||||
where...
|
||||
- most_likely_cipher is an integer representing the shift of the smallest
|
||||
chi-squared statistic (most likely key)
|
||||
- most_likely_cipher_chi_squared_value is a float representing the
|
||||
chi-squared statistic of the most likely shift
|
||||
- decoded_most_likely_cipher is a string with the decoded cipher
|
||||
(decoded by the most_likely_cipher key)
|
||||
|
||||
|
||||
The Chi-squared test
|
||||
====================
|
||||
|
||||
The caesar cipher
|
||||
-----------------
|
||||
The caesar cipher is a very insecure encryption algorithm, however it has
|
||||
been used since Julius Caesar. The cipher is a simple substitution cipher
|
||||
where each character in the plain text is replaced by a character in the
|
||||
alphabet a certain number of characters after the original character. The
|
||||
number of characters away is called the shift or key. For example:
|
||||
|
||||
Plain text: hello
|
||||
Key: 1
|
||||
Cipher text: ifmmp
|
||||
(each letter in hello has been shifted one to the right in the eng. alphabet)
|
||||
|
||||
As you can imagine, this doesn't provide lots of security. In fact
|
||||
decrypting ciphertext by brute-force is extremely easy even by hand. However
|
||||
one way to do that is the chi-squared test.
|
||||
|
||||
The chi-squared test
|
||||
-------------------
|
||||
Each letter in the english alphabet has a frequency, or the amount of times
|
||||
it shows up compared to other letters (usually expressed as a decimal
|
||||
representing the percentage likelihood). The most common letter in the
|
||||
english language is "e" with a frequency of 0.11162 or 11.162%. The test is
|
||||
completed in the following fashion.
|
||||
|
||||
1. The ciphertext is decoded in a brute force way (every combination of the
|
||||
26 possible combinations)
|
||||
2. For every combination, for each letter in the combination, the average
|
||||
amount of times the letter should appear the message is calculated by
|
||||
multiplying the total number of characters by the frequency of the letter
|
||||
|
||||
For example:
|
||||
In a message of 100 characters, e should appear around 11.162 times.
|
||||
|
||||
3. Then, to calculate the margin of error (the amount of times the letter
|
||||
SHOULD appear with the amount of times the letter DOES appear), we use
|
||||
the chi-squared test. The following formula is used:
|
||||
|
||||
Let:
|
||||
- n be the number of times the letter actually appears
|
||||
- p be the predicted value of the number of times the letter should
|
||||
appear (see #2)
|
||||
- let v be the chi-squared test result (referred to here as chi-squared
|
||||
value/statistic)
|
||||
|
||||
(n - p)^2
|
||||
--------- = v
|
||||
p
|
||||
|
||||
4. Each chi squared value for each letter is then added up to the total.
|
||||
The total is the chi-squared statistic for that encryption key.
|
||||
5. The encryption key with the lowest chi-squared value is the most likely
|
||||
to be the decoded answer.
|
||||
|
||||
Further Reading
|
||||
================
|
||||
|
||||
* http://practicalcryptography.com/cryptanalysis/text-characterisation/chi-squared-statistic/
|
||||
* https://en.wikipedia.org/wiki/Letter_frequency
|
||||
* https://en.wikipedia.org/wiki/Chi-squared_test
|
||||
* https://en.m.wikipedia.org/wiki/Caesar_cipher
|
||||
|
||||
Doctests
|
||||
========
|
||||
>>> decrypt_caesar_with_chi_squared('dof pz aol jhlzhy jpwoly zv wvwbshy? pa pz avv lhzf av jyhjr!')
|
||||
(7, 3129.228005747531, 'why is the caesar cipher so popular? it is too easy to crack!')
|
||||
|
||||
>>> decrypt_caesar_with_chi_squared('crybd cdbsxq')
|
||||
(10, 233.35343938980898, 'short string')
|
||||
|
||||
>>> decrypt_caesar_with_chi_squared(12)
|
||||
Traceback (most recent call last):
|
||||
AttributeError: 'int' object has no attribute 'lower'
|
||||
"""
|
||||
alphabet_letters = cipher_alphabet or [chr(i) for i in range(97, 123)]
|
||||
frequencies_dict = frequencies_dict or {}
|
||||
|
||||
if frequencies_dict == {}:
|
||||
# Frequencies of letters in the english language (how much they show up)
|
||||
frequencies = {
|
||||
"a": 0.08497,
|
||||
"b": 0.01492,
|
||||
"c": 0.02202,
|
||||
"d": 0.04253,
|
||||
"e": 0.11162,
|
||||
"f": 0.02228,
|
||||
"g": 0.02015,
|
||||
"h": 0.06094,
|
||||
"i": 0.07546,
|
||||
"j": 0.00153,
|
||||
"k": 0.01292,
|
||||
"l": 0.04025,
|
||||
"m": 0.02406,
|
||||
"n": 0.06749,
|
||||
"o": 0.07507,
|
||||
"p": 0.01929,
|
||||
"q": 0.00095,
|
||||
"r": 0.07587,
|
||||
"s": 0.06327,
|
||||
"t": 0.09356,
|
||||
"u": 0.02758,
|
||||
"v": 0.00978,
|
||||
"w": 0.02560,
|
||||
"x": 0.00150,
|
||||
"y": 0.01994,
|
||||
"z": 0.00077,
|
||||
}
|
||||
else:
|
||||
# Custom frequencies dictionary
|
||||
frequencies = frequencies_dict
|
||||
|
||||
if not case_sensetive:
|
||||
ciphertext = ciphertext.lower()
|
||||
|
||||
# Chi squared statistic values
|
||||
chi_squared_statistic_values = {}
|
||||
|
||||
# cycle through all of the shifts
|
||||
for shift in range(len(alphabet_letters)):
|
||||
decrypted_with_shift = ""
|
||||
|
||||
# decrypt the message with the shift
|
||||
for letter in ciphertext:
|
||||
try:
|
||||
# Try to index the letter in the alphabet
|
||||
new_key = (alphabet_letters.index(letter) - shift) % len(
|
||||
alphabet_letters
|
||||
)
|
||||
decrypted_with_shift += alphabet_letters[new_key]
|
||||
except ValueError:
|
||||
# Append the character if it isn't in the alphabet
|
||||
decrypted_with_shift += letter
|
||||
|
||||
chi_squared_statistic = 0
|
||||
|
||||
# Loop through each letter in the decoded message with the shift
|
||||
for letter in decrypted_with_shift:
|
||||
if case_sensetive:
|
||||
if letter in frequencies:
|
||||
# Get the amount of times the letter occurs in the message
|
||||
occurrences = decrypted_with_shift.count(letter)
|
||||
|
||||
# Get the excepcted amount of times the letter should appear based on letter frequencies
|
||||
expected = frequencies[letter] * occurrences
|
||||
|
||||
# Complete the chi squared statistic formula
|
||||
chi_letter_value = ((occurrences - expected) ** 2) / expected
|
||||
|
||||
# Add the margin of error to the total chi squared statistic
|
||||
chi_squared_statistic += chi_letter_value
|
||||
else:
|
||||
if letter.lower() in frequencies:
|
||||
# Get the amount of times the letter occurs in the message
|
||||
occurrences = decrypted_with_shift.count(letter)
|
||||
|
||||
# Get the excepcted amount of times the letter should appear based on letter frequencies
|
||||
expected = frequencies[letter] * occurrences
|
||||
|
||||
# Complete the chi squared statistic formula
|
||||
chi_letter_value = ((occurrences - expected) ** 2) / expected
|
||||
|
||||
# Add the margin of error to the total chi squared statistic
|
||||
chi_squared_statistic += chi_letter_value
|
||||
|
||||
# Add the data to the chi_squared_statistic_values dictionary
|
||||
chi_squared_statistic_values[shift] = [
|
||||
chi_squared_statistic,
|
||||
decrypted_with_shift,
|
||||
]
|
||||
|
||||
# Get the most likely cipher by finding the cipher with the smallest chi squared statistic
|
||||
most_likely_cipher = min(
|
||||
chi_squared_statistic_values, key=chi_squared_statistic_values.get
|
||||
)
|
||||
|
||||
# Get all the data from the most likely cipher (key, decoded message)
|
||||
most_likely_cipher_chi_squared_value = chi_squared_statistic_values[
|
||||
most_likely_cipher
|
||||
][0]
|
||||
decoded_most_likely_cipher = chi_squared_statistic_values[most_likely_cipher][1]
|
||||
|
||||
# Return the data on the most likely shift
|
||||
return (
|
||||
most_likely_cipher,
|
||||
most_likely_cipher_chi_squared_value,
|
||||
decoded_most_likely_cipher,
|
||||
)
|
Loading…
Reference in New Issue
Block a user