diff --git a/DIRECTORY.md b/DIRECTORY.md index 6e64f034d..74c63d144 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -52,6 +52,7 @@ * [Rsa Factorization](https://github.com/TheAlgorithms/Python/blob/master/ciphers/rsa_factorization.py) * [Rsa Key Generator](https://github.com/TheAlgorithms/Python/blob/master/ciphers/rsa_key_generator.py) * [Shuffled Shift Cipher](https://github.com/TheAlgorithms/Python/blob/master/ciphers/shuffled_shift_cipher.py) + * [Simple Keyword Cypher](https://github.com/TheAlgorithms/Python/blob/master/ciphers/simple_keyword_cypher.py) * [Simple Substitution Cipher](https://github.com/TheAlgorithms/Python/blob/master/ciphers/simple_substitution_cipher.py) * [Trafid Cipher](https://github.com/TheAlgorithms/Python/blob/master/ciphers/trafid_cipher.py) * [Transposition Cipher](https://github.com/TheAlgorithms/Python/blob/master/ciphers/transposition_cipher.py) @@ -95,6 +96,7 @@ * [Heap](https://github.com/TheAlgorithms/Python/blob/master/data_structures/heap/heap.py) * [Min Heap](https://github.com/TheAlgorithms/Python/blob/master/data_structures/heap/min_heap.py) * Linked List + * [Circular Linked List](https://github.com/TheAlgorithms/Python/blob/master/data_structures/linked_list/circular_linked_list.py) * [Doubly Linked List](https://github.com/TheAlgorithms/Python/blob/master/data_structures/linked_list/doubly_linked_list.py) * [From Sequence](https://github.com/TheAlgorithms/Python/blob/master/data_structures/linked_list/from_sequence.py) * [Is Palindrome](https://github.com/TheAlgorithms/Python/blob/master/data_structures/linked_list/is_palindrome.py) @@ -102,6 +104,7 @@ * [Singly Linked List](https://github.com/TheAlgorithms/Python/blob/master/data_structures/linked_list/singly_linked_list.py) * [Swap Nodes](https://github.com/TheAlgorithms/Python/blob/master/data_structures/linked_list/swap_nodes.py) * Queue + * [Circular Queue](https://github.com/TheAlgorithms/Python/blob/master/data_structures/queue/circular_queue.py) * [Double Ended Queue](https://github.com/TheAlgorithms/Python/blob/master/data_structures/queue/double_ended_queue.py) * [Linked Queue](https://github.com/TheAlgorithms/Python/blob/master/data_structures/queue/linked_queue.py) * [Queue On List](https://github.com/TheAlgorithms/Python/blob/master/data_structures/queue/queue_on_list.py) @@ -149,6 +152,7 @@ * [Fibonacci](https://github.com/TheAlgorithms/Python/blob/master/dynamic_programming/fibonacci.py) * [Floyd Warshall](https://github.com/TheAlgorithms/Python/blob/master/dynamic_programming/floyd_warshall.py) * [Fractional Knapsack](https://github.com/TheAlgorithms/Python/blob/master/dynamic_programming/fractional_knapsack.py) + * [Fractional Knapsack 2](https://github.com/TheAlgorithms/Python/blob/master/dynamic_programming/fractional_knapsack_2.py) * [Integer Partition](https://github.com/TheAlgorithms/Python/blob/master/dynamic_programming/integer_partition.py) * [K Means Clustering Tensorflow](https://github.com/TheAlgorithms/Python/blob/master/dynamic_programming/k_means_clustering_tensorflow.py) * [Knapsack](https://github.com/TheAlgorithms/Python/blob/master/dynamic_programming/knapsack.py) @@ -224,6 +228,7 @@ * [K Means Clust](https://github.com/TheAlgorithms/Python/blob/master/machine_learning/k_means_clust.py) * [K Nearest Neighbours](https://github.com/TheAlgorithms/Python/blob/master/machine_learning/k_nearest_neighbours.py) * [Knn Sklearn](https://github.com/TheAlgorithms/Python/blob/master/machine_learning/knn_sklearn.py) + * [Linear Discriminant Analysis](https://github.com/TheAlgorithms/Python/blob/master/machine_learning/linear_discriminant_analysis.py) * [Linear Regression](https://github.com/TheAlgorithms/Python/blob/master/machine_learning/linear_regression.py) * [Logistic Regression](https://github.com/TheAlgorithms/Python/blob/master/machine_learning/logistic_regression.py) * [Polymonial Regression](https://github.com/TheAlgorithms/Python/blob/master/machine_learning/polymonial_regression.py) @@ -521,6 +526,7 @@ * [Naive String Search](https://github.com/TheAlgorithms/Python/blob/master/strings/naive_string_search.py) * [Rabin Karp](https://github.com/TheAlgorithms/Python/blob/master/strings/rabin_karp.py) * [Remove Duplicate](https://github.com/TheAlgorithms/Python/blob/master/strings/remove_duplicate.py) + * [Reverse Words](https://github.com/TheAlgorithms/Python/blob/master/strings/reverse_words.py) * [Word Occurence](https://github.com/TheAlgorithms/Python/blob/master/strings/word_occurence.py) ## Traversals @@ -528,6 +534,7 @@ ## Web Programming * [Crawl Google Results](https://github.com/TheAlgorithms/Python/blob/master/web_programming/crawl_google_results.py) + * [Current Stock Price](https://github.com/TheAlgorithms/Python/blob/master/web_programming/current_stock_price.py) * [Fetch Bbc News](https://github.com/TheAlgorithms/Python/blob/master/web_programming/fetch_bbc_news.py) * [Fetch Github Info](https://github.com/TheAlgorithms/Python/blob/master/web_programming/fetch_github_info.py) * [Get Imdbtop](https://github.com/TheAlgorithms/Python/blob/master/web_programming/get_imdbtop.py) diff --git a/machine_learning/linear_discriminant_analysis.py b/machine_learning/linear_discriminant_analysis.py new file mode 100644 index 000000000..8a89f6f59 --- /dev/null +++ b/machine_learning/linear_discriminant_analysis.py @@ -0,0 +1,330 @@ +""" + Linear Discriminant Analysis + + + Assumptions About Data : + 1. The input variables has a gaussian distribution. + 2. The variance calculated for each input variables by class grouping is the + same. + 3. The mix of classes in your training set is representative of the problem. + + + Learning The Model : + The LDA model requires the estimation of statistics from the training data : + 1. Mean of each input value for each class. + 2. Probability of an instance belong to each class. + 3. Covariance for the input data for each class + + Calculate the class means : + mean(x) = 1/n ( for i = 1 to i = n --> sum(xi)) + + Calculate the class probabilities : + P(y = 0) = count(y = 0) / (count(y = 0) + count(y = 1)) + P(y = 1) = count(y = 1) / (count(y = 0) + count(y = 1)) + + Calculate the variance : + We can calculate the variance for dataset in two steps : + 1. Calculate the squared difference for each input variable from the + group mean. + 2. Calculate the mean of the squared difference. + ------------------------------------------------ + Squared_Difference = (x - mean(k)) ** 2 + Variance = (1 / (count(x) - count(classes))) * + (for i = 1 to i = n --> sum(Squared_Difference(xi))) + + Making Predictions : + discriminant(x) = x * (mean / variance) - + ((mean ** 2) / (2 * variance)) + Ln(probability) + --------------------------------------------------------------------------- + After calculating the discriminant value for each class, the class with the + largest discriminant value is taken as the prediction. + + Author: @EverLookNeverSee +""" + +from math import log +from os import name, system +from random import gauss + + +# Make a training dataset drawn from a gaussian distribution +def gaussian_distribution(mean: float, std_dev: float, instance_count: int) -> list: + """ + Generate gaussian distribution instances based-on given mean and standard deviation + :param mean: mean value of class + :param std_dev: value of standard deviation entered by usr or default value of it + :param instance_count: instance number of class + :return: a list containing generated values based-on given mean, std_dev and + instance_count + """ + return [gauss(mean, std_dev) for _ in range(instance_count)] + + +# Make corresponding Y flags to detecting classes +def y_generator(class_count: int, instance_count: list) -> list: + """ + Generate y values for corresponding classes + :param class_count: Number of classes(data groupings) in dataset + :param instance_count: number of instances in class + :return: corresponding values for data groupings in dataset + """ + + return [k for k in range(class_count) for _ in range(instance_count[k])] + + +# Calculate the class means +def calculate_mean(instance_count: int, items: list) -> float: + """ + Calculate given class mean + :param instance_count: Number of instances in class + :param items: items that related to specific class(data grouping) + :return: calculated actual mean of considered class + """ + # the sum of all items divided by number of instances + return sum(items) / instance_count + + +# Calculate the class probabilities +def calculate_probabilities(instance_count: int, total_count: int) -> float: + """ + Calculate the probability that a given instance will belong to which class + :param instance_count: number of instances in class + :param total_count: the number of all instances + :return: value of probability for considered class + """ + # number of instances in specific class divided by number of all instances + return instance_count / total_count + + +# Calculate the variance +def calculate_variance(items: list, means: list, total_count: int) -> float: + """ + Calculate the variance + :param items: a list containing all items(gaussian distribution of all classes) + :param means: a list containing real mean values of each class + :param total_count: the number of all instances + :return: calculated variance for considered dataset + """ + squared_diff = [] # An empty list to store all squared differences + # iterate over number of elements in items + for i in range(len(items)): + # for loop iterates over number of elements in inner layer of items + for j in range(len(items[i])): + # appending squared differences to 'squared_diff' list + squared_diff.append((items[i][j] - means[i]) ** 2) + + # one divided by (the number of all instances - number of classes) multiplied by + # sum of all squared differences + n_classes = len(means) # Number of classes in dataset + return 1 / (total_count - n_classes) * sum(squared_diff) + + +# Making predictions +def predict_y_values( + x_items: list, means: list, variance: float, probabilities: list +) -> list: + """ This function predicts new indexes(groups for our data) + :param x_items: a list containing all items(gaussian distribution of all classes) + :param means: a list containing real mean values of each class + :param variance: calculated value of variance by calculate_variance function + :param probabilities: a list containing all probabilities of classes + :return: a list containing predicted Y values + """ + # An empty list to store generated discriminant values of all items in dataset for + # each class + results = [] + # for loop iterates over number of elements in list + for i in range(len(x_items)): + # for loop iterates over number of inner items of each element + for j in range(len(x_items[i])): + temp = [] # to store all discriminant values of each item as a list + # for loop iterates over number of classes we have in our dataset + for k in range(len(x_items)): + # appending values of discriminants for each class to 'temp' list + temp.append( + x_items[i][j] * (means[k] / variance) + - (means[k] ** 2 / (2 * variance)) + + log(probabilities[k]) + ) + # appending discriminant values of each item to 'results' list + results.append(temp) + print("Generated Discriminants: \n", results) + return [l.index(max(l)) for l in results] + + +# Calculating Accuracy +def accuracy(actual_y: list, predicted_y: list) -> float: + """ + Calculate the value of accuracy based-on predictions + :param actual_y:a list containing initial Y values generated by 'y_generator' + function + :param predicted_y: a list containing predicted Y values generated by + 'predict_y_values' function + :return: percentage of accuracy + """ + # iterate over one element of each list at a time (zip mode) + # prediction is correct if actual Y value equals to predicted Y value + correct = sum(1 for i, j in zip(actual_y, predicted_y) if i == j) + # percentage of accuracy equals to number of correct predictions divided by number + # of all data and multiplied by 100 + return (correct / len(actual_y)) * 100 + + +# Main Function +def main(): + """ This function starts execution phase """ + while True: + print(" Linear Discriminant Analysis ".center(100, "*")) + print("*" * 100, "\n") + print("First of all we should specify the number of classes that") + print("we want to generate as training dataset") + # Trying to get number of classes + n_classes = 0 + while True: + try: + user_input = int( + input("Enter the number of classes (Data Groupings): ").strip() + ) + if user_input > 0: + n_classes = user_input + break + else: + print( + f"Your entered value is {user_input} , Number of classes " + f"should be positive!" + ) + continue + except ValueError: + print("Your entered value is not numerical!") + + print("-" * 100) + + std_dev = 1.0 # Default value for standard deviation of dataset + # Trying to get the value of standard deviation + while True: + try: + user_sd = float( + input( + "Enter the value of standard deviation" + "(Default value is 1.0 for all classes): " + ).strip() + or "1.0" + ) + if user_sd >= 0.0: + std_dev = user_sd + break + else: + print( + f"Your entered value is {user_sd}, Standard deviation should " + f"not be negative!" + ) + continue + except ValueError: + print("Your entered value is not numerical!") + + print("-" * 100) + + # Trying to get number of instances in classes and theirs means to generate + # dataset + counts = [] # An empty list to store instance counts of classes in dataset + for i in range(n_classes): + while True: + try: + user_count = int( + input(f"Enter The number of instances for class_{i+1}: ") + ) + if user_count > 0: + counts.append(user_count) + break + else: + print( + f"Your entered value is {user_count}, Number of " + f"instances should be positive!" + ) + continue + except ValueError: + print("Your entered value is not numerical!") + print("-" * 100) + + # An empty list to store values of user-entered means of classes + user_means = [] + for a in range(n_classes): + while True: + try: + user_mean = float( + input(f"Enter the value of mean for class_{a+1}: ") + ) + if isinstance(user_mean, float): + user_means.append(user_mean) + break + print(f"You entered an invalid value: {user_mean}") + except ValueError: + print("Your entered value is not numerical!") + print("-" * 100) + + print("Standard deviation: ", std_dev) + # print out the number of instances in classes in separated line + for i, count in enumerate(counts, 1): + print(f"Number of instances in class_{i} is: {count}") + print("-" * 100) + + # print out mean values of classes separated line + for i, user_mean in enumerate(user_means, 1): + print(f"Mean of class_{i} is: {user_mean}") + print("-" * 100) + + # Generating training dataset drawn from gaussian distribution + x = [ + gaussian_distribution(user_means[j], std_dev, counts[j]) + for j in range(n_classes) + ] + print("Generated Normal Distribution: \n", x) + print("-" * 100) + + # Generating Ys to detecting corresponding classes + y = y_generator(n_classes, counts) + print("Generated Corresponding Ys: \n", y) + print("-" * 100) + + # Calculating the value of actual mean for each class + actual_means = [calculate_mean(counts[k], x[k]) for k in range(n_classes)] + # for loop iterates over number of elements in 'actual_means' list and print + # out them in separated line + for i, actual_mean in enumerate(actual_means, 1): + print(f"Actual(Real) mean of class_{i} is: {actual_mean}") + print("-" * 100) + + # Calculating the value of probabilities for each class + # An empty list to store values of probabilities for each class + probabilities = ( + calculate_probabilities(counts[i], sum(counts)) for i in range(n_classes) + ) + # for loop iterates over number of elements in 'probabilities' list and print + # out them in separated line + for i, probability in enumerate(probabilities, 1): + print("Probability of class_{} is: {}".format(i, probability)) + print("-" * 100) + + # Calculating the values of variance for each class + variance = calculate_variance(x, actual_means, sum(counts)) + print("Variance: ", variance) + print("-" * 100) + + # Predicting Y values + # storing predicted Y values in 'pre_indexes' variable + pre_indexes = predict_y_values(x, actual_means, variance, probabilities) + print("-" * 100) + + # Calculating Accuracy of the model + print(f"Accuracy: {accuracy(y, pre_indexes)}") + print("-" * 100) + print(" DONE ".center(100, "+")) + + if input("Press any key to restart or 'q' for quit: ").strip().lower() == "q": + print("\n" + "GoodBye!".center(100, "-") + "\n") + break + system("cls" if name == "nt" else "clear") + + +if __name__ == "__main__": + main()