From a1d1a44f515b5769136c90342bc1955e3bc8a26e Mon Sep 17 00:00:00 2001 From: Shubham Shaswat Date: Wed, 2 Sep 2020 23:03:12 +0530 Subject: [PATCH] added idf-smooth (#2174) * added idf-smooth * added idf-smooth * added idf-smooth --- machine_learning/word_frequency_functions.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index e9e9e644b..9cf7b694c 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -83,16 +83,17 @@ the third document in the corpus.") return (len([doc for doc in docs if term in doc]), len(docs)) -def inverse_document_frequency(df: int, N: int) -> float: +def inverse_document_frequency(df: int, N: int, smoothing=False) -> float: """ Return an integer denoting the importance of a word. This measure of importance is calculated by log10(N/df), where N is the number of documents and df is the Document Frequency. - @params : df, the Document Frequency, and N, - the number of documents in the corpus. - @returns : log10(N/df) + @params : df, the Document Frequency, N, + the number of documents in the corpus and + smoothing, if True return the idf-smooth + @returns : log10(N/df) or 1+log10(N/(1+df)) @examples : >>> inverse_document_frequency(3, 0) Traceback (most recent call last): @@ -104,7 +105,14 @@ def inverse_document_frequency(df: int, N: int) -> float: Traceback (most recent call last): ... ZeroDivisionError: df must be > 0 + >>> inverse_document_frequency(0, 3,True) + 1.477 """ + if smoothing: + if N == 0: + raise ValueError("log10(0) is undefined.") + return round(1 + log10(N / (1 + df)), 3) + if df == 0: raise ZeroDivisionError("df must be > 0") elif N == 0: