added idf-smooth (#2174)

* added idf-smooth

* added idf-smooth

* added idf-smooth
This commit is contained in:
Shubham Shaswat 2020-09-02 23:03:12 +05:30 committed by GitHub
parent e92e433dbe
commit a1d1a44f51
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -83,16 +83,17 @@ the third document in the corpus.")
return (len([doc for doc in docs if term in doc]), len(docs)) return (len([doc for doc in docs if term in doc]), len(docs))
def inverse_document_frequency(df: int, N: int) -> float: def inverse_document_frequency(df: int, N: int, smoothing=False) -> float:
""" """
Return a float denoting the importance Return a float denoting the importance
of a word. This measure of importance is of a word. This measure of importance is
calculated by log10(N/df), where N is the calculated by log10(N/df), where N is the
number of documents and df is number of documents and df is
the Document Frequency. the Document Frequency.
@params : df, the Document Frequency, and N, @params : df, the Document Frequency, N,
the number of documents in the corpus. the number of documents in the corpus and
@returns : log10(N/df) smoothing, if True return the idf-smooth
@returns : log10(N/df) or 1+log10(N/(1+df))
@examples : @examples :
>>> inverse_document_frequency(3, 0) >>> inverse_document_frequency(3, 0)
Traceback (most recent call last): Traceback (most recent call last):
@@ -104,7 +105,14 @@ def inverse_document_frequency(df: int, N: int) -> float:
Traceback (most recent call last): Traceback (most recent call last):
... ...
ZeroDivisionError: df must be > 0 ZeroDivisionError: df must be > 0
>>> inverse_document_frequency(0, 3,True)
1.477
""" """
if smoothing:
if N == 0:
raise ValueError("log10(0) is undefined.")
return round(1 + log10(N / (1 + df)), 3)
if df == 0: if df == 0:
raise ZeroDivisionError("df must be > 0") raise ZeroDivisionError("df must be > 0")
elif N == 0: elif N == 0: