2019-10-04 15:59:45 +08:00
|
|
|
"""
|
|
|
|
The Jaccard similarity coefficient is a commonly used indicator of the
|
|
|
|
similarity between two sets. Let U be a set and A and B be subsets of U,
|
|
|
|
then the Jaccard index/similarity is defined to be the ratio of the number
|
|
|
|
of elements of their intersection to the number of elements of their union.
|
|
|
|
|
|
|
|
Inspired by Wikipedia and
|
|
|
|
the book Mining of Massive Datasets [MMDS 2nd Edition, Chapter 3]
|
|
|
|
|
|
|
|
https://en.wikipedia.org/wiki/Jaccard_index
|
|
|
|
https://mmds.org
|
|
|
|
|
|
|
|
Jaccard similarity is widely used with MinHashing.
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
2022-10-16 13:25:38 +08:00
|
|
|
def jaccard_similarity(set_a, set_b, alternative_union=False):
    """
    Finds the jaccard similarity between two collections.
    Essentially, it is intersection over union.

    The alternative way to calculate this is to take union as sum of the
    number of items in the two collections. This will lead to jaccard
    similarity of a set with itself be 1/2 instead of 1.
    [MMDS 2nd Edition, Page 77]

    When both arguments are lists/tuples the elements are compared with
    ``in`` (so they do not need to be hashable) and duplicates in ``set_a``
    are counted as they occur. As soon as at least one argument is a
    set/frozenset, the other one is converted to a set and real set
    operations are used.

    Parameters:
        :set_a (set,frozenset,list,tuple): A non-empty collection
        :set_b (set,frozenset,list,tuple): A non-empty collection
        :alternative_union (boolean): If True, use sum of number of
        items as union

    Output:
        (float) The jaccard similarity between the two collections.

    Raises:
        TypeError: if either argument is not a set, frozenset, list or tuple.
        ZeroDivisionError: if the (alternative) union is empty, i.e. both
        arguments are empty.

    Examples:
    >>> set_a = {'a', 'b', 'c', 'd', 'e'}
    >>> set_b = {'c', 'd', 'e', 'f', 'h', 'i'}
    >>> jaccard_similarity(set_a, set_b)
    0.375

    >>> jaccard_similarity(set_a, set_a)
    1.0

    >>> jaccard_similarity(set_a, set_a, True)
    0.5

    >>> set_a = ['a', 'b', 'c', 'd', 'e']
    >>> set_b = ('c', 'd', 'e', 'f', 'h', 'i')
    >>> jaccard_similarity(set_a, set_b)
    0.375

    A tuple is accepted as the first argument as well:
    >>> jaccard_similarity(set_b, set_a)
    0.375

    Sets and sequences can be mixed:
    >>> jaccard_similarity({'a', 'b', 'c', 'd', 'e'}, set_b)
    0.375

    >>> jaccard_similarity("abc", {'a'})
    Traceback (most recent call last):
        ...
    TypeError: set_a and set_b must each be a set, frozenset, list or tuple
    """
    set_types = (set, frozenset)
    sequence_types = (list, tuple)

    a_is_sequence = isinstance(set_a, sequence_types)
    b_is_sequence = isinstance(set_b, sequence_types)
    a_is_set = isinstance(set_a, set_types)
    b_is_set = isinstance(set_b, set_types)

    # Fail loudly instead of silently returning None (or hitting an unbound
    # local) when an argument is of an unsupported type.
    if not (a_is_set or a_is_sequence) or not (b_is_set or b_is_sequence):
        raise TypeError(
            "set_a and set_b must each be a set, frozenset, list or tuple"
        )

    if a_is_sequence and b_is_sequence:
        # Sequence path: membership tests only, so unhashable elements work
        # and duplicates are counted exactly as they appear.
        intersection_size = sum(1 for element in set_a if element in set_b)
        if alternative_union:
            union_size = len(set_a) + len(set_b)
        else:
            # Count instead of concatenating: ``tuple + list`` is a TypeError,
            # which used to break calls whose first argument was a tuple.
            union_size = len(set_a) + sum(
                1 for element in set_b if element not in set_a
            )
        return intersection_size / union_size

    # Set path: at least one argument is a set, normalise the other one.
    left = set_a if a_is_set else set(set_a)
    right = set_b if b_is_set else set(set_b)

    intersection_size = len(left.intersection(right))
    if alternative_union:
        union_size = len(left) + len(right)
    else:
        union_size = len(left.union(right))

    return intersection_size / union_size
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Small demonstration: two letter sets sharing three of eight elements.
    first_letters = {"a", "b", "c", "d", "e"}
    second_letters = {"c", "d", "e", "f", "h", "i"}
    similarity = jaccard_similarity(first_letters, second_letters)
    print(similarity)
|