TheAlgorithms-Python/strings/jaro_winkler.py
mateuszz0000 3357768fc3
Jaro winkler (#2041)
* Added jaro_winkler first version

* Added doctests

* Fix flake warnings

* Refactor

* Fixes bug in jaro winkler implementation

* Commit suggestions

* Missing comming suggestions

* Remove unused math module

* Import doctest

Co-authored-by: John Law <johnlaw.po@gmail.com>
2020-05-31 00:14:55 +05:30

72 lines
2.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance"""
def jaro_winkler(str1: str, str2: str) -> float:
"""
JaroWinkler distance is a string metric measuring an edit distance between two sequences.
Output value is between 0.0 and 1.0.
>>> jaro_winkler("martha", "marhta")
0.9611111111111111
>>> jaro_winkler("CRATE", "TRACE")
0.7333333333333334
>>> jaro_winkler("test", "dbdbdbdb")
0.0
>>> jaro_winkler("test", "test")
1.0
>>> jaro_winkler("hello world", "HeLLo W0rlD")
0.6363636363636364
>>> jaro_winkler("test", "")
0.0
>>> jaro_winkler("hello", "world")
0.4666666666666666
>>> jaro_winkler("hell**o", "*world")
0.4365079365079365
"""
def get_matched_characters(_str1: str, _str2: str) -> str:
matched = []
limit = min(len(_str1), len(_str2)) // 2
for i, l in enumerate(_str1):
left = int(max(0, i - limit))
right = int(min(i + limit + 1, len(_str2)))
if l in _str2[left:right]:
matched.append(l)
_str2 = f"{_str2[0:_str2.index(l)]} {_str2[_str2.index(l) + 1:]}"
return ''.join(matched)
# matching characters
matching_1 = get_matched_characters(str1, str2)
matching_2 = get_matched_characters(str2, str1)
match_count = len(matching_1)
# transposition
transpositions = len(
[(c1, c2) for c1, c2 in zip(matching_1, matching_2) if c1 != c2]
) // 2
if not match_count:
jaro = 0.0
else:
jaro = 1 / 3 * (
match_count / len(str1)
+ match_count / len(str2)
+ (match_count - transpositions) / match_count)
# common prefix up to 4 characters
prefix_len = 0
for c1, c2 in zip(str1[:4], str2[:4]):
if c1 == c2:
prefix_len += 1
else:
break
return jaro + 0.1 * prefix_len * (1 - jaro)
if __name__ == '__main__':
import doctest
doctest.testmod()
print(jaro_winkler("hello", "world"))