2021-09-07 19:37:03 +08:00
|
|
|
from __future__ import annotations
|
|
|
|
|
2019-10-22 15:42:56 +08:00
|
|
|
from collections import deque
|
|
|
|
|
|
|
|
|
|
|
|
class Automaton:
    """Multi-keyword string matcher built as a trie with failure links.

    The automaton is stored in ``self.adlist``, a list of state records.
    Each record is a dict with the keys:

    * ``"value"``       - the character on the edge leading into the state
      (``""`` for the root, which is always state ``0``);
    * ``"next_states"`` - indices of the child states;
    * ``"fail_state"``  - index of the state to fall back to on a mismatch;
    * ``"output"``      - keywords that end when this state is reached.
    """

    def __init__(self, keywords: list[str]):
        """Build the trie for *keywords* and compute its failure links."""
        self.adlist: list[dict] = []
        # State 0 is the root of the trie.
        self.adlist.append(
            {"value": "", "next_states": [], "fail_state": 0, "output": []}
        )

        for keyword in keywords:
            self.add_keyword(keyword)
        self.set_fail_transitions()

    def find_next_state(self, current_state: int, char: str) -> int | None:
        """Return the child of *current_state* reached via *char*, or ``None``.

        The children are scanned linearly and compared on their ``"value"``.
        """
        for state in self.adlist[current_state]["next_states"]:
            if char == self.adlist[state]["value"]:
                return state
        return None

    def add_keyword(self, keyword: str) -> None:
        """Insert *keyword* into the trie, creating states as needed.

        Empty keywords are ignored: they would be recorded on the root and
        produce meaningless match positions.  A keyword that is already
        present is not recorded a second time, so every match is reported
        exactly once.

        Failure links are not updated here; call ``set_fail_transitions``
        after adding keywords to an already-built automaton.
        """
        if not keyword:
            return

        current_state = 0
        for character in keyword:
            next_state = self.find_next_state(current_state, character)
            if next_state is None:
                self.adlist.append(
                    {
                        "value": character,
                        "next_states": [],
                        "fail_state": 0,
                        "output": [],
                    }
                )
                next_state = len(self.adlist) - 1
                self.adlist[current_state]["next_states"].append(next_state)
            current_state = next_state

        outputs = self.adlist[current_state]["output"]
        if keyword not in outputs:
            outputs.append(keyword)

    def set_fail_transitions(self) -> None:
        """Compute ``"fail_state"`` for every state and merge inherited outputs.

        States are visited breadth-first, so the failure target of a state
        (which is always shallower) already carries its complete output list
        when it is merged into the state.  Merging skips keywords that are
        already present, so calling this method again (for example after a
        later ``add_keyword``) does not duplicate outputs.
        """
        queue: deque = deque()
        # Children of the root always fall back to the root.
        for node in self.adlist[0]["next_states"]:
            queue.append(node)
            self.adlist[node]["fail_state"] = 0

        while queue:
            parent = queue.popleft()
            for child in self.adlist[parent]["next_states"]:
                queue.append(child)
                value = self.adlist[child]["value"]

                # Walk the parent's failure chain until a state with an
                # outgoing edge for this character is found, or the root
                # is reached.
                state = self.adlist[parent]["fail_state"]
                target = self.find_next_state(state, value)
                while target is None and state != 0:
                    state = self.adlist[state]["fail_state"]
                    target = self.find_next_state(state, value)

                fail_state = 0 if target is None else target
                self.adlist[child]["fail_state"] = fail_state

                # Every keyword ending at the failure target also ends here.
                outputs = self.adlist[child]["output"]
                for keyword in self.adlist[fail_state]["output"]:
                    if keyword not in outputs:
                        outputs.append(keyword)

    def search_in(self, string: str) -> dict[str, list[int]]:
        """Return every keyword found in *string* with its start indices.

        The result maps each matched keyword to the list of 0-based positions
        at which an occurrence starts, in increasing order.  Keywords that do
        not occur are absent from the result.

        >>> A = Automaton(["what", "hat", "ver", "er"])
        >>> A.search_in("whatever, err ... , wherever")
        {'what': [0], 'hat': [1], 'ver': [5, 25], 'er': [6, 10, 22, 26]}
        >>> Automaton(["er", "er"]).search_in("er")
        {'er': [0]}
        >>> A.search_in("")
        {}
        """
        result: dict[str, list[int]] = {}
        current_state = 0
        for index, char in enumerate(string):
            # Follow failure links until some state accepts ``char`` or we
            # are back at the root.
            next_state = self.find_next_state(current_state, char)
            while next_state is None and current_state != 0:
                current_state = self.adlist[current_state]["fail_state"]
                next_state = self.find_next_state(current_state, char)
            current_state = 0 if next_state is None else next_state

            for key in self.adlist[current_state]["output"]:
                # ``index`` is the last character of the match.
                result.setdefault(key, []).append(index - len(key) + 1)
        return result
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()
|