algo/python/36_ac_automata/ac_automata.py
2018-12-13 21:42:50 +00:00

81 lines
2.4 KiB
Python

"""
Aho-Corasick Algorithm
Author: Wenru Dong
"""
from collections import deque
from typing import List
class ACNode:
    """One trie node used by the Aho-Corasick automaton in this module."""

    def __init__(self, data: str):
        """Create a node labelled with the character ``data``.

        Args:
            data: The character stored on this node.
        """
        # Suffix (failure) link; starts unset and is assigned by the automaton.
        self._suffix = None
        # Length of the pattern that ends on this node; -1 until one is stored.
        self._length = -1
        # Becomes True when an inserted pattern ends exactly on this node.
        self._is_ending_char = False
        # 26 child slots, indexed by ``ord(char) - ord("a")``; None = no edge.
        self._children = [None for _ in range(26)]
        self._data = data
class ACAutomata:
    """Aho-Corasick multi-pattern matcher over the lowercase letters 'a'-'z'.

    Call :meth:`insert` with the patterns, then :meth:`match` with the text
    to scan. Every occurrence found is reported by printing one line to
    standard output.

    Patterns may only contain 'a'-'z'. Any other character in the scanned
    text can never be part of a match and simply resets the scan state.
    """

    def __init__(self):
        """Create an automaton that holds only the root node."""
        # "/" is just a placeholder label; the root is never reached by an edge.
        self._root = ACNode("/")

    def _char_index(self, char: str) -> int:
        """Return the child slot for ``char``, or -1 if it has no slot.

        The range check matters: a bare ``ord(char) - ord("a")`` can be
        negative, and a negative list index silently wraps around to a
        different letter's slot instead of failing.
        """
        offset = ord(char) - ord("a")
        if 0 <= offset < len(self._root._children):
            return offset
        return -1

    def _build_suffix_link(self) -> None:
        """Recompute the suffix (failure) link of every node, breadth-first.

        Parents are processed before their children, so when a child's link
        is computed the links of all shallower nodes are already final.
        """
        root = self._root
        q = deque([root])
        while q:
            node = q.popleft()
            for index, child in enumerate(node._children):
                if child is None:
                    continue
                # Default: fall back to the root. This is the final answer for
                # the root's direct children and for nodes with no longer
                # proper suffix present in the trie.
                child._suffix = root
                if node is not root:
                    # Walk the parent's suffix chain until some node has an
                    # outgoing edge for the same character as ``child``.
                    suffix = node._suffix
                    while suffix is not None:
                        suffix_child = suffix._children[index]
                        if suffix_child is not None:
                            child._suffix = suffix_child
                            break
                        suffix = suffix._suffix
                q.append(child)

    def _insert(self, text: str) -> None:
        """Add a single pattern to the trie (suffix links are not updated).

        Args:
            text: The pattern; must consist only of 'a'-'z'.

        Raises:
            ValueError: If ``text`` contains a character outside 'a'-'z'.
                Nothing is inserted in that case.
        """
        # Validate up front so an invalid pattern never leaves a partial path.
        indices = [self._char_index(char) for char in text]
        if -1 in indices:
            raise ValueError(
                f"pattern {text!r} contains a character outside 'a'-'z'"
            )
        node = self._root
        for index, char in zip(indices, text):
            if node._children[index] is None:
                node._children[index] = ACNode(char)
            node = node._children[index]
        node._is_ending_char = True
        node._length = len(text)

    def insert(self, patterns: List[str]) -> None:
        """Add ``patterns`` to the automaton and rebuild the suffix links.

        Args:
            patterns: Patterns to add; each must consist only of 'a'-'z'.

        Raises:
            ValueError: If a pattern contains an unsupported character.
                Patterns inserted before the offending one are kept and the
                automaton stays usable.
        """
        try:
            for pattern in patterns:
                self._insert(pattern)
        finally:
            # Always relink, even on failure, so nodes added before an invalid
            # pattern are never left without a suffix link.
            self._build_suffix_link()

    def match(self, text: str) -> None:
        """Scan ``text`` and print every occurrence of an inserted pattern.

        For each occurrence one line is printed giving the start index of
        the match in ``text`` and the pattern length.

        Args:
            text: The text to scan. Characters outside 'a'-'z' are allowed;
                they cannot be part of any match.
        """
        root = self._root
        node = root
        for i, char in enumerate(text):
            index = self._char_index(char)
            if index < 0:
                # No pattern contains this character, so no match can span
                # it: restart from the root.
                node = root
                continue
            # Follow suffix links until an edge for ``char`` exists or we are
            # back at the root.
            while node._children[index] is None and node is not root:
                node = node._suffix
            next_node = node._children[index]
            node = next_node if next_node is not None else root
            # Every node on the suffix chain that ends a pattern is a match
            # ending at position ``i``.
            tmp = node
            while tmp is not root:
                if tmp._is_ending_char:
                    print(f"匹配起始下标{i - tmp._length + 1},长度{tmp._length}")
                tmp = tmp._suffix
if __name__ == "__main__":
    # Small demo: load a few patterns, then scan a sample string; match()
    # prints each occurrence it finds.
    automaton = ACAutomata()
    automaton.insert(["at", "art", "oars", "soar"])
    automaton.match("soarsoars")