#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Aho-Corasick multi-pattern matching built on the ``trie_`` module.

This is the content of ``python/36_ac_automata/ac_automata_.py``. It relies
on two ``Node`` methods that the same change adds to
``python/35_trie/trie_.py``:

* ``Node._insert_child(node)`` -- inserts an already constructed node at the
  index given by ``_find_insert_idx(ord(node.data))``; ``Node.insert_child(c)``
  delegates to it with ``Node(c)``.
* ``Node.has_child(c)`` -- ``True`` when ``get_child(c)`` is not ``None``.
"""

from queue import Queue

from trie_ import Node, Trie


class ACNode(Node):
    """Trie node carrying the extra state needed by the automaton."""

    def __init__(self, c: str):
        super().__init__(c)
        # Failure pointer; assigned by build_failure_pointer().
        self.fail = None
        # Number of characters on the path from the root to this node;
        # assigned by build_failure_pointer() (root stays 0).
        self.length = 0

    def insert_child(self, c: str):
        """Insert a child for ``c`` as an ``ACNode`` instead of a plain ``Node``."""
        self._insert_child(ACNode(c))


class ACTrie(Trie):
    """Trie whose root (and therefore every inserted node) is an ``ACNode``."""

    def __init__(self):
        # The base initializer is not invoked; only the root is set up here.
        # NOTE(review): confirm Trie.__init__ establishes no other state.
        self.root = ACNode(None)


def ac_automata(main: str, ac_trie: ACTrie) -> list:
    """Find every occurrence of the patterns stored in ``ac_trie`` in ``main``.

    The failure pointers of ``ac_trie`` are rebuilt on every call.

    :param main: text to scan
    :param ac_trie: trie holding the patterns
    :return: list of ``(start, end)`` index pairs into ``main``, both
        inclusive, in the order the matches are found while scanning
    """
    root = ac_trie.root
    build_failure_pointer(ac_trie)

    ret = []
    p = root
    for i, c in enumerate(main):
        # Fall back along failure pointers until a state accepts c or we
        # are back at the root. One lookup per state visited.
        child = p.get_child(c)
        while child is None and p is not root:
            p = p.fail
            child = p.get_child(c)

        if child is None:
            # p is the root and nothing starts with c.
            continue

        # c matched: every node on the failure chain of the new state that
        # ends a pattern is a match ending at position i.
        q = child
        while q is not root:
            if q.is_ending_char:
                ret.append((i - q.length + 1, i))
            q = q.fail
        p = child

    return ret


def build_failure_pointer(ac_trie: ACTrie) -> None:
    """Assign ``fail`` and ``length`` on every node of ``ac_trie``.

    Nodes are visited breadth-first from the root, so the failure pointer of
    a parent is already set when its children are processed.

    :param ac_trie: trie to annotate in place
    :return: None
    """
    root = ac_trie.root
    root.fail = None

    node_queue = Queue()
    node_queue.put(root)
    while not node_queue.empty():
        p = node_queue.get()
        for pc in p.children:
            pc.length = p.length + 1
            if p is root:
                # First-level nodes always fall back to the root.
                pc.fail = root
            else:
                # Same idea as KMP: follow the parent's failure chain until a
                # state has a child for pc.data, or the root is reached.
                q = p.fail
                target = q.get_child(pc.data)
                while target is None and q is not root:
                    q = q.fail
                    target = q.get_child(pc.data)
                pc.fail = root if target is None else target
            node_queue.put(pc)


def _mask_ranges(text: str, ranges: list) -> str:
    """Return ``text`` with each inclusive ``(start, end)`` range replaced by ``*``.

    Masking is done by position, so overlapping ranges and text that already
    contains ``*`` are handled correctly.
    """
    chars = list(text)
    for start, end in ranges:
        chars[start:end + 1] = '*' * (end + 1 - start)
    return ''.join(chars)


if __name__ == '__main__':
    ac_trie = ACTrie()
    ac_trie.gen_tree(['fuck', 'shit', 'TMD', '傻叉'])

    print('--- ac automata ---')
    m_str = 'fuck you, what is that shit, TMD你就是个傻叉傻叉傻叉叉'
    print('original str : {}'.format(m_str))

    filter_range_list = ac_automata(m_str, ac_trie)
    str_filtered = _mask_ranges(m_str, filter_range_list)

    print('after filtered: {}'.format(str_filtered))