algo/python/33_bm/bm.py
2018-12-08 11:05:43 +00:00

77 lines
2.3 KiB
Python

"""
Boyer-Moore string-search algorithm.
Author: Wenru Dong
"""
from typing import List, Tuple
SIZE = 256
def _generate_bad_character_table(pattern: str) -> List[int]:
bc = [-1] * SIZE
for i, char in enumerate(pattern):
bc[ord(char)] = i
return bc
def _generate_good_suffix_table(pattern: str) -> Tuple[List[bool], List[int]]:
m = len(pattern)
# prefix[k] records whether the last k-character suffix of pattern
# can match with the first k-character prefix of pattern.
# suffix[k] records the starting index of the last substring of
# pattern that can match with the last k-character suffix of pattern.
prefix, suffix = [False] * m, [-1] * m
# For each substring patter[:i+1], we find the common suffix with
# pattern, and the starting index of this common suffix.
# This way we can re-write previous suffix[k] to record the index
# as large as possible, hence the last substring.
for i in range(m - 1):
j = i # starting index of the common suffix
k = 0 # length of the common suffix
while j >= 0 and pattern[j] == pattern[~k]:
j -= 1
k += 1
suffix[k] = j + 1
if j == -1: prefix[k] = True
return (prefix, suffix)
def _move_by_good_suffix(bad_character_index: int, suffix: List[int], prefix: List[bool]) -> int:
k = len(suffix) - 1 - bad_character_index
if suffix[k] != -1: return bad_character_index - suffix[k] + 1
# Test from k - 1
for r, can_match_prefix in enumerate(reversed(prefix[:k]), bad_character_index + 2):
if can_match_prefix: return r
return len(suffix)
def bm(s: str, pattern: str) -> int:
    """Find ``pattern`` in ``s`` using the Boyer-Moore algorithm.

    Args:
        s: The text to search in.
        pattern: The string to search for.

    Returns:
        The index in ``s`` of the first occurrence of ``pattern``, or -1 if
        ``pattern`` does not occur. An empty ``pattern`` yields 0.
    """
    bc = _generate_bad_character_table(pattern)
    prefix, suffix = _generate_good_suffix_table(pattern)
    n, m = len(s), len(pattern)
    table_size = len(bc)

    i = 0  # index in s at which the pattern is currently aligned
    while i <= n - m:
        # Compare right-to-left; j stops at the mismatching pattern index,
        # or drops below 0 when the whole pattern matched.
        j = m - 1
        while j >= 0 and s[i + j] == pattern[j]:
            j -= 1
        if j < 0:
            return i

        # Bad-character rule. A text character whose code point lies outside
        # the table cannot occur in the pattern, so treat it as "absent"
        # (-1) instead of indexing past the end of the table.
        code_point = ord(s[i + j])
        last_seen = bc[code_point] if code_point < table_size else -1
        # x may be <= 0 when the last occurrence lies to the right of j;
        # that only happens when j < m - 1, where the good-suffix shift
        # below supplies a positive step.
        x = j - last_seen

        # Good-suffix rule, only applicable when at least one character
        # matched before the mismatch.
        y = 0
        if j < m - 1:
            y = _move_by_good_suffix(j, suffix, prefix)

        i += max(x, y)
    return -1
if __name__ == "__main__":
    # Demo 1: print the index at which the pattern is found.
    print(bm("Here is a simple example", "example"))

    # Demo 2: cross-check the result against the built-in str.find.
    s, pattern = "abcdcccdc", "cccd"
    expected = s.find(pattern)
    print(expected == bm(s, pattern))