Skip to content

Commit 781e8b4

Browse files
committed
feat(strings): add suffix automaton implementation
1 parent c79034c commit 781e8b4

File tree

1 file changed

+129
-0
lines changed

1 file changed

+129
-0
lines changed

strings/suffix_automaton.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""
2+
Suffix Automaton
3+
----------------
4+
A suffix automaton (SAM) is a minimal deterministic finite automaton (DFA)
5+
that accepts all suffixes of a string; every path from the root spells a substring.
6+
7+
It can be built in O(n) time and space, where n is the length of the string.
8+
Suffix automatons are useful for:
9+
- counting distinct substrings
10+
- checking substring existence
11+
- finding longest common substrings between strings
12+
13+
Reference:
14+
https://cp-algorithms.com/string/suffix-automaton.html
15+
16+
Example:
17+
>>> sa = build_suffix_automaton("ababa")
18+
>>> is_substring(sa, "aba")
19+
True
20+
>>> is_substring(sa, "abc")
21+
False
22+
>>> count_distinct_substrings(sa)
23+
9
24+
"""
25+
26+
from typing import TypedDict
27+
28+
29+
class State(TypedDict):
    """
    One node of the suffix automaton, stored as a plain dictionary.

    States refer to each other by their integer index in the automaton list.
    """

    # Length of the longest substring that ends in this state.
    length: int
    # Index of the suffix-link target; the initial state uses -1 (no link).
    link: int
    # Outgoing transitions: character -> index of the destination state.
    next: dict[str, int]
33+
34+
35+
def build_suffix_automaton(s: str) -> list[State]:
    """
    Construct the suffix automaton of ``s`` one character at a time.

    The automaton is returned as a list of states; index 0 is the initial
    state. Every state is a dictionary with the keys:
        'length' -> length of the longest substring for this state
        'link'   -> suffix link (index of another state, -1 for the root)
        'next'   -> transitions (dict: char -> state index)

    >>> sa = build_suffix_automaton("ababa")
    >>> isinstance(sa, list)
    True
    >>> all(isinstance(state, dict) for state in sa)
    True
    >>> build_suffix_automaton("")
    [{'length': 0, 'link': -1, 'next': {}}]
    >>> len(build_suffix_automaton("ab"))
    3
    """
    root: State = {"length": 0, "link": -1, "next": {}}
    sa: list[State] = [root]
    last = 0  # index of the state that represents the whole prefix read so far

    for symbol in s:
        # New state for the prefix extended by ``symbol``; its link defaults
        # to the root and is only overwritten when a better target is found.
        new_idx = len(sa)
        sa.append({"length": sa[last]["length"] + 1, "link": 0, "next": {}})

        # Follow suffix links from the previous "last" state, adding the
        # missing ``symbol`` transition, until one already exists or the
        # chain runs past the root.
        walker = last
        last = new_idx
        while walker != -1 and symbol not in sa[walker]["next"]:
            sa[walker]["next"][symbol] = new_idx
            walker = sa[walker]["link"]

        if walker == -1:
            # No state on the chain had the transition: the link stays 0.
            continue

        target = sa[walker]["next"][symbol]
        boundary = sa[walker]["length"] + 1
        if sa[target]["length"] == boundary:
            # The existing transition is "solid": reuse its target directly.
            sa[new_idx]["link"] = target
            continue

        # Otherwise split ``target``: the clone keeps its transitions and
        # suffix link but takes the shorter length ``boundary``.
        clone_idx = len(sa)
        sa.append(
            {
                "length": boundary,
                "link": sa[target]["link"],
                "next": dict(sa[target]["next"]),
            }
        )
        # Redirect every transition on the rest of the chain that still
        # points at ``target`` to the clone.
        while walker != -1 and sa[walker]["next"].get(symbol) == target:
            sa[walker]["next"][symbol] = clone_idx
            walker = sa[walker]["link"]
        sa[target]["link"] = clone_idx
        sa[new_idx]["link"] = clone_idx

    return sa
82+
83+
84+
def is_substring(sa: list[State], substring: str) -> bool:
    """
    Report whether ``substring`` can be read from the initial state of ``sa``.

    The characters are followed one by one through the 'next' transitions,
    starting at state 0; a missing transition means the text does not occur.
    An empty ``substring`` is always accepted.

    >>> sa = build_suffix_automaton("ababa")
    >>> is_substring(sa, "aba")
    True
    >>> is_substring(sa, "bab")
    True
    >>> is_substring(sa, "abc")
    False
    >>> is_substring(sa, "")
    True
    """
    node = 0
    for symbol in substring:
        following = sa[node]["next"].get(symbol)
        if following is None:
            return False
        node = following
    return True
102+
103+
104+
def count_distinct_substrings(sa: list[State]) -> int:
    """
    Count the distinct non-empty substrings of the string behind ``sa``.

    Every state other than the initial one contributes
        length[v] - length[link[v]]
    substrings, and the result is the sum of those contributions.

    >>> sa = build_suffix_automaton("ababa")
    >>> count_distinct_substrings(sa)
    9
    >>> count_distinct_substrings(build_suffix_automaton("aaaa"))
    4
    >>> count_distinct_substrings(build_suffix_automaton("abc"))
    6
    """
    # Skip index 0: the initial state has link -1 and stands for no substring.
    return sum(
        state["length"] - sa[state["link"]]["length"] for state in sa[1:]
    )
124+
125+
126+
# Run the doctests embedded in this module when it is executed as a script.
if __name__ == "__main__":
    import doctest

    doctest.testmod()

0 commit comments

Comments
 (0)