1
1
//! String processing algorithms.
2
2
use std:: cmp:: { max, min} ;
3
+ use std:: collections:: { hash_map:: Entry , HashMap , VecDeque } ;
3
4
4
- /// Data structure for Knuth-Morris-Pratt string matching against a pattern.
5
- pub struct Matcher < ' a , T > {
5
/// Prefix trie, easily augmentable by adding more fields and/or methods.
///
/// Nodes are identified by `usize` indices into `links`. Index 0 is always the
/// root (the empty word); every other node gets the next free index at the
/// moment it is first created.
#[derive(Debug, Clone)]
pub struct Trie<C: std::hash::Hash + Eq> {
    /// `links[node]` maps a symbol to the index of the child reached by
    /// following that symbol from `node`.
    links: Vec<HashMap<C, usize>>,
}

impl<C: std::hash::Hash + Eq> Default for Trie<C> {
    /// Creates an empty trie with a root node.
    fn default() -> Self {
        Self {
            links: vec![HashMap::new()],
        }
    }
}

impl<C: std::hash::Hash + Eq> Trie<C> {
    /// Inserts a word into the trie, and returns the index of its node.
    ///
    /// Inserting a word that is already present creates no new nodes and
    /// returns the existing node's index. The empty word maps to the root, 0.
    pub fn insert(&mut self, word: impl IntoIterator<Item = C>) -> usize {
        let mut node = 0;

        for ch in word {
            // Read the length before taking the entry: a new child will be
            // stored at exactly this index.
            let len = self.links.len();
            node = match self.links[node].entry(ch) {
                Entry::Occupied(entry) => *entry.get(),
                Entry::Vacant(entry) => {
                    entry.insert(len);
                    self.links.push(HashMap::new());
                    len
                }
            };
        }
        node
    }

    /// Finds a word in the trie, and returns the index of its node.
    ///
    /// Returns `None` as soon as a symbol has no matching link. Any prefix of
    /// an inserted word is found, since it has a node of its own.
    pub fn get(&self, word: impl IntoIterator<Item = C>) -> Option<usize> {
        let mut node = 0;
        for ch in word {
            node = *self.links[node].get(&ch)?;
        }
        Some(node)
    }
}
47
+
48
+ /// Single-pattern matching with the Knuth-Morris-Pratt algorithm
49
+ pub struct Matcher < ' a , C : Eq > {
6
50
/// The string pattern to search for.
7
- pub pattern : & ' a [ T ] ,
51
+ pub pattern : & ' a [ C ] ,
8
52
/// KMP match failure automaton. fail[i] is the length of the longest
9
- /// proper prefix-suffix of pattern[0... i].
53
+ /// proper prefix-suffix of pattern[0..= i].
10
54
pub fail : Vec < usize > ,
11
55
}
12
56
13
- impl < ' a , T : Eq > Matcher < ' a , T > {
57
+ impl < ' a , C : Eq > Matcher < ' a , C > {
14
58
/// Precomputes the automaton that allows linear-time string matching.
15
59
///
16
60
/// # Example
@@ -33,7 +77,7 @@ impl<'a, T: Eq> Matcher<'a, T> {
33
77
/// # Panics
34
78
///
35
79
/// Panics if pattern is empty.
36
- pub fn new ( pattern : & ' a [ T ] ) -> Self {
80
+ pub fn new ( pattern : & ' a [ C ] ) -> Self {
37
81
let mut fail = Vec :: with_capacity ( pattern. len ( ) ) ;
38
82
fail. push ( 0 ) ;
39
83
let mut len = 0 ;
@@ -49,10 +93,10 @@ impl<'a, T: Eq> Matcher<'a, T> {
49
93
Self { pattern, fail }
50
94
}
51
95
52
- /// KMP algorithm, sets matches [i] = length of longest prefix of pattern
53
- /// matching a suffix of text[0... i].
54
- pub fn kmp_match ( & self , text : & [ T ] ) -> Vec < usize > {
55
- let mut matches = Vec :: with_capacity ( text. len ( ) ) ;
96
+ /// KMP algorithm, sets match_lens [i] = length of longest prefix of pattern
97
+ /// matching a suffix of text[0..= i].
98
+ pub fn kmp_match ( & self , text : & [ C ] ) -> Vec < usize > {
99
+ let mut match_lens = Vec :: with_capacity ( text. len ( ) ) ;
56
100
let mut len = 0 ;
57
101
for ch in text {
58
102
if len == self . pattern . len ( ) {
@@ -64,9 +108,94 @@ impl<'a, T: Eq> Matcher<'a, T> {
64
108
if self . pattern [ len] == * ch {
65
109
len += 1 ;
66
110
}
67
- matches . push ( len) ;
111
+ match_lens . push ( len) ;
68
112
}
69
- matches
113
+ match_lens
114
+ }
115
+ }
116
+
117
+ /// Multi-pattern matching with the Aho-Corasick algorithm
118
+ pub struct MultiMatcher < C : std:: hash:: Hash + Eq > {
119
+ /// A prefix trie storing the string patterns to search for.
120
+ pub trie : Trie < C > ,
121
+ /// Stores which completed pattern string each node corresponds to.
122
+ pub pat_id : Vec < Option < usize > > ,
123
+ /// Aho-Corasick failure automaton. fail[i] is the node corresponding to the
124
+ /// longest prefix-suffix of the node corresponding to i.
125
+ pub fail : Vec < usize > ,
126
+ /// Shortcut to the next match along the failure chain, or to the root.
127
+ pub fast : Vec < usize > ,
128
+ }
129
+
130
+ impl < C : std:: hash:: Hash + Eq > MultiMatcher < C > {
131
+ fn next ( trie : & Trie < C > , fail : & [ usize ] , mut node : usize , ch : & C ) -> usize {
132
+ loop {
133
+ if let Some ( & child) = trie. links [ node] . get ( ch) {
134
+ return child;
135
+ } else if node == 0 {
136
+ return 0 ;
137
+ }
138
+ node = fail[ node] ;
139
+ }
140
+ }
141
+
142
+ /// Precomputes the automaton that allows linear-time string matching.
143
+ /// If there are duplicate patterns, all but one copy will be ignored.
144
+ pub fn new ( patterns : Vec < impl IntoIterator < Item = C > > ) -> Self {
145
+ let mut trie = Trie :: default ( ) ;
146
+ let pat_nodes: Vec < usize > = patterns. into_iter ( ) . map ( |pat| trie. insert ( pat) ) . collect ( ) ;
147
+
148
+ let mut pat_id = vec ! [ None ; trie. links. len( ) ] ;
149
+ for ( i, node) in pat_nodes. into_iter ( ) . enumerate ( ) {
150
+ pat_id[ node] = Some ( i) ;
151
+ }
152
+
153
+ let mut fail = vec ! [ 0 ; trie. links. len( ) ] ;
154
+ let mut fast = vec ! [ 0 ; trie. links. len( ) ] ;
155
+ let mut q: VecDeque < usize > = trie. links [ 0 ] . values ( ) . cloned ( ) . collect ( ) ;
156
+
157
+ while let Some ( node) = q. pop_front ( ) {
158
+ for ( ch, & child) in & trie. links [ node] {
159
+ let nx = Self :: next ( & trie, & fail, fail[ node] , & ch) ;
160
+ fail[ child] = nx;
161
+ fast[ child] = if pat_id[ nx] . is_some ( ) { nx } else { fast[ nx] } ;
162
+ q. push_back ( child) ;
163
+ }
164
+ }
165
+
166
+ Self {
167
+ trie,
168
+ pat_id,
169
+ fail,
170
+ fast,
171
+ }
172
+ }
173
+
174
+ /// Aho-Corasick algorithm, sets match_nodes[i] = node corresponding to
175
+ /// longest prefix of some pattern matching a suffix of text[0..=i].
176
+ pub fn ac_match ( & self , text : & [ C ] ) -> Vec < usize > {
177
+ let mut match_nodes = Vec :: with_capacity ( text. len ( ) ) ;
178
+ let mut node = 0 ;
179
+ for ch in text {
180
+ node = Self :: next ( & self . trie , & self . fail , node, & ch) ;
181
+ match_nodes. push ( node) ;
182
+ }
183
+ match_nodes
184
+ }
185
+
186
+ /// For each non-empty match, returns where in the text it ends, and the index
187
+ /// of the corresponding pattern.
188
+ pub fn get_end_pos_and_pat_id ( & self , match_nodes : & [ usize ] ) -> Vec < ( usize , usize ) > {
189
+ let mut res = vec ! [ ] ;
190
+ for ( text_pos, & ( mut node) ) in match_nodes. iter ( ) . enumerate ( ) {
191
+ while node != 0 {
192
+ if let Some ( id) = self . pat_id [ node] {
193
+ res. push ( ( text_pos + 1 , id) ) ;
194
+ }
195
+ node = self . fast [ node] ;
196
+ }
197
+ }
198
+ res
70
199
}
71
200
}
72
201
@@ -155,39 +284,6 @@ impl SuffixArray {
155
284
}
156
285
}
157
286
158
- /// Prefix trie
159
- #[ derive( Default ) ]
160
- pub struct Trie < K : std:: hash:: Hash + Eq > {
161
- count : usize ,
162
- branches : std:: collections:: HashMap < K , Trie < K > > ,
163
- }
164
-
165
- impl < K : std:: hash:: Hash + Eq + Default > Trie < K > {
166
- /// Inserts a word into the trie.
167
- pub fn insert ( & mut self , word : impl IntoIterator < Item = K > ) {
168
- let mut node = self ;
169
- node. count += 1 ;
170
-
171
- for ch in word {
172
- node = { node } . branches . entry ( ch) . or_default ( ) ;
173
- node. count += 1 ;
174
- }
175
- }
176
-
177
- /// Computes the number of inserted words that start with the given prefix.
178
- pub fn get ( & self , prefix : impl IntoIterator < Item = K > ) -> usize {
179
- let mut node = self ;
180
-
181
- for ch in prefix {
182
- match node. branches . get ( & ch) {
183
- Some ( sub) => node = sub,
184
- None => return 0 ,
185
- }
186
- }
187
- node. count
188
- }
189
- }
190
-
191
287
/// Manacher's algorithm for computing palindrome substrings in linear time.
192
288
/// pal[2*i] = odd length of palindrome centred at text[i].
193
289
/// pal[2*i+1] = even length of palindrome centred at text[i+0.5].
@@ -226,7 +322,7 @@ mod test {
226
322
use super :: * ;
227
323
228
324
#[ test]
229
- fn test_kmp ( ) {
325
+ fn test_kmp_matching ( ) {
230
326
let text = b"banana" ;
231
327
let pattern = b"ana" ;
232
328
@@ -235,6 +331,27 @@ mod test {
235
331
assert_eq ! ( matches, vec![ 0 , 1 , 2 , 3 , 2 , 3 ] ) ;
236
332
}
237
333
334
#[test]
fn test_ac_matching() {
    // The position of each word in this list is the pattern id reported below.
    let words = ["banana", "benefit", "banapple", "ban", "fit"];
    let dict: Vec<_> = words.iter().map(|word| word.bytes()).collect();
    let text = b"banana bans, apple benefits.";

    let matcher = MultiMatcher::new(dict);
    let match_nodes = matcher.ac_match(text);
    let end_pos_and_id = matcher.get_end_pos_and_pat_id(&match_nodes);

    // (exclusive end position in text, pattern id), in order of appearance.
    let expected = vec![(3, 3), (6, 0), (10, 3), (26, 1), (26, 4)];
    assert_eq!(end_pos_and_id, expected);
}
354
+
238
355
#[ test]
239
356
fn test_suffix_array ( ) {
240
357
let text1 = b"bobocel" ;
@@ -258,24 +375,6 @@ mod test {
258
375
}
259
376
}
260
377
261
- #[ test]
262
- fn test_trie ( ) {
263
- let dict = vec ! [ "banana" , "benefit" , "banapple" , "ban" ] ;
264
-
265
- let trie = dict. into_iter ( ) . fold ( Trie :: default ( ) , |mut trie, word| {
266
- Trie :: insert ( & mut trie, word. bytes ( ) ) ;
267
- trie
268
- } ) ;
269
-
270
- assert_eq ! ( trie. get( "" . bytes( ) ) , 4 ) ;
271
- assert_eq ! ( trie. get( "b" . bytes( ) ) , 4 ) ;
272
- assert_eq ! ( trie. get( "ba" . bytes( ) ) , 3 ) ;
273
- assert_eq ! ( trie. get( "ban" . bytes( ) ) , 3 ) ;
274
- assert_eq ! ( trie. get( "bana" . bytes( ) ) , 2 ) ;
275
- assert_eq ! ( trie. get( "banan" . bytes( ) ) , 1 ) ;
276
- assert_eq ! ( trie. get( "bane" . bytes( ) ) , 0 ) ;
277
- }
278
-
279
378
#[ test]
280
379
fn test_palindrome ( ) {
281
380
let text = b"banana" ;
0 commit comments