@@ -7,6 +7,7 @@ use crate::{
77 prefilter:: Prefilter ,
88 primitives:: { PatternID , StateID } ,
99 search:: { Anchored , HalfMatch , Input , MatchError } ,
10+ start,
1011 } ,
1112} ;
1213
@@ -226,21 +227,50 @@ pub unsafe trait Automaton {
226227 /// ```
227228 fn next_eoi_state ( & self , current : StateID ) -> StateID ;
228229
229- /// Return the ID of the start state for this lazy DFA when executing a
230- /// forward search .
230+ /// Return the ID of the start state for this DFA for the given starting
231+ /// configuration.
231232 ///
232233 /// Unlike typical DFA implementations, the start state for DFAs in this
233234 /// crate is dependent on a few different factors:
234235 ///
235236 /// * The [`Anchored`] mode of the search. Unanchored, anchored and
236237 /// anchored searches for a specific [`PatternID`] all use different start
237238 /// states.
238- /// * The position at which the search begins, via [`Input::start`]. This
239- /// and the byte immediately preceding the start of the search (if one
240- /// exists) influence which look-behind assertions are true at the start
241- /// of the search. This in turn influences which start state is selected.
242- /// * Whether the search is a forward or reverse search. This routine can
243- /// only be used for forward searches.
239+ /// * Whether a "look-behind" byte exists. For example, the `^` anchor
240+ /// matches if and only if there is no look-behind byte.
241+ /// * The specific value of that look-behind byte. For example, a `(?m:^)`
242+ /// assertion only matches when there is either no look-behind byte, or
243+ /// when the look-behind byte is a line terminator.
244+ ///
245+ /// The [starting configuration](start::Config) provides the above
246+ /// information.
247+ ///
248+ /// This routine can be used for either forward or reverse searches.
249+ /// Although, as a convenience, if you have an [`Input`], then it may
250+ /// be more succinct to use [`Automaton::start_state_forward`] or
251+ /// [`Automaton::start_state_reverse`]. Note, for example, that the
252+ /// convenience routines return a [`MatchError`] on failure whereas this
253+ /// routine returns a [`StartError`].
254+ ///
255+ /// # Errors
256+ ///
257+ /// This may return a [`StartError`] if the search needs to give up when
258+ /// determining the start state (for example, if it sees a "quit" byte).
259+ /// This can also return an error if the given configuration contains an
260+ /// unsupported [`Anchored`] configuration.
261+ fn start_state (
262+ & self ,
263+ config : & start:: Config ,
264+ ) -> Result < StateID , StartError > ;
265+ 266+ /// Return the ID of the start state for this DFA when executing a forward
267+ /// search.
268+ ///
269+ /// This is a convenience routine for calling [`Automaton::start_state`]
270+ /// that converts the given [`Input`] to a [start
271+ /// configuration](start::Config). Additionally, if an error occurs, it is
272+ /// converted from a [`StartError`] to a [`MatchError`] using the offset
273+ /// information in the given [`Input`].
244274 ///
245275 /// # Errors
246276 ///
@@ -251,23 +281,30 @@ pub unsafe trait Automaton {
251281 fn start_state_forward (
252282 & self ,
253283 input : & Input < ' _ > ,
254- ) -> Result < StateID , MatchError > ;
284+ ) -> Result < StateID , MatchError > {
285+ let config = start:: Config :: from_input_forward ( input) ;
286+ self . start_state ( & config) . map_err ( |err| match err {
287+ StartError :: Quit { byte } => {
288+ let offset = input
289+ . start ( )
290+ . checked_sub ( 1 )
291+ . expect ( "no quit in start without look-behind" ) ;
292+ MatchError :: quit ( byte, offset)
293+ }
294+ StartError :: UnsupportedAnchored { mode } => {
295+ MatchError :: unsupported_anchored ( mode)
296+ }
297+ } )
298+ }
255299
256- /// Return the ID of the start state for this lazy DFA when executing a
257- /// reverse search.
300+ /// Return the ID of the start state for this DFA when executing a reverse
301+ /// search.
258302 ///
259- /// Unlike typical DFA implementations, the start state for DFAs in this
260- /// crate is dependent on a few different factors:
261- ///
262- /// * The [`Anchored`] mode of the search. Unanchored, anchored and
263- /// anchored searches for a specific [`PatternID`] all use different start
264- /// states.
265- /// * The position at which the search begins, via [`Input::start`]. This
266- /// and the byte immediately preceding the start of the search (if one
267- /// exists) influence which look-behind assertions are true at the start
268- /// of the search. This in turn influences which start state is selected.
269- /// * Whether the search is a forward or reverse search. This routine can
270- /// only be used for reverse searches.
303+ /// This is a convenience routine for calling [`Automaton::start_state`]
304+ /// that converts the given [`Input`] to a [start
305+ /// configuration](start::Config). Additionally, if an error occurs, it is
306+ /// converted from a [`StartError`] to a [`MatchError`] using the offset
307+ /// information in the given [`Input`].
271308 ///
272309 /// # Errors
273310 ///
@@ -278,7 +315,18 @@ pub unsafe trait Automaton {
278315 fn start_state_reverse (
279316 & self ,
280317 input : & Input < ' _ > ,
281- ) -> Result < StateID , MatchError > ;
318+ ) -> Result < StateID , MatchError > {
319+ let config = start:: Config :: from_input_reverse ( input) ;
320+ self . start_state ( & config) . map_err ( |err| match err {
321+ StartError :: Quit { byte } => {
322+ let offset = input. end ( ) ;
323+ MatchError :: quit ( byte, offset)
324+ }
325+ StartError :: UnsupportedAnchored { mode } => {
326+ MatchError :: unsupported_anchored ( mode)
327+ }
328+ } )
329+ }
282330
283331 /// If this DFA has a universal starting state for the given anchor mode
284332 /// and the DFA supports universal starting states, then this returns that
@@ -1798,6 +1846,14 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
17981846 ( * * self ) . next_eoi_state ( current)
17991847 }
18001848
1849+ #[ inline]
1850+ fn start_state (
1851+ & self ,
1852+ config : & start:: Config ,
1853+ ) -> Result < StateID , StartError > {
1854+ ( * * self ) . start_state ( config)
1855+ }
1856+ 18011857 #[ inline]
18021858 fn start_state_forward (
18031859 & self ,
@@ -2015,6 +2071,90 @@ impl OverlappingState {
20152071 }
20162072}
20172073
2074+ /// An error that can occur when computing the start state for a search.
2075+ ///
2076+ /// Computing a start state can fail for a few reasons, either based on
2077+ /// incorrect configuration or even based on whether the look-behind byte
2078+ /// triggers a quit state. Typically one does not need to handle this error
2079+ /// if you're using [`Automaton::start_state_forward`] (or its reverse
2080+ /// counterpart), as that routine automatically converts `StartError` to a
2081+ /// [`MatchError`] for you.
2082+ ///
2083+ /// This error may be returned by the [`Automaton::start_state`] routine.
2084+ ///
2085+ /// This error implements the `std::error::Error` trait when the `std` feature
2086+ /// is enabled.
2087+ ///
2088+ /// This error is marked as non-exhaustive. New variants may be added in a
2089+ /// semver compatible release.
2090+ #[ non_exhaustive]
2091+ #[ derive( Clone , Debug ) ]
2092+ pub enum StartError {
2093+ /// An error that occurs when a starting configuration's look-behind byte
2094+ /// is in this DFA's quit set.
2095+ Quit {
2096+ /// The quit byte that was found.
2097+ byte : u8 ,
2098+ } ,
2099+ /// An error that occurs when the caller requests an anchored mode that
2100+ /// isn't supported by the DFA.
2101+ UnsupportedAnchored {
2102+ /// The anchored mode given that is unsupported.
2103+ mode : Anchored ,
2104+ } ,
2105+ }
2106+ 2107+ impl StartError {
2108+ pub ( crate ) fn quit ( byte : u8 ) -> StartError {
2109+ StartError :: Quit { byte }
2110+ }
2111+ 2112+ pub ( crate ) fn unsupported_anchored ( mode : Anchored ) -> StartError {
2113+ StartError :: UnsupportedAnchored { mode }
2114+ }
2115+ }

// Only available with the `std` feature; the trait's default methods are
// sufficient, so the impl body is intentionally empty.
#[cfg(feature = "std")]
impl std::error::Error for StartError {}
2119+ 2120+ impl core:: fmt:: Display for StartError {
2121+ fn fmt ( & self , f : & mut core:: fmt:: Formatter < ' _ > ) -> core:: fmt:: Result {
2122+ match * self {
2123+ StartError :: Quit { byte } => write ! (
2124+ f,
2125+ "error computing start state because the look-behind byte \
2126+ {:?} triggered a quit state" ,
2127+ crate :: util:: escape:: DebugByte ( byte) ,
2128+ ) ,
2129+ StartError :: UnsupportedAnchored { mode : Anchored :: Yes } => {
2130+ write ! (
2131+ f,
2132+ "error computing start state because \
2133+ anchored searches are not supported or enabled"
2134+ )
2135+ }
2136+ StartError :: UnsupportedAnchored { mode : Anchored :: No } => {
2137+ write ! (
2138+ f,
2139+ "error computing start state because \
2140+ unanchored searches are not supported or enabled"
2141+ )
2142+ }
2143+ StartError :: UnsupportedAnchored {
2144+ mode : Anchored :: Pattern ( pid) ,
2145+ } => {
2146+ write ! (
2147+ f,
2148+ "error computing start state because \
2149+ anchored searches for a specific pattern ({}) \
2150+ are not supported or enabled" ,
2151+ pid. as_usize( ) ,
2152+ )
2153+ }
2154+ }
2155+ }
2156+ }
2157+ 20182158/// Runs the given overlapping `search` function (forwards or backwards) until
20192159/// a match is found whose offset does not split a codepoint.
20202160///
0 commit comments