1763 lines
74 KiB
Rust
1763 lines
74 KiB
Rust
/*!
|
|
Provides a noncontiguous NFA implementation of Aho-Corasick.
|
|
|
|
This is a low-level API that generally only needs to be used in niche
|
|
circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
|
|
instead of a noncontiguous NFA directly. Using an `NFA` directly is typically
|
|
only necessary when one needs access to the [`Automaton`] trait implementation.
|
|
*/
|
|
|
|
use alloc::{
|
|
collections::{BTreeSet, VecDeque},
|
|
vec,
|
|
vec::Vec,
|
|
};
|
|
|
|
use crate::{
|
|
automaton::Automaton,
|
|
util::{
|
|
alphabet::{ByteClassSet, ByteClasses},
|
|
error::{BuildError, MatchError},
|
|
prefilter::{self, opposite_ascii_case, Prefilter},
|
|
primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
|
|
remapper::Remapper,
|
|
search::{Anchored, MatchKind},
|
|
special::Special,
|
|
},
|
|
};
|
|
|
|
/// A noncontiguous NFA implementation of Aho-Corasick.
|
|
///
|
|
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
|
|
/// this type directly. Using an `NFA` directly is typically only necessary
|
|
/// when one needs access to the [`Automaton`] trait implementation.
|
|
///
|
|
/// This NFA represents the "core" implementation of Aho-Corasick in this
|
|
/// crate. Namely, constructing this NFA involving building a trie and then
|
|
/// filling in the failure transitions between states, similar to what is
|
|
/// described in any standard textbook description of Aho-Corasick.
|
|
///
|
|
/// In order to minimize heap usage and to avoid additional construction costs,
|
|
/// this implementation represents the transitions of all states as distinct
|
|
/// sparse memory allocations. This is where it gets its name from. That is,
|
|
/// this NFA has no contiguous memory allocation for its transition table. Each
|
|
/// state gets its own allocation.
|
|
///
|
|
/// While the sparse representation keeps memory usage to somewhat reasonable
|
|
/// levels, it is still quite large and also results in somewhat mediocre
|
|
/// search performance. For this reason, it is almost always a good idea to
|
|
/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
|
|
/// marginally slower to build, but has higher throughput and can sometimes use
|
|
/// an order of magnitude less memory. The main reason to use a noncontiguous
|
|
/// NFA is when you need the fastest possible construction time, or when a
|
|
/// contiguous NFA does not have the desired capacity. (The total number of NFA
|
|
/// states it can have is fewer than a noncontiguous NFA.)
|
|
///
|
|
/// # Example
|
|
///
|
|
/// This example shows how to build an `NFA` directly and use it to execute
|
|
/// [`Automaton::try_find`]:
|
|
///
|
|
/// ```
|
|
/// use aho_corasick::{
|
|
/// automaton::Automaton,
|
|
/// nfa::noncontiguous::NFA,
|
|
/// Input, Match,
|
|
/// };
|
|
///
|
|
/// let patterns = &["b", "abc", "abcd"];
|
|
/// let haystack = "abcd";
|
|
///
|
|
/// let nfa = NFA::new(patterns).unwrap();
|
|
/// assert_eq!(
|
|
/// Some(Match::must(0, 1..2)),
|
|
/// nfa.try_find(&Input::new(haystack))?,
|
|
/// );
|
|
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
|
/// ```
|
|
///
|
|
/// It is also possible to implement your own version of `try_find`. See the
|
|
/// [`Automaton`] documentation for an example.
|
|
#[derive(Clone)]
|
|
pub struct NFA {
|
|
/// The match semantics built into this NFA.
|
|
match_kind: MatchKind,
|
|
/// A set of states. Each state defines its own transitions, a fail
|
|
/// transition and a set of indices corresponding to matches.
|
|
///
|
|
/// The first state is always the fail state, which is used only as a
|
|
/// sentinel. Namely, in the final NFA, no transition into the fail state
|
|
/// exists. (Well, they do, but they aren't followed. Instead, the state's
|
|
/// failure transition is followed.)
|
|
///
|
|
/// The second state (index 1) is always the dead state. Dead states are
|
|
/// in every automaton, but only used when leftmost-{first,longest} match
|
|
/// semantics are enabled. Specifically, they instruct search to stop
|
|
/// at specific points in order to report the correct match location. In
|
|
/// the standard Aho-Corasick construction, there are no transitions to
|
|
/// the dead state.
|
|
///
|
|
/// The third state (index 2) is generally intended to be the starting or
|
|
/// "root" state.
|
|
states: Vec<State>,
|
|
/// Transitions stored in a sparse representation via a linked list.
|
|
///
|
|
/// Each transition contains three pieces of information: the byte it
|
|
/// is defined for, the state it transitions to and a link to the next
|
|
/// transition in the same state (or `StateID::ZERO` if it is the last
|
|
/// transition).
|
|
///
|
|
/// The first transition for each state is determined by `State::sparse`.
|
|
///
|
|
/// Note that this contains a complete set of all transitions in this NFA,
|
|
/// including states that have a dense representation for transitions.
|
|
/// (Adding dense transitions for a state doesn't remove its sparse
|
|
/// transitions, since deleting transitions from this particular sparse
|
|
/// representation would be fairly expensive.)
|
|
sparse: Vec<Transition>,
|
|
/// Transitions stored in a dense representation.
|
|
///
|
|
/// A state has a row in this table if and only if `State::dense` is
|
|
/// not equal to `StateID::ZERO`. When not zero, there are precisely
|
|
/// `NFA::byte_classes::alphabet_len()` entries beginning at `State::dense`
|
|
/// in this table.
|
|
///
|
|
/// Generally a very small minority of states have a dense representation
|
|
/// since it uses so much memory.
|
|
dense: Vec<StateID>,
|
|
/// Matches stored in linked list for each state.
|
|
///
|
|
/// Like sparse transitions, each match has a link to the next match in the
|
|
/// state.
|
|
///
|
|
/// The first match for each state is determined by `State::matches`.
|
|
matches: Vec<Match>,
|
|
/// The length, in bytes, of each pattern in this NFA. This slice is
|
|
/// indexed by `PatternID`.
|
|
///
|
|
/// The number of entries in this vector corresponds to the total number of
|
|
/// patterns in this automaton.
|
|
pattern_lens: Vec<SmallIndex>,
|
|
/// A prefilter for quickly skipping to candidate matches, if pertinent.
|
|
prefilter: Option<Prefilter>,
|
|
/// A set of equivalence classes in terms of bytes. We compute this while
|
|
/// building the NFA, but don't use it in the NFA's states. Instead, we
|
|
/// use this for building the DFA. We store it on the NFA since it's easy
|
|
/// to compute while visiting the patterns.
|
|
byte_classes: ByteClasses,
|
|
/// The length, in bytes, of the shortest pattern in this automaton. This
|
|
/// information is useful for detecting whether an automaton matches the
|
|
/// empty string or not.
|
|
min_pattern_len: usize,
|
|
/// The length, in bytes, of the longest pattern in this automaton. This
|
|
/// information is useful for keeping correct buffer sizes when searching
|
|
/// on streams.
|
|
max_pattern_len: usize,
|
|
/// The information required to deduce which states are "special" in this
|
|
/// NFA.
|
|
///
|
|
/// Since the DEAD and FAIL states are always the first two states and
|
|
/// there are only ever two start states (which follow all of the match
|
|
/// states), it follows that we can determine whether a state is a fail,
|
|
/// dead, match or start with just a few comparisons on the ID itself:
|
|
///
|
|
/// is_dead(sid): sid == NFA::DEAD
|
|
/// is_fail(sid): sid == NFA::FAIL
|
|
/// is_match(sid): NFA::FAIL < sid && sid <= max_match_id
|
|
/// is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
|
|
///
|
|
/// Note that this only applies to the NFA after it has been constructed.
|
|
/// During construction, the start states are the first ones added and the
|
|
/// match states are inter-leaved with non-match states. Once all of the
|
|
/// states have been added, the states are shuffled such that the above
|
|
/// predicates hold.
|
|
special: Special,
|
|
}
|
|
|
|
impl NFA {
|
|
/// Create a new Aho-Corasick noncontiguous NFA using the default
|
|
/// configuration.
|
|
///
|
|
/// Use a [`Builder`] if you want to change the configuration.
|
|
pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError>
|
|
where
|
|
I: IntoIterator<Item = P>,
|
|
P: AsRef<[u8]>,
|
|
{
|
|
NFA::builder().build(patterns)
|
|
}
|
|
|
|
/// A convenience method for returning a new Aho-Corasick noncontiguous NFA
|
|
/// builder.
|
|
///
|
|
/// This usually permits one to just import the `NFA` type.
|
|
pub fn builder() -> Builder {
|
|
Builder::new()
|
|
}
|
|
}
|
|
|
|
impl NFA {
|
|
/// The DEAD state is a sentinel state like the FAIL state. The DEAD state
|
|
/// instructs any search to stop and return any currently recorded match,
|
|
/// or no match otherwise. Generally speaking, it is impossible for an
|
|
/// unanchored standard search to enter a DEAD state. But an anchored
|
|
/// search can, and so to can a leftmost search.
|
|
///
|
|
/// We put DEAD before FAIL so that DEAD is always 0. We repeat this
|
|
/// decision across the other Aho-Corasicm automata, so that DEAD
|
|
/// states there are always 0 too. It's not that we need all of the
|
|
/// implementations to agree, but rather, the contiguous NFA and the DFA
|
|
/// use a sort of "premultiplied" state identifier where the only state
|
|
/// whose ID is always known and constant is the first state. Subsequent
|
|
/// state IDs depend on how much space has already been used in the
|
|
/// transition table.
|
|
pub(crate) const DEAD: StateID = StateID::new_unchecked(0);
|
|
/// The FAIL state mostly just corresponds to the ID of any transition on a
|
|
/// state that isn't explicitly defined. When one transitions into the FAIL
|
|
/// state, one must follow the previous state's failure transition before
|
|
/// doing the next state lookup. In this way, FAIL is more of a sentinel
|
|
/// than a state that one actually transitions into. In particular, it is
|
|
/// never exposed in the `Automaton` interface.
|
|
pub(crate) const FAIL: StateID = StateID::new_unchecked(1);
|
|
|
|
/// Returns the equivalence classes of bytes found while constructing
|
|
/// this NFA.
|
|
///
|
|
/// Note that the NFA doesn't actually make use of these equivalence
|
|
/// classes. Instead, these are useful for building the DFA when desired.
|
|
pub(crate) fn byte_classes(&self) -> &ByteClasses {
|
|
&self.byte_classes
|
|
}
|
|
|
|
/// Returns a slice containing the length of each pattern in this searcher.
|
|
/// It is indexed by `PatternID` and has length `NFA::patterns_len`.
|
|
///
|
|
/// This is exposed for convenience when building a contiguous NFA. But it
|
|
/// can be reconstructed from the `Automaton` API if necessary.
|
|
pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] {
|
|
&self.pattern_lens
|
|
}
|
|
|
|
/// Returns a slice of all states in this non-contiguous NFA.
|
|
pub(crate) fn states(&self) -> &[State] {
|
|
&self.states
|
|
}
|
|
|
|
/// Returns the underlying "special" state information for this NFA.
|
|
pub(crate) fn special(&self) -> &Special {
|
|
&self.special
|
|
}
|
|
|
|
/// Swaps the states at `id1` and `id2`.
|
|
///
|
|
/// This does not update the transitions of any state to account for the
|
|
/// state swap.
|
|
pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
|
|
self.states.swap(id1.as_usize(), id2.as_usize());
|
|
}
|
|
|
|
/// Re-maps all state IDs in this NFA according to the `map` function
|
|
/// given.
|
|
pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
|
|
let alphabet_len = self.byte_classes.alphabet_len();
|
|
for state in self.states.iter_mut() {
|
|
state.fail = map(state.fail);
|
|
let mut link = state.sparse;
|
|
while link != StateID::ZERO {
|
|
let t = &mut self.sparse[link];
|
|
t.next = map(t.next);
|
|
link = t.link;
|
|
}
|
|
if state.dense != StateID::ZERO {
|
|
let start = state.dense.as_usize();
|
|
for next in self.dense[start..][..alphabet_len].iter_mut() {
|
|
*next = map(*next);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Iterate over all of the transitions for the given state ID.
|
|
pub(crate) fn iter_trans(
|
|
&self,
|
|
sid: StateID,
|
|
) -> impl Iterator<Item = Transition> + '_ {
|
|
let mut link = self.states[sid].sparse;
|
|
core::iter::from_fn(move || {
|
|
if link == StateID::ZERO {
|
|
return None;
|
|
}
|
|
let t = self.sparse[link];
|
|
link = t.link;
|
|
Some(t)
|
|
})
|
|
}
|
|
|
|
/// Iterate over all of the matches for the given state ID.
|
|
pub(crate) fn iter_matches(
|
|
&self,
|
|
sid: StateID,
|
|
) -> impl Iterator<Item = PatternID> + '_ {
|
|
let mut link = self.states[sid].matches;
|
|
core::iter::from_fn(move || {
|
|
if link == StateID::ZERO {
|
|
return None;
|
|
}
|
|
let m = self.matches[link];
|
|
link = m.link;
|
|
Some(m.pid)
|
|
})
|
|
}
|
|
|
|
/// Return the link following the one given. If the one given is the last
|
|
/// link for the given state, then return `None`.
|
|
///
|
|
/// If no previous link is given, then this returns the first link in the
|
|
/// state, if one exists.
|
|
///
|
|
/// This is useful for manually iterating over the transitions in a single
|
|
/// state without borrowing the NFA. This permits mutating other parts of
|
|
/// the NFA during iteration. Namely, one can access the transition pointed
|
|
/// to by the link via `self.sparse[link]`.
|
|
fn next_link(
|
|
&self,
|
|
sid: StateID,
|
|
prev: Option<StateID>,
|
|
) -> Option<StateID> {
|
|
let link =
|
|
prev.map_or(self.states[sid].sparse, |p| self.sparse[p].link);
|
|
if link == StateID::ZERO {
|
|
None
|
|
} else {
|
|
Some(link)
|
|
}
|
|
}
|
|
|
|
/// Follow the transition for the given byte in the given state. If no such
|
|
/// transition exists, then the FAIL state ID is returned.
|
|
#[inline(always)]
|
|
fn follow_transition(&self, sid: StateID, byte: u8) -> StateID {
|
|
let s = &self.states[sid];
|
|
// This is a special case that targets starting states and states
|
|
// near a start state. Namely, after the initial trie is constructed,
|
|
// we look for states close to the start state to convert to a dense
|
|
// representation for their transitions. This winds up using a lot more
|
|
// memory per state in exchange for faster transition lookups. But
|
|
// since we only do this for a small number of states (by default), the
|
|
// memory usage is usually minimal.
|
|
//
|
|
// This has *massive* benefit when executing searches because the
|
|
// unanchored starting state is by far the hottest state and is
|
|
// frequently visited. Moreover, the 'for' loop below that works
|
|
// decently on an actually sparse state is disastrous on a state that
|
|
// is nearly or completely dense.
|
|
if s.dense == StateID::ZERO {
|
|
self.follow_transition_sparse(sid, byte)
|
|
} else {
|
|
let class = usize::from(self.byte_classes.get(byte));
|
|
self.dense[s.dense.as_usize() + class]
|
|
}
|
|
}
|
|
|
|
/// Like `follow_transition`, but always uses the sparse representation.
|
|
#[inline(always)]
|
|
fn follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID {
|
|
for t in self.iter_trans(sid) {
|
|
if byte <= t.byte {
|
|
if byte == t.byte {
|
|
return t.next;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
NFA::FAIL
|
|
}
|
|
|
|
/// Set the transition for the given byte to the state ID given.
|
|
///
|
|
/// Note that one should not set transitions to the FAIL state. It is not
|
|
/// technically incorrect, but it wastes space. If a transition is not
|
|
/// defined, then it is automatically assumed to lead to the FAIL state.
|
|
fn add_transition(
|
|
&mut self,
|
|
prev: StateID,
|
|
byte: u8,
|
|
next: StateID,
|
|
) -> Result<(), BuildError> {
|
|
if self.states[prev].dense != StateID::ZERO {
|
|
let dense = self.states[prev].dense;
|
|
let class = usize::from(self.byte_classes.get(byte));
|
|
self.dense[dense.as_usize() + class] = next;
|
|
}
|
|
|
|
let head = self.states[prev].sparse;
|
|
if head == StateID::ZERO || byte < self.sparse[head].byte {
|
|
let new_link = self.alloc_transition()?;
|
|
self.sparse[new_link] = Transition { byte, next, link: head };
|
|
self.states[prev].sparse = new_link;
|
|
return Ok(());
|
|
} else if byte == self.sparse[head].byte {
|
|
self.sparse[head].next = next;
|
|
return Ok(());
|
|
}
|
|
|
|
// We handled the only cases where the beginning of the transition
|
|
// chain needs to change. At this point, we now know that there is
|
|
// at least one entry in the transition chain and the byte for that
|
|
// transition is less than the byte for the transition we're adding.
|
|
let (mut link_prev, mut link_next) = (head, self.sparse[head].link);
|
|
while link_next != StateID::ZERO && byte > self.sparse[link_next].byte
|
|
{
|
|
link_prev = link_next;
|
|
link_next = self.sparse[link_next].link;
|
|
}
|
|
if link_next == StateID::ZERO || byte < self.sparse[link_next].byte {
|
|
let link = self.alloc_transition()?;
|
|
self.sparse[link] = Transition { byte, next, link: link_next };
|
|
self.sparse[link_prev].link = link;
|
|
} else {
|
|
assert_eq!(byte, self.sparse[link_next].byte);
|
|
self.sparse[link_next].next = next;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// This sets every possible transition (all 255 of them) for the given
|
|
/// state to the name `next` value.
|
|
///
|
|
/// This is useful for efficiently initializing start/dead states.
|
|
///
|
|
/// # Panics
|
|
///
|
|
/// This requires that the state has no transitions added to it already.
|
|
/// If it has any transitions, then this panics. It will also panic if
|
|
/// the state has been densified prior to calling this.
|
|
fn init_full_state(
|
|
&mut self,
|
|
prev: StateID,
|
|
next: StateID,
|
|
) -> Result<(), BuildError> {
|
|
assert_eq!(
|
|
StateID::ZERO,
|
|
self.states[prev].dense,
|
|
"state must not be dense yet"
|
|
);
|
|
assert_eq!(
|
|
StateID::ZERO,
|
|
self.states[prev].sparse,
|
|
"state must have zero transitions"
|
|
);
|
|
let mut prev_link = StateID::ZERO;
|
|
for byte in 0..=255 {
|
|
let new_link = self.alloc_transition()?;
|
|
self.sparse[new_link] =
|
|
Transition { byte, next, link: StateID::ZERO };
|
|
if prev_link == StateID::ZERO {
|
|
self.states[prev].sparse = new_link;
|
|
} else {
|
|
self.sparse[prev_link].link = new_link;
|
|
}
|
|
prev_link = new_link;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Add a match for the given pattern ID to the state for the given ID.
|
|
fn add_match(
|
|
&mut self,
|
|
sid: StateID,
|
|
pid: PatternID,
|
|
) -> Result<(), BuildError> {
|
|
let head = self.states[sid].matches;
|
|
let mut link = head;
|
|
while self.matches[link].link != StateID::ZERO {
|
|
link = self.matches[link].link;
|
|
}
|
|
let new_match_link = self.alloc_match()?;
|
|
self.matches[new_match_link].pid = pid;
|
|
if link == StateID::ZERO {
|
|
self.states[sid].matches = new_match_link;
|
|
} else {
|
|
self.matches[link].link = new_match_link;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Copy matches from the `src` state to the `dst` state. This is useful
|
|
/// when a match state can be reached via a failure transition. In which
|
|
/// case, you'll want to copy the matches (if any) from the state reached
|
|
/// by the failure transition to the original state you were at.
|
|
fn copy_matches(
|
|
&mut self,
|
|
src: StateID,
|
|
dst: StateID,
|
|
) -> Result<(), BuildError> {
|
|
let head_dst = self.states[dst].matches;
|
|
let mut link_dst = head_dst;
|
|
while self.matches[link_dst].link != StateID::ZERO {
|
|
link_dst = self.matches[link_dst].link;
|
|
}
|
|
let mut link_src = self.states[src].matches;
|
|
while link_src != StateID::ZERO {
|
|
let new_match_link =
|
|
StateID::new(self.matches.len()).map_err(|e| {
|
|
BuildError::state_id_overflow(
|
|
StateID::MAX.as_u64(),
|
|
e.attempted(),
|
|
)
|
|
})?;
|
|
self.matches.push(Match {
|
|
pid: self.matches[link_src].pid,
|
|
link: StateID::ZERO,
|
|
});
|
|
if link_dst == StateID::ZERO {
|
|
self.states[dst].matches = new_match_link;
|
|
} else {
|
|
self.matches[link_dst].link = new_match_link;
|
|
}
|
|
|
|
link_dst = new_match_link;
|
|
link_src = self.matches[link_src].link;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Create a new entry in `NFA::trans`, if there's room, and return that
|
|
/// entry's ID. If there's no room, then an error is returned.
|
|
fn alloc_transition(&mut self) -> Result<StateID, BuildError> {
|
|
let id = StateID::new(self.sparse.len()).map_err(|e| {
|
|
BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
|
|
})?;
|
|
self.sparse.push(Transition::default());
|
|
Ok(id)
|
|
}
|
|
|
|
/// Create a new entry in `NFA::matches`, if there's room, and return that
|
|
/// entry's ID. If there's no room, then an error is returned.
|
|
fn alloc_match(&mut self) -> Result<StateID, BuildError> {
|
|
let id = StateID::new(self.matches.len()).map_err(|e| {
|
|
BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
|
|
})?;
|
|
self.matches.push(Match::default());
|
|
Ok(id)
|
|
}
|
|
|
|
/// Create a new set of `N` transitions in this NFA's dense transition
|
|
/// table. The ID return corresponds to the index at which the `N`
|
|
/// transitions begin. So `id+0` is the first transition and `id+(N-1)` is
|
|
/// the last.
|
|
///
|
|
/// `N` is determined via `NFA::byte_classes::alphabet_len`.
|
|
fn alloc_dense_state(&mut self) -> Result<StateID, BuildError> {
|
|
let id = StateID::new(self.dense.len()).map_err(|e| {
|
|
BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
|
|
})?;
|
|
// We use FAIL because it's the correct default. If a state doesn't
|
|
// have a transition defined for every possible byte value, then the
|
|
// transition function should return NFA::FAIL.
|
|
self.dense.extend(
|
|
core::iter::repeat(NFA::FAIL)
|
|
.take(self.byte_classes.alphabet_len()),
|
|
);
|
|
Ok(id)
|
|
}
|
|
|
|
/// Allocate and add a fresh state to the underlying NFA and return its
|
|
/// ID (guaranteed to be one more than the ID of the previously allocated
|
|
/// state). If the ID would overflow `StateID`, then this returns an error.
|
|
fn alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError> {
|
|
// This is OK because we error when building the trie if we see a
|
|
// pattern whose length cannot fit into a 'SmallIndex', and the longest
|
|
// possible depth corresponds to the length of the longest pattern.
|
|
let depth = SmallIndex::new(depth)
|
|
.expect("patterns longer than SmallIndex::MAX are not allowed");
|
|
let id = StateID::new(self.states.len()).map_err(|e| {
|
|
BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
|
|
})?;
|
|
self.states.push(State {
|
|
sparse: StateID::ZERO,
|
|
dense: StateID::ZERO,
|
|
matches: StateID::ZERO,
|
|
fail: self.special.start_unanchored_id,
|
|
depth,
|
|
});
|
|
Ok(id)
|
|
}
|
|
}
|
|
|
|
// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
|
|
// returns a valid state ID given a valid state ID. We otherwise claim that
|
|
// all other methods are correct as well.
|
|
unsafe impl Automaton for NFA {
|
|
#[inline(always)]
|
|
fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
|
|
match anchored {
|
|
Anchored::No => Ok(self.special.start_unanchored_id),
|
|
Anchored::Yes => Ok(self.special.start_anchored_id),
|
|
}
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn next_state(
|
|
&self,
|
|
anchored: Anchored,
|
|
mut sid: StateID,
|
|
byte: u8,
|
|
) -> StateID {
|
|
// This terminates since:
|
|
//
|
|
// 1. state.fail never points to the FAIL state.
|
|
// 2. All state.fail values point to a state closer to the start state.
|
|
// 3. The start state has no transitions to the FAIL state.
|
|
loop {
|
|
let next = self.follow_transition(sid, byte);
|
|
if next != NFA::FAIL {
|
|
return next;
|
|
}
|
|
// For an anchored search, we never follow failure transitions
|
|
// because failure transitions lead us down a path to matching
|
|
// a *proper* suffix of the path we were on. Thus, it can only
|
|
// produce matches that appear after the beginning of the search.
|
|
if anchored.is_anchored() {
|
|
return NFA::DEAD;
|
|
}
|
|
sid = self.states[sid].fail();
|
|
}
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn is_special(&self, sid: StateID) -> bool {
|
|
sid <= self.special.max_special_id
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn is_dead(&self, sid: StateID) -> bool {
|
|
sid == NFA::DEAD
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn is_match(&self, sid: StateID) -> bool {
|
|
// N.B. This returns true when sid==NFA::FAIL but that's okay because
|
|
// NFA::FAIL is not actually a valid state ID from the perspective of
|
|
// the Automaton trait. Namely, it is never returned by 'start_state'
|
|
// or by 'next_state'. So we don't need to care about it here.
|
|
!self.is_dead(sid) && sid <= self.special.max_match_id
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn is_start(&self, sid: StateID) -> bool {
|
|
sid == self.special.start_unanchored_id
|
|
|| sid == self.special.start_anchored_id
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn match_kind(&self) -> MatchKind {
|
|
self.match_kind
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn patterns_len(&self) -> usize {
|
|
self.pattern_lens.len()
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn pattern_len(&self, pid: PatternID) -> usize {
|
|
self.pattern_lens[pid].as_usize()
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn min_pattern_len(&self) -> usize {
|
|
self.min_pattern_len
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn max_pattern_len(&self) -> usize {
|
|
self.max_pattern_len
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn match_len(&self, sid: StateID) -> usize {
|
|
self.iter_matches(sid).count()
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
|
|
self.iter_matches(sid).nth(index).unwrap()
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn memory_usage(&self) -> usize {
|
|
self.states.len() * core::mem::size_of::<State>()
|
|
+ self.sparse.len() * core::mem::size_of::<Transition>()
|
|
+ self.matches.len() * core::mem::size_of::<Match>()
|
|
+ self.dense.len() * StateID::SIZE
|
|
+ self.pattern_lens.len() * SmallIndex::SIZE
|
|
+ self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn prefilter(&self) -> Option<&Prefilter> {
|
|
self.prefilter.as_ref()
|
|
}
|
|
}
|
|
|
|
/// A representation of a sparse NFA state for an Aho-Corasick automaton.
|
|
///
|
|
/// It contains the transitions to the next state, a failure transition for
|
|
/// cases where there exists no other transition for the current input byte
|
|
/// and the matches implied by visiting this state (if any).
|
|
#[derive(Clone, Debug)]
|
|
pub(crate) struct State {
|
|
/// A pointer to `NFA::trans` corresponding to the head of a linked list
|
|
/// containing all of the transitions for this state.
|
|
///
|
|
/// This is `StateID::ZERO` if and only if this state has zero transitions.
|
|
sparse: StateID,
|
|
/// A pointer to a row of `N` transitions in `NFA::dense`. These
|
|
/// transitions correspond precisely to what is obtained by traversing
|
|
/// `sparse`, but permits constant time lookup.
|
|
///
|
|
/// When this is zero (which is true for most states in the default
|
|
/// configuration), then this state has no dense representation.
|
|
///
|
|
/// Note that `N` is equal to `NFA::byte_classes::alphabet_len()`. This is
|
|
/// typically much less than 256 (the maximum value).
|
|
dense: StateID,
|
|
/// A pointer to `NFA::matches` corresponding to the head of a linked list
|
|
/// containing all of the matches for this state.
|
|
///
|
|
/// This is `StateID::ZERO` if and only if this state is not a match state.
|
|
matches: StateID,
|
|
/// The state that should be transitioned to if the current byte in the
|
|
/// haystack does not have a corresponding transition defined in this
|
|
/// state.
|
|
fail: StateID,
|
|
/// The depth of this state. Specifically, this is the distance from this
|
|
/// state to the starting state. (For the special sentinel states DEAD and
|
|
/// FAIL, their depth is always 0.) The depth of a starting state is 0.
|
|
///
|
|
/// Note that depth is currently not used in this non-contiguous NFA. It
|
|
/// may in the future, but it is used in the contiguous NFA. Namely, it
|
|
/// permits an optimization where states near the starting state have their
|
|
/// transitions stored in a dense fashion, but all other states have their
|
|
/// transitions stored in a sparse fashion. (This non-contiguous NFA uses
|
|
/// a sparse representation for all states unconditionally.) In any case,
|
|
/// this is really the only convenient place to compute and store this
|
|
/// information, which we need when building the contiguous NFA.
|
|
depth: SmallIndex,
|
|
}
|
|
|
|
impl State {
|
|
/// Return true if and only if this state is a match state.
|
|
pub(crate) fn is_match(&self) -> bool {
|
|
self.matches != StateID::ZERO
|
|
}
|
|
|
|
/// Returns the failure transition for this state.
|
|
pub(crate) fn fail(&self) -> StateID {
|
|
self.fail
|
|
}
|
|
|
|
/// Returns the depth of this state. That is, the number of transitions
|
|
/// this state is from the start state of the NFA.
|
|
pub(crate) fn depth(&self) -> SmallIndex {
|
|
self.depth
|
|
}
|
|
}
|
|
|
|
/// A single transition in a non-contiguous NFA.
|
|
#[derive(Clone, Copy, Default)]
|
|
#[repr(packed)]
|
|
pub(crate) struct Transition {
|
|
byte: u8,
|
|
next: StateID,
|
|
link: StateID,
|
|
}
|
|
|
|
impl Transition {
|
|
/// Return the byte for which this transition is defined.
|
|
pub(crate) fn byte(&self) -> u8 {
|
|
self.byte
|
|
}
|
|
|
|
/// Return the ID of the state that this transition points to.
|
|
pub(crate) fn next(&self) -> StateID {
|
|
self.next
|
|
}
|
|
|
|
/// Return the ID of the next transition.
|
|
fn link(&self) -> StateID {
|
|
self.link
|
|
}
|
|
}
|
|
|
|
impl core::fmt::Debug for Transition {
|
|
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
|
write!(
|
|
f,
|
|
"Transition(byte: {:X?}, next: {:?}, link: {:?})",
|
|
self.byte,
|
|
self.next().as_usize(),
|
|
self.link().as_usize()
|
|
)
|
|
}
|
|
}
|
|
|
|
/// A single match in a non-contiguous NFA.
|
|
#[derive(Clone, Copy, Default)]
|
|
struct Match {
|
|
pid: PatternID,
|
|
link: StateID,
|
|
}
|
|
|
|
impl Match {
|
|
/// Return the pattern ID for this match.
|
|
pub(crate) fn pattern(&self) -> PatternID {
|
|
self.pid
|
|
}
|
|
|
|
/// Return the ID of the next match.
|
|
fn link(&self) -> StateID {
|
|
self.link
|
|
}
|
|
}
|
|
|
|
impl core::fmt::Debug for Match {
|
|
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
|
write!(
|
|
f,
|
|
"Match(pid: {:?}, link: {:?})",
|
|
self.pattern().as_usize(),
|
|
self.link().as_usize()
|
|
)
|
|
}
|
|
}
|
|
|
|
/// A builder for configuring an Aho-Corasick noncontiguous NFA.
|
|
///
|
|
/// This builder has a subset of the options available to a
|
|
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
|
|
/// their behavior is identical.
|
|
#[derive(Clone, Debug)]
|
|
pub struct Builder {
|
|
match_kind: MatchKind,
|
|
prefilter: bool,
|
|
ascii_case_insensitive: bool,
|
|
dense_depth: usize,
|
|
}
|
|
|
|
impl Default for Builder {
|
|
fn default() -> Builder {
|
|
Builder {
|
|
match_kind: MatchKind::default(),
|
|
prefilter: true,
|
|
ascii_case_insensitive: false,
|
|
dense_depth: 3,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Builder {
|
|
/// Create a new builder for configuring an Aho-Corasick noncontiguous NFA.
|
|
pub fn new() -> Builder {
|
|
Builder::default()
|
|
}
|
|
|
|
/// Build an Aho-Corasick noncontiguous NFA from the given iterator of
|
|
/// patterns.
|
|
///
|
|
/// A builder may be reused to create more NFAs.
|
|
pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
|
|
where
|
|
I: IntoIterator<Item = P>,
|
|
P: AsRef<[u8]>,
|
|
{
|
|
debug!("building non-contiguous NFA");
|
|
let nfa = Compiler::new(self)?.compile(patterns)?;
|
|
debug!(
|
|
"non-contiguous NFA built, <states: {:?}, size: {:?}>",
|
|
nfa.states.len(),
|
|
nfa.memory_usage()
|
|
);
|
|
Ok(nfa)
|
|
}
|
|
|
|
/// Set the desired match semantics.
|
|
///
|
|
/// See
|
|
/// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
|
|
/// for more documentation and examples.
|
|
pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
|
|
self.match_kind = kind;
|
|
self
|
|
}
|
|
|
|
/// Enable ASCII-aware case insensitive matching.
|
|
///
|
|
/// See
|
|
/// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
|
|
/// for more documentation and examples.
|
|
pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
|
|
self.ascii_case_insensitive = yes;
|
|
self
|
|
}
|
|
|
|
/// Set the limit on how many states use a dense representation for their
|
|
/// transitions. Other states will generally use a sparse representation.
|
|
///
|
|
/// See
|
|
/// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth)
|
|
/// for more documentation and examples.
|
|
pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
|
|
self.dense_depth = depth;
|
|
self
|
|
}
|
|
|
|
/// Enable heuristic prefilter optimizations.
|
|
///
|
|
/// See
|
|
/// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
|
|
/// for more documentation and examples.
|
|
pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
|
|
self.prefilter = yes;
|
|
self
|
|
}
|
|
}
|
|
|
|
/// A compiler uses a builder configuration and builds up the NFA formulation
|
|
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
|
|
/// formulation described in textbooks, with some tweaks to support leftmost
|
|
/// searching.
|
|
#[derive(Debug)]
|
|
struct Compiler<'a> {
|
|
builder: &'a Builder,
|
|
prefilter: prefilter::Builder,
|
|
nfa: NFA,
|
|
byteset: ByteClassSet,
|
|
}
|
|
|
|
impl<'a> Compiler<'a> {
|
|
fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> {
|
|
let prefilter = prefilter::Builder::new(builder.match_kind)
|
|
.ascii_case_insensitive(builder.ascii_case_insensitive);
|
|
Ok(Compiler {
|
|
builder,
|
|
prefilter,
|
|
nfa: NFA {
|
|
match_kind: builder.match_kind,
|
|
states: vec![],
|
|
sparse: vec![],
|
|
dense: vec![],
|
|
matches: vec![],
|
|
pattern_lens: vec![],
|
|
prefilter: None,
|
|
byte_classes: ByteClasses::singletons(),
|
|
min_pattern_len: usize::MAX,
|
|
max_pattern_len: 0,
|
|
special: Special::zero(),
|
|
},
|
|
byteset: ByteClassSet::empty(),
|
|
})
|
|
}
|
|
|
|
fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
|
|
where
|
|
I: IntoIterator<Item = P>,
|
|
P: AsRef<[u8]>,
|
|
{
|
|
// Add dummy transition/match links, so that no valid link will point
|
|
// to another link at index 0.
|
|
self.nfa.sparse.push(Transition::default());
|
|
self.nfa.matches.push(Match::default());
|
|
// Add a dummy dense transition so that no states can have dense==0
|
|
// represent a valid pointer to dense transitions. This permits
|
|
// dense==0 to be a sentinel indicating "no dense transitions."
|
|
self.nfa.dense.push(NFA::DEAD);
|
|
// the dead state, only used for leftmost and fixed to id==0
|
|
self.nfa.alloc_state(0)?;
|
|
// the fail state, which is never entered and fixed to id==1
|
|
self.nfa.alloc_state(0)?;
|
|
// unanchored start state, initially fixed to id==2 but later shuffled
|
|
// to appear after all non-start match states.
|
|
self.nfa.special.start_unanchored_id = self.nfa.alloc_state(0)?;
|
|
// anchored start state, initially fixed to id==3 but later shuffled
|
|
// to appear after unanchored start state.
|
|
self.nfa.special.start_anchored_id = self.nfa.alloc_state(0)?;
|
|
// Initialize the unanchored starting state in order to make it dense,
|
|
// and thus make transition lookups on this state faster.
|
|
self.init_unanchored_start_state()?;
|
|
// Set all transitions on the DEAD state to point to itself. This way,
|
|
// the DEAD state can never be escaped. It MUST be used as a sentinel
|
|
// in any correct search.
|
|
self.add_dead_state_loop()?;
|
|
// Build the base trie from the given patterns.
|
|
self.build_trie(patterns)?;
|
|
self.nfa.states.shrink_to_fit();
|
|
// Turn our set of bytes into equivalent classes. This NFA
|
|
// implementation uses byte classes only for states that use a dense
|
|
// representation of transitions. (And that's why this comes before
|
|
// `self.densify()`, as the byte classes need to be set first.)
|
|
self.nfa.byte_classes = self.byteset.byte_classes();
|
|
// Add transitions (and maybe matches) to the anchored starting state.
|
|
// The anchored starting state is used for anchored searches. The only
|
|
// mechanical difference between it and the unanchored start state is
|
|
// that missing transitions map to the DEAD state instead of the FAIL
|
|
// state.
|
|
self.set_anchored_start_state()?;
|
|
// Rewrite transitions to the FAIL state on the unanchored start state
|
|
// as self-transitions. This keeps the start state active at all times.
|
|
self.add_unanchored_start_state_loop();
|
|
// Make some (possibly zero) states use a dense representation for
|
|
// transitions. It's important to do this right after the states
|
|
// and non-failure transitions are solidified. That way, subsequent
|
|
// accesses (particularly `fill_failure_transitions`) will benefit from
|
|
// the faster transition lookup in densified states.
|
|
self.densify()?;
|
|
// The meat of the Aho-Corasick algorithm: compute and write failure
|
|
// transitions. i.e., the state to move to when a transition isn't
|
|
// defined in the current state. These are epsilon transitions and thus
|
|
// make this formulation an NFA.
|
|
self.fill_failure_transitions()?;
|
|
// Handle a special case under leftmost semantics when at least one
|
|
// of the patterns is the empty string.
|
|
self.close_start_state_loop_for_leftmost();
|
|
// Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
|
|
// NON-MATCH, ... This permits us to very quickly query the type of
|
|
// the state we're currently in during a search.
|
|
self.shuffle();
|
|
self.nfa.prefilter = self.prefilter.build();
|
|
// Store the maximum ID of all *relevant* special states. Start states
|
|
// are only relevant when we have a prefilter, otherwise, there is zero
|
|
// reason to care about whether a state is a start state or not during
|
|
// a search. Indeed, without a prefilter, we are careful to explicitly
|
|
// NOT care about start states, otherwise the search can ping pong
|
|
// between the unrolled loop and the handling of special-status states
|
|
// and destroy perf.
|
|
self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
|
|
// Why the anchored starting state? Because we always put it
|
|
// after the unanchored starting state and it is therefore the
|
|
// maximum. Why put unanchored followed by anchored? No particular
|
|
// reason, but that's how the states are logically organized in the
|
|
// Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
|
|
self.nfa.special.start_anchored_id
|
|
} else {
|
|
self.nfa.special.max_match_id
|
|
};
|
|
self.nfa.sparse.shrink_to_fit();
|
|
self.nfa.dense.shrink_to_fit();
|
|
self.nfa.matches.shrink_to_fit();
|
|
self.nfa.pattern_lens.shrink_to_fit();
|
|
Ok(self.nfa)
|
|
}
|
|
|
|
/// This sets up the initial prefix trie that makes up the Aho-Corasick
|
|
/// automaton. Effectively, it creates the basic structure of the
|
|
/// automaton, where every pattern given has a path from the start state to
|
|
/// the end of the pattern.
|
|
fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
|
|
where
|
|
I: IntoIterator<Item = P>,
|
|
P: AsRef<[u8]>,
|
|
{
|
|
'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
|
|
let pid = PatternID::new(i).map_err(|e| {
|
|
BuildError::pattern_id_overflow(
|
|
PatternID::MAX.as_u64(),
|
|
e.attempted(),
|
|
)
|
|
})?;
|
|
let pat = pat.as_ref();
|
|
let patlen = SmallIndex::new(pat.len())
|
|
.map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
|
|
self.nfa.min_pattern_len =
|
|
core::cmp::min(self.nfa.min_pattern_len, pat.len());
|
|
self.nfa.max_pattern_len =
|
|
core::cmp::max(self.nfa.max_pattern_len, pat.len());
|
|
assert_eq!(
|
|
i,
|
|
self.nfa.pattern_lens.len(),
|
|
"expected number of patterns to match pattern ID"
|
|
);
|
|
self.nfa.pattern_lens.push(patlen);
|
|
// We add the pattern to the prefilter here because the pattern
|
|
// ID in the prefilter is determined with respect to the patterns
|
|
// added to the prefilter. That is, it isn't the ID we have here,
|
|
// but the one determined by its own accounting of patterns.
|
|
// To ensure they line up, we add every pattern we see to the
|
|
// prefilter, even if some patterns ultimately are impossible to
|
|
// match (in leftmost-first semantics specifically).
|
|
//
|
|
// Another way of doing this would be to expose an API in the
|
|
// prefilter to permit setting your own pattern IDs. Or to just use
|
|
// our own map and go between them. But this case is sufficiently
|
|
// rare that we don't bother and just make sure they're in sync.
|
|
if self.builder.prefilter {
|
|
self.prefilter.add(pat);
|
|
}
|
|
|
|
let mut prev = self.nfa.special.start_unanchored_id;
|
|
let mut saw_match = false;
|
|
for (depth, &b) in pat.iter().enumerate() {
|
|
// When leftmost-first match semantics are requested, we
|
|
// specifically stop adding patterns when a previously added
|
|
// pattern is a prefix of it. We avoid adding it because
|
|
// leftmost-first semantics imply that the pattern can never
|
|
// match. This is not just an optimization to save space! It
|
|
// is necessary for correctness. In fact, this is the only
|
|
// difference in the automaton between the implementations for
|
|
// leftmost-first and leftmost-longest.
|
|
saw_match = saw_match || self.nfa.states[prev].is_match();
|
|
if self.builder.match_kind.is_leftmost_first() && saw_match {
|
|
// Skip to the next pattern immediately. This avoids
|
|
// incorrectly adding a match after this loop terminates.
|
|
continue 'PATTERNS;
|
|
}
|
|
|
|
// Add this byte to our equivalence classes. These don't
|
|
// get used while building the trie, but other Aho-Corasick
|
|
// implementations may use them.
|
|
self.byteset.set_range(b, b);
|
|
if self.builder.ascii_case_insensitive {
|
|
let b = opposite_ascii_case(b);
|
|
self.byteset.set_range(b, b);
|
|
}
|
|
|
|
// If the transition from prev using the current byte already
|
|
// exists, then just move through it. Otherwise, add a new
|
|
// state. We track the depth here so that we can determine
|
|
// how to represent transitions. States near the start state
|
|
// use a dense representation that uses more memory but is
|
|
// faster. Other states use a sparse representation that uses
|
|
// less memory but is slower.
|
|
let next = self.nfa.follow_transition(prev, b);
|
|
if next != NFA::FAIL {
|
|
prev = next;
|
|
} else {
|
|
let next = self.nfa.alloc_state(depth)?;
|
|
self.nfa.add_transition(prev, b, next)?;
|
|
if self.builder.ascii_case_insensitive {
|
|
let b = opposite_ascii_case(b);
|
|
self.nfa.add_transition(prev, b, next)?;
|
|
}
|
|
prev = next;
|
|
}
|
|
}
|
|
// Once the pattern has been added, log the match in the final
|
|
// state that it reached.
|
|
self.nfa.add_match(prev, pid)?;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// This routine creates failure transitions according to the standard
|
|
/// textbook formulation of the Aho-Corasick algorithm, with a couple small
|
|
/// tweaks to support "leftmost" semantics.
|
|
///
|
|
/// Building failure transitions is the most interesting part of building
|
|
/// the Aho-Corasick automaton, because they are what allow searches to
|
|
/// be performed in linear time. Specifically, a failure transition is
|
|
/// a single transition associated with each state that points back to
|
|
/// the longest proper suffix of the pattern being searched. The failure
|
|
/// transition is followed whenever there exists no transition on the
|
|
/// current state for the current input byte. If there is no other proper
|
|
/// suffix, then the failure transition points back to the starting state.
|
|
///
|
|
/// For example, let's say we built an Aho-Corasick automaton with the
|
|
/// following patterns: 'abcd' and 'cef'. The trie looks like this:
|
|
///
|
|
/// ```ignore
|
|
/// a - S1 - b - S2 - c - S3 - d - S4*
|
|
/// /
|
|
/// S0 - c - S5 - e - S6 - f - S7*
|
|
/// ```
|
|
///
|
|
/// At this point, it should be fairly straight-forward to see how this
|
|
/// trie can be used in a simplistic way. At any given position in the
|
|
/// text we're searching (called the "subject" string), all we need to do
|
|
/// is follow the transitions in the trie by consuming one transition for
|
|
/// each byte in the subject string. If we reach a match state, then we can
|
|
/// report that location as a match.
|
|
///
|
|
/// The trick comes when searching a subject string like 'abcef'. We'll
|
|
/// initially follow the transition from S0 to S1 and wind up in S3 after
|
|
/// observng the 'c' byte. At this point, the next byte is 'e' but state
|
|
/// S3 has no transition for 'e', so the search fails. We then would need
|
|
/// to restart the search at the next position in 'abcef', which
|
|
/// corresponds to 'b'. The match would fail, but the next search starting
|
|
/// at 'c' would finally succeed. The problem with this approach is that
|
|
/// we wind up searching the subject string potentially many times. In
|
|
/// effect, this makes the algorithm have worst case `O(n * m)` complexity,
|
|
/// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
|
|
/// like to achieve a `O(n + m)` worst case complexity.
|
|
///
|
|
/// This is where failure transitions come in. Instead of dying at S3 in
|
|
/// the first search, the automaton can instruct the search to move to
|
|
/// another part of the automaton that corresponds to a suffix of what
|
|
/// we've seen so far. Recall that we've seen 'abc' in the subject string,
|
|
/// and the automaton does indeed have a non-empty suffix, 'c', that could
|
|
/// potentially lead to another match. Thus, the actual Aho-Corasick
|
|
/// automaton for our patterns in this case looks like this:
|
|
///
|
|
/// ```ignore
|
|
/// a - S1 - b - S2 - c - S3 - d - S4*
|
|
/// / /
|
|
/// / ----------------
|
|
/// / /
|
|
/// S0 - c - S5 - e - S6 - f - S7*
|
|
/// ```
|
|
///
|
|
/// That is, we have a failure transition from S3 to S5, which is followed
|
|
/// exactly in cases when we are in state S3 but see any byte other than
|
|
/// 'd' (that is, we've "failed" to find a match in this portion of our
|
|
/// trie). We know we can transition back to S5 because we've already seen
|
|
/// a 'c' byte, so we don't need to re-scan it. We can then pick back up
|
|
/// with the search starting at S5 and complete our match.
|
|
///
|
|
/// Adding failure transitions to a trie is fairly simple, but subtle. The
|
|
/// key issue is that you might have multiple failure transition that you
|
|
/// need to follow. For example, look at the trie for the patterns
|
|
/// 'abcd', 'b', 'bcd' and 'cd':
|
|
///
|
|
/// ```ignore
|
|
/// - a - S1 - b - S2* - c - S3 - d - S4*
|
|
/// / / /
|
|
/// / ------- -------
|
|
/// / / /
|
|
/// S0 --- b - S5* - c - S6 - d - S7*
|
|
/// \ /
|
|
/// \ --------
|
|
/// \ /
|
|
/// - c - S8 - d - S9*
|
|
/// ```
|
|
///
|
|
/// The failure transitions for this trie are defined from S2 to S5,
|
|
/// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
|
|
/// corresponds to a match, since its failure transition to S5 is itself
|
|
/// a match state.
|
|
///
|
|
/// Perhaps simplest way to think about adding these failure transitions
|
|
/// is recursively. That is, if you know the failure transitions for every
|
|
/// possible previous state that could be visited (e.g., when computing the
|
|
/// failure transition for S3, you already know the failure transitions
|
|
/// for S0, S1 and S2), then you can simply follow the failure transition
|
|
/// of the previous state and check whether the incoming transition is
|
|
/// defined after following the failure transition.
|
|
///
|
|
/// For example, when determining the failure state for S3, by our
|
|
/// assumptions, we already know that there is a failure transition from
|
|
/// S2 (the previous state) to S5. So we follow that transition and check
|
|
/// whether the transition connecting S2 to S3 is defined. Indeed, it is,
|
|
/// as there is a transition from S5 to S6 for the byte 'c'. If no such
|
|
/// transition existed, we could keep following the failure transitions
|
|
/// until we reach the start state, which is the failure transition for
|
|
/// every state that has no corresponding proper suffix.
|
|
///
|
|
/// We don't actually use recursion to implement this, but instead, use a
|
|
/// breadth first search of the automaton. Our base case is the start
|
|
/// state, whose failure transition is just a transition to itself.
|
|
///
|
|
/// When building a leftmost automaton, we proceed as above, but only
|
|
/// include a subset of failure transitions. Namely, we omit any failure
|
|
/// transitions that appear after a match state in the trie. This is
|
|
/// because failure transitions always point back to a proper suffix of
|
|
/// what has been seen so far. Thus, following a failure transition after
|
|
/// a match implies looking for a match that starts after the one that has
|
|
/// already been seen, which is of course therefore not the leftmost match.
|
|
///
|
|
/// N.B. I came up with this algorithm on my own, and after scouring all of
|
|
/// the other AC implementations I know of (Perl, Snort, many on GitHub).
|
|
/// I couldn't find any that implement leftmost semantics like this.
|
|
/// Perl of course needs leftmost-first semantics, but they implement it
|
|
/// with a seeming hack at *search* time instead of encoding it into the
|
|
/// automaton. There are also a couple Java libraries that support leftmost
|
|
/// longest semantics, but they do it by building a queue of matches at
|
|
/// search time, which is even worse than what Perl is doing. ---AG
|
|
fn fill_failure_transitions(&mut self) -> Result<(), BuildError> {
|
|
let is_leftmost = self.builder.match_kind.is_leftmost();
|
|
let start_uid = self.nfa.special.start_unanchored_id;
|
|
// Initialize the queue for breadth first search with all transitions
|
|
// out of the start state. We handle the start state specially because
|
|
// we only want to follow non-self transitions. If we followed self
|
|
// transitions, then this would never terminate.
|
|
let mut queue = VecDeque::new();
|
|
let mut seen = self.queued_set();
|
|
let mut prev_link = None;
|
|
while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
|
|
prev_link = Some(link);
|
|
let t = self.nfa.sparse[link];
|
|
|
|
// Skip anything we've seen before and any self-transitions on the
|
|
// start state.
|
|
if start_uid == t.next() || seen.contains(t.next) {
|
|
continue;
|
|
}
|
|
queue.push_back(t.next);
|
|
seen.insert(t.next);
|
|
// Under leftmost semantics, if a state immediately following
|
|
// the start state is a match state, then we never want to
|
|
// follow its failure transition since the failure transition
|
|
// necessarily leads back to the start state, which we never
|
|
// want to do for leftmost matching after a match has been
|
|
// found.
|
|
//
|
|
// We apply the same logic to non-start states below as well.
|
|
if is_leftmost && self.nfa.states[t.next].is_match() {
|
|
self.nfa.states[t.next].fail = NFA::DEAD;
|
|
}
|
|
}
|
|
while let Some(id) = queue.pop_front() {
|
|
let mut prev_link = None;
|
|
while let Some(link) = self.nfa.next_link(id, prev_link) {
|
|
prev_link = Some(link);
|
|
let t = self.nfa.sparse[link];
|
|
|
|
if seen.contains(t.next) {
|
|
// The only way to visit a duplicate state in a transition
|
|
// list is when ASCII case insensitivity is enabled. In
|
|
// this case, we want to skip it since it's redundant work.
|
|
// But it would also end up duplicating matches, which
|
|
// results in reporting duplicate matches in some cases.
|
|
// See the 'acasei010' regression test.
|
|
continue;
|
|
}
|
|
queue.push_back(t.next);
|
|
seen.insert(t.next);
|
|
|
|
// As above for start states, under leftmost semantics, once
|
|
// we see a match all subsequent states should have no failure
|
|
// transitions because failure transitions always imply looking
|
|
// for a match that is a suffix of what has been seen so far
|
|
// (where "seen so far" corresponds to the string formed by
|
|
// following the transitions from the start state to the
|
|
// current state). Under leftmost semantics, we specifically do
|
|
// not want to allow this to happen because we always want to
|
|
// report the match found at the leftmost position.
|
|
//
|
|
// The difference between leftmost-first and leftmost-longest
|
|
// occurs previously while we build the trie. For
|
|
// leftmost-first, we simply omit any entries that would
|
|
// otherwise require passing through a match state.
|
|
//
|
|
// Note that for correctness, the failure transition has to be
|
|
// set to the dead state for ALL states following a match, not
|
|
// just the match state itself. However, by setting the failure
|
|
// transition to the dead state on all match states, the dead
|
|
// state will automatically propagate to all subsequent states
|
|
// via the failure state computation below.
|
|
if is_leftmost && self.nfa.states[t.next].is_match() {
|
|
self.nfa.states[t.next].fail = NFA::DEAD;
|
|
continue;
|
|
}
|
|
let mut fail = self.nfa.states[id].fail;
|
|
while self.nfa.follow_transition(fail, t.byte) == NFA::FAIL {
|
|
fail = self.nfa.states[fail].fail;
|
|
}
|
|
fail = self.nfa.follow_transition(fail, t.byte);
|
|
self.nfa.states[t.next].fail = fail;
|
|
self.nfa.copy_matches(fail, t.next)?;
|
|
}
|
|
// If the start state is a match state, then this automaton can
|
|
// match the empty string. This implies all states are match states
|
|
// since every position matches the empty string, so copy the
|
|
// matches from the start state to every state. Strictly speaking,
|
|
// this is only necessary for overlapping matches since each
|
|
// non-empty non-start match state needs to report empty matches
|
|
// in addition to its own. For the non-overlapping case, such
|
|
// states only report the first match, which is never empty since
|
|
// it isn't a start state.
|
|
if !is_leftmost {
|
|
self.nfa
|
|
.copy_matches(self.nfa.special.start_unanchored_id, id)?;
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Shuffle the states so that they appear in this sequence:
|
|
///
|
|
/// DEAD, FAIL, MATCH..., START, START, NON-MATCH...
|
|
///
|
|
/// The idea here is that if we know how special states are laid out in our
|
|
/// transition table, then we can determine what "kind" of state we're in
|
|
/// just by comparing our current state ID with a particular value. In this
|
|
/// way, we avoid doing extra memory lookups.
|
|
///
|
|
/// Before shuffling begins, our states look something like this:
|
|
///
|
|
/// DEAD, FAIL, START, START, (MATCH | NON-MATCH)...
|
|
///
|
|
/// So all we need to do is move all of the MATCH states so that they
|
|
/// all appear before any NON-MATCH state, like so:
|
|
///
|
|
/// DEAD, FAIL, START, START, MATCH... NON-MATCH...
|
|
///
|
|
/// Then it's just a simple matter of swapping the two START states with
|
|
/// the last two MATCH states.
|
|
///
|
|
/// (This is the same technique used for fully compiled DFAs in
|
|
/// regex-automata.)
|
|
fn shuffle(&mut self) {
|
|
let old_start_uid = self.nfa.special.start_unanchored_id;
|
|
let old_start_aid = self.nfa.special.start_anchored_id;
|
|
assert!(old_start_uid < old_start_aid);
|
|
assert_eq!(
|
|
3,
|
|
old_start_aid.as_usize(),
|
|
"anchored start state should be at index 3"
|
|
);
|
|
// We implement shuffling by a sequence of pairwise swaps of states.
|
|
// Since we have a number of things referencing states via their
|
|
// IDs and swapping them changes their IDs, we need to record every
|
|
// swap we make so that we can remap IDs. The remapper handles this
|
|
// book-keeping for us.
|
|
let mut remapper = Remapper::new(&self.nfa, 0);
|
|
// The way we proceed here is by moving all match states so that
|
|
// they directly follow the start states. So it will go: DEAD, FAIL,
|
|
// START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
|
|
//
|
|
// To do that, we proceed forward through all states after
|
|
// START-ANCHORED and swap match states so that they appear before all
|
|
// non-match states.
|
|
let mut next_avail = StateID::from(4u8);
|
|
for i in next_avail.as_usize()..self.nfa.states.len() {
|
|
let sid = StateID::new(i).unwrap();
|
|
if !self.nfa.states[sid].is_match() {
|
|
continue;
|
|
}
|
|
remapper.swap(&mut self.nfa, sid, next_avail);
|
|
// The key invariant here is that only non-match states exist
|
|
// between 'next_avail' and 'sid' (with them being potentially
|
|
// equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
|
|
// to land on the leftmost non-match state. (Unless 'next_avail'
|
|
// and 'sid' are equivalent, in which case, a swap will occur but
|
|
// it is a no-op.)
|
|
next_avail = StateID::new(next_avail.one_more()).unwrap();
|
|
}
|
|
// Now we'd like to move the start states to immediately following the
|
|
// match states. (The start states may themselves be match states, but
|
|
// we'll handle that later.) We arrange the states this way so that we
|
|
// don't necessarily need to check whether a state is a start state or
|
|
// not before checking whether a state is a match state. For example,
|
|
// we'd like to be able to write this as our state machine loop:
|
|
//
|
|
// sid = start()
|
|
// for byte in haystack:
|
|
// sid = next(sid, byte)
|
|
// if sid <= nfa.max_start_id:
|
|
// if sid <= nfa.max_dead_id:
|
|
// # search complete
|
|
// elif sid <= nfa.max_match_id:
|
|
// # found match
|
|
//
|
|
// The important context here is that we might not want to look for
|
|
// start states at all. Namely, if a searcher doesn't have a prefilter,
|
|
// then there is no reason to care about whether we're in a start state
|
|
// or not. And indeed, if we did check for it, this very hot loop would
|
|
// ping pong between the special state handling and the main state
|
|
// transition logic. This in turn stalls the CPU by killing branch
|
|
// prediction.
|
|
//
|
|
// So essentially, we really want to be able to "forget" that start
|
|
// states even exist and this is why we put them at the end.
|
|
let new_start_aid =
|
|
StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
|
|
.unwrap();
|
|
remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
|
|
let new_start_uid =
|
|
StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
|
|
.unwrap();
|
|
remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
|
|
let new_max_match_id =
|
|
StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
|
|
.unwrap();
|
|
self.nfa.special.max_match_id = new_max_match_id;
|
|
self.nfa.special.start_unanchored_id = new_start_uid;
|
|
self.nfa.special.start_anchored_id = new_start_aid;
|
|
// If one start state is a match state, then they both are.
|
|
if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
|
|
self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
|
|
}
|
|
remapper.remap(&mut self.nfa);
|
|
}
|
|
|
|
/// Attempts to convert the transition representation of a subset of states
|
|
/// in this NFA from sparse to dense. This can greatly improve search
|
|
/// performance since states with a higher number of transitions tend to
|
|
/// correlate with very active states.
|
|
///
|
|
/// We generally only densify states that are close to the start state.
|
|
/// These tend to be the most active states and thus benefit from a dense
|
|
/// representation more than other states.
|
|
///
|
|
/// This tends to best balance between memory usage and performance. In
|
|
/// particular, the *vast majority* of all states in a typical Aho-Corasick
|
|
/// automaton have only 1 transition and are usually farther from the start
|
|
/// state and thus don't get densified.
|
|
///
|
|
/// Note that this doesn't remove the sparse representation of transitions
|
|
/// for states that are densified. It could be done, but actually removing
|
|
/// entries from `NFA::sparse` is likely more expensive than it's worth.
|
|
fn densify(&mut self) -> Result<(), BuildError> {
|
|
for i in 0..self.nfa.states.len() {
|
|
let sid = StateID::new(i).unwrap();
|
|
// Don't bother densifying states that are only used as sentinels.
|
|
if sid == NFA::DEAD || sid == NFA::FAIL {
|
|
continue;
|
|
}
|
|
// Only densify states that are "close enough" to the start state.
|
|
if self.nfa.states[sid].depth.as_usize()
|
|
>= self.builder.dense_depth
|
|
{
|
|
continue;
|
|
}
|
|
let dense = self.nfa.alloc_dense_state()?;
|
|
let mut prev_link = None;
|
|
while let Some(link) = self.nfa.next_link(sid, prev_link) {
|
|
prev_link = Some(link);
|
|
let t = self.nfa.sparse[link];
|
|
|
|
let class = usize::from(self.nfa.byte_classes.get(t.byte));
|
|
let index = dense.as_usize() + class;
|
|
self.nfa.dense[index] = t.next;
|
|
}
|
|
self.nfa.states[sid].dense = dense;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Returns a set that tracked queued states.
|
|
///
|
|
/// This is only necessary when ASCII case insensitivity is enabled, since
|
|
/// it is the only way to visit the same state twice. Otherwise, this
|
|
/// returns an inert set that nevers adds anything and always reports
|
|
/// `false` for every member test.
|
|
fn queued_set(&self) -> QueuedSet {
|
|
if self.builder.ascii_case_insensitive {
|
|
QueuedSet::active()
|
|
} else {
|
|
QueuedSet::inert()
|
|
}
|
|
}
|
|
|
|
/// Initializes the unanchored start state by making it dense. This is
|
|
/// achieved by explicitly setting every transition to the FAIL state.
|
|
/// This isn't necessary for correctness, since any missing transition is
|
|
/// automatically assumed to be mapped to the FAIL state. We do this to
|
|
/// make the unanchored starting state dense, and thus in turn make
|
|
/// transition lookups on it faster. (Which is worth doing because it's
|
|
/// the most active state.)
|
|
fn init_unanchored_start_state(&mut self) -> Result<(), BuildError> {
|
|
let start_uid = self.nfa.special.start_unanchored_id;
|
|
let start_aid = self.nfa.special.start_anchored_id;
|
|
self.nfa.init_full_state(start_uid, NFA::FAIL)?;
|
|
self.nfa.init_full_state(start_aid, NFA::FAIL)?;
|
|
Ok(())
|
|
}
|
|
|
|
/// Setup the anchored start state by copying all of the transitions and
|
|
/// matches from the unanchored starting state with one change: the failure
|
|
/// transition is changed to the DEAD state, so that for any undefined
|
|
/// transitions, the search will stop.
|
|
fn set_anchored_start_state(&mut self) -> Result<(), BuildError> {
|
|
let start_uid = self.nfa.special.start_unanchored_id;
|
|
let start_aid = self.nfa.special.start_anchored_id;
|
|
let (mut uprev_link, mut aprev_link) = (None, None);
|
|
loop {
|
|
let unext = self.nfa.next_link(start_uid, uprev_link);
|
|
let anext = self.nfa.next_link(start_aid, aprev_link);
|
|
let (ulink, alink) = match (unext, anext) {
|
|
(Some(ulink), Some(alink)) => (ulink, alink),
|
|
(None, None) => break,
|
|
_ => unreachable!(),
|
|
};
|
|
uprev_link = Some(ulink);
|
|
aprev_link = Some(alink);
|
|
self.nfa.sparse[alink].next = self.nfa.sparse[ulink].next;
|
|
}
|
|
self.nfa.copy_matches(start_uid, start_aid)?;
|
|
// This is the main difference between the unanchored and anchored
|
|
// starting states. If a lookup on an anchored starting state fails,
|
|
// then the search should stop.
|
|
//
|
|
// N.B. This assumes that the loop on the unanchored starting state
|
|
// hasn't been created yet.
|
|
self.nfa.states[start_aid].fail = NFA::DEAD;
|
|
Ok(())
|
|
}
|
|
|
|
/// Set the failure transitions on the start state to loop back to the
|
|
/// start state. This effectively permits the Aho-Corasick automaton to
|
|
/// match at any position. This is also required for finding the next
|
|
/// state to terminate, namely, finding the next state should never return
|
|
/// a fail_id.
|
|
///
|
|
/// This must be done after building the initial trie, since trie
|
|
/// construction depends on transitions to `fail_id` to determine whether a
|
|
/// state already exists or not.
|
|
fn add_unanchored_start_state_loop(&mut self) {
|
|
let start_uid = self.nfa.special.start_unanchored_id;
|
|
let mut prev_link = None;
|
|
while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
|
|
prev_link = Some(link);
|
|
if self.nfa.sparse[link].next() == NFA::FAIL {
|
|
self.nfa.sparse[link].next = start_uid;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Remove the start state loop by rewriting any transitions on the start
|
|
/// state back to the start state with transitions to the dead state.
|
|
///
|
|
/// The loop is only closed when two conditions are met: the start state
|
|
/// is a match state and the match kind is leftmost-first or
|
|
/// leftmost-longest.
|
|
///
|
|
/// The reason for this is that under leftmost semantics, a start state
|
|
/// that is also a match implies that we should never restart the search
|
|
/// process. We allow normal transitions out of the start state, but if
|
|
/// none exist, we transition to the dead state, which signals that
|
|
/// searching should stop.
|
|
fn close_start_state_loop_for_leftmost(&mut self) {
|
|
let start_uid = self.nfa.special.start_unanchored_id;
|
|
let start = &mut self.nfa.states[start_uid];
|
|
let dense = start.dense;
|
|
if self.builder.match_kind.is_leftmost() && start.is_match() {
|
|
let mut prev_link = None;
|
|
while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
|
|
prev_link = Some(link);
|
|
if self.nfa.sparse[link].next() == start_uid {
|
|
self.nfa.sparse[link].next = NFA::DEAD;
|
|
if dense != StateID::ZERO {
|
|
let b = self.nfa.sparse[link].byte;
|
|
let class = usize::from(self.nfa.byte_classes.get(b));
|
|
self.nfa.dense[dense.as_usize() + class] = NFA::DEAD;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Sets all transitions on the dead state to point back to the dead state.
|
|
/// Normally, missing transitions map back to the failure state, but the
|
|
/// point of the dead state is to act as a sink that can never be escaped.
|
|
fn add_dead_state_loop(&mut self) -> Result<(), BuildError> {
|
|
self.nfa.init_full_state(NFA::DEAD, NFA::DEAD)?;
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// A set of state identifiers used to avoid revisiting the same state multiple
|
|
/// times when filling in failure transitions.
|
|
///
|
|
/// This set has an "inert" and an "active" mode. When inert, the set never
|
|
/// stores anything and always returns `false` for every member test. This is
|
|
/// useful to avoid the performance and memory overhead of maintaining this
|
|
/// set when it is not needed.
|
|
#[derive(Debug)]
|
|
struct QueuedSet {
|
|
set: Option<BTreeSet<StateID>>,
|
|
}
|
|
|
|
impl QueuedSet {
|
|
/// Return an inert set that returns `false` for every state ID membership
|
|
/// test.
|
|
fn inert() -> QueuedSet {
|
|
QueuedSet { set: None }
|
|
}
|
|
|
|
/// Return an active set that tracks state ID membership.
|
|
fn active() -> QueuedSet {
|
|
QueuedSet { set: Some(BTreeSet::new()) }
|
|
}
|
|
|
|
/// Inserts the given state ID into this set. (If the set is inert, then
|
|
/// this is a no-op.)
|
|
fn insert(&mut self, state_id: StateID) {
|
|
if let Some(ref mut set) = self.set {
|
|
set.insert(state_id);
|
|
}
|
|
}
|
|
|
|
/// Returns true if and only if the given state ID is in this set. If the
|
|
/// set is inert, this always returns false.
|
|
fn contains(&self, state_id: StateID) -> bool {
|
|
match self.set {
|
|
None => false,
|
|
Some(ref set) => set.contains(&state_id),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl core::fmt::Debug for NFA {
|
|
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
|
use crate::{
|
|
automaton::{fmt_state_indicator, sparse_transitions},
|
|
util::debug::DebugByte,
|
|
};
|
|
|
|
writeln!(f, "noncontiguous::NFA(")?;
|
|
for (sid, state) in self.states.iter().with_state_ids() {
|
|
// The FAIL state doesn't actually have space for a state allocated
|
|
// for it, so we have to treat it as a special case.
|
|
if sid == NFA::FAIL {
|
|
writeln!(f, "F {:06}:", sid.as_usize())?;
|
|
continue;
|
|
}
|
|
fmt_state_indicator(f, self, sid)?;
|
|
write!(
|
|
f,
|
|
"{:06}({:06}): ",
|
|
sid.as_usize(),
|
|
state.fail.as_usize()
|
|
)?;
|
|
|
|
let it = sparse_transitions(
|
|
self.iter_trans(sid).map(|t| (t.byte, t.next)),
|
|
)
|
|
.enumerate();
|
|
for (i, (start, end, sid)) in it {
|
|
if i > 0 {
|
|
write!(f, ", ")?;
|
|
}
|
|
if start == end {
|
|
write!(
|
|
f,
|
|
"{:?} => {:?}",
|
|
DebugByte(start),
|
|
sid.as_usize()
|
|
)?;
|
|
} else {
|
|
write!(
|
|
f,
|
|
"{:?}-{:?} => {:?}",
|
|
DebugByte(start),
|
|
DebugByte(end),
|
|
sid.as_usize()
|
|
)?;
|
|
}
|
|
}
|
|
|
|
write!(f, "\n")?;
|
|
if self.is_match(sid) {
|
|
write!(f, " matches: ")?;
|
|
for (i, pid) in self.iter_matches(sid).enumerate() {
|
|
if i > 0 {
|
|
write!(f, ", ")?;
|
|
}
|
|
write!(f, "{}", pid.as_usize())?;
|
|
}
|
|
write!(f, "\n")?;
|
|
}
|
|
}
|
|
writeln!(f, "match kind: {:?}", self.match_kind)?;
|
|
writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
|
|
writeln!(f, "state length: {:?}", self.states.len())?;
|
|
writeln!(f, "pattern length: {:?}", self.patterns_len())?;
|
|
writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
|
|
writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
|
|
writeln!(f, "memory usage: {:?}", self.memory_usage())?;
|
|
writeln!(f, ")")?;
|
|
Ok(())
|
|
}
|
|
}
|