Vendor dependencies for 0.3.0 release

2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

2789
vendor/aho-corasick/src/ahocorasick.rs vendored Normal file

File diff suppressed because it is too large

1608
vendor/aho-corasick/src/automaton.rs vendored Normal file

File diff suppressed because it is too large

835
vendor/aho-corasick/src/dfa.rs vendored Normal file

@@ -0,0 +1,835 @@
/*!
Provides direct access to a DFA implementation of Aho-Corasick.
This is a low-level API that generally only needs to be used in niche
circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
instead of a DFA directly. Using a `DFA` directly is typically only necessary
when one needs access to the [`Automaton`] trait implementation.
*/
use alloc::{vec, vec::Vec};
use crate::{
automaton::Automaton,
nfa::noncontiguous,
util::{
alphabet::ByteClasses,
error::{BuildError, MatchError},
int::{Usize, U32},
prefilter::Prefilter,
primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
search::{Anchored, MatchKind, StartKind},
special::Special,
},
};
/// A DFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using a `DFA` directly is typically only necessary when
/// one needs access to the [`Automaton`] trait implementation.
///
/// This DFA can only be built by first constructing a [`noncontiguous::NFA`].
/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but
/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
///
/// A DFA provides the best possible search performance (in this crate) via two
/// mechanisms:
///
/// * All states use a dense representation for their transitions.
/// * All failure transitions are pre-computed such that they are never
/// explicitly handled at search time.
///
/// These two facts combined mean that every state transition is performed
/// using a constant number of instructions. However, this comes at
/// great cost. The memory usage of a DFA can be quite exorbitant.
/// It is potentially multiple orders of magnitude greater than a
/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange,
/// a DFA will typically have better search speed than a `contiguous::NFA`, but
/// not by orders of magnitude.
///
/// Unless you have a small number of patterns or memory usage is not a concern
/// and search performance is critical, a DFA is usually not the best choice.
///
/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to
/// support both anchored and unanchored search configurations. Namely,
/// since failure transitions are pre-computed, supporting both anchored
/// and unanchored searches requires a duplication of the transition table,
/// making the memory usage of such a DFA even bigger. (The NFAs in this crate
/// unconditionally support both anchored and unanchored searches because there
/// is essentially no added cost for doing so.) It is for this reason that
/// a DFA's support for anchored and unanchored searches can be configured
/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored
/// searches.
///
/// # Example
///
/// This example shows how to build a `DFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
/// automaton::Automaton,
/// dfa::DFA,
/// Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let dfa = DFA::new(patterns).unwrap();
/// assert_eq!(
/// Some(Match::must(0, 1..2)),
/// dfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct DFA {
/// The DFA transition table. IDs in this table are pre-multiplied. So
/// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride,
/// 2*stride, 3*stride, ...
trans: Vec<StateID>,
/// The matches for every match state in this DFA. This is first indexed by
/// state index (so that's `sid >> stride2`) and then by order in which the
/// matches are meant to occur.
matches: Vec<Vec<PatternID>>,
/// The amount of heap memory used, in bytes, by the inner Vecs of
/// 'matches'.
matches_memory_usage: usize,
/// The length of each pattern. This is used to compute the start offset
/// of a match.
pattern_lens: Vec<SmallIndex>,
/// A prefilter for accelerating searches, if one exists.
prefilter: Option<Prefilter>,
/// The match semantics built into this DFA.
match_kind: MatchKind,
/// The total number of states in this DFA.
state_len: usize,
/// The alphabet size, or total number of equivalence classes, for this
/// DFA. Note that the actual number of transitions in each state is
/// stride=2^stride2, where stride is the smallest power of 2 greater than
/// or equal to alphabet_len. We do things this way so that we can use
/// bitshifting to go from a state ID to an index into 'matches'.
alphabet_len: usize,
/// The base-2 exponent such that stride=2^stride2. Given a state
/// index 'i', its state identifier is 'i << stride2'. Given a state
/// identifier 'sid', its state index is 'sid >> stride2'.
stride2: usize,
/// The equivalence classes for this DFA. All transitions are defined on
/// equivalence classes and not on the 256 distinct byte values.
byte_classes: ByteClasses,
/// The length of the shortest pattern in this automaton.
min_pattern_len: usize,
/// The length of the longest pattern in this automaton.
max_pattern_len: usize,
/// The information required to deduce which states are "special" in this
/// DFA.
special: Special,
}
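// A minimal sketch (not from the upstream crate) of the premultiplied-ID
// arithmetic described in the field docs above, assuming stride2 = 3, i.e.
// at most 8 equivalence classes so stride = 2^3 = 8:
//
//   index = 5                  // the sixth state
//   sid   = 5 << 3 = 40        // its premultiplied state ID
//   trans[sid + class]         // one add and one load; no multiply needed
//   sid >> 3 = 5               // back to an index, e.g. into 'matches'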
impl DFA {
/// Create a new Aho-Corasick DFA using the default configuration.
///
/// Use a [`Builder`] if you want to change the configuration.
pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError>
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
DFA::builder().build(patterns)
}
/// A convenience method for returning a new Aho-Corasick DFA builder.
///
/// This usually permits one to just import the `DFA` type.
pub fn builder() -> Builder {
Builder::new()
}
}
impl DFA {
/// A sentinel state ID indicating that a search should stop once it has
/// entered this state. When a search stops, it returns a match if one has
/// been found, otherwise no match. A DFA always has an actual dead state
/// at this ID.
///
/// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state.
/// Namely, the whole point of a DFA is that the FAIL state is completely
/// compiled away. That is, DFA construction involves pre-computing the
/// failure transitions everywhere, such that failure transitions are no
/// longer used at search time. This and the uniformly dense representation
/// are the two most important factors in why it's faster than the NFAs in
/// this crate.
const DEAD: StateID = StateID::new_unchecked(0);
/// Adds the given pattern IDs as matches to the given state and also
/// records the added memory usage.
fn set_matches(
&mut self,
sid: StateID,
pids: impl Iterator<Item = PatternID>,
) {
let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap();
let mut at_least_one = false;
for pid in pids {
self.matches[index].push(pid);
self.matches_memory_usage += PatternID::SIZE;
at_least_one = true;
}
assert!(at_least_one, "match state must have non-empty pids");
}
}
// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
// returns a valid state ID given a valid state ID. We otherwise claim that
// all other methods are correct as well.
unsafe impl Automaton for DFA {
#[inline(always)]
fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
// Either of the start state IDs can be DEAD, in which case, support
// for that type of search is not provided by this DFA. Which start
// state IDs are inactive depends on the 'StartKind' configuration at
// DFA construction time.
match anchored {
Anchored::No => {
let start = self.special.start_unanchored_id;
if start == DFA::DEAD {
Err(MatchError::invalid_input_unanchored())
} else {
Ok(start)
}
}
Anchored::Yes => {
let start = self.special.start_anchored_id;
if start == DFA::DEAD {
Err(MatchError::invalid_input_anchored())
} else {
Ok(start)
}
}
}
}
#[inline(always)]
fn next_state(
&self,
_anchored: Anchored,
sid: StateID,
byte: u8,
) -> StateID {
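// State IDs are premultiplied by the stride, so adding the equivalence
// class to the ID yields the index into the transition table directly.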
let class = self.byte_classes.get(byte);
self.trans[(sid.as_u32() + u32::from(class)).as_usize()]
}
#[inline(always)]
fn is_special(&self, sid: StateID) -> bool {
sid <= self.special.max_special_id
}
#[inline(always)]
fn is_dead(&self, sid: StateID) -> bool {
sid == DFA::DEAD
}
#[inline(always)]
fn is_match(&self, sid: StateID) -> bool {
!self.is_dead(sid) && sid <= self.special.max_match_id
}
#[inline(always)]
fn is_start(&self, sid: StateID) -> bool {
sid == self.special.start_unanchored_id
|| sid == self.special.start_anchored_id
}
#[inline(always)]
fn match_kind(&self) -> MatchKind {
self.match_kind
}
#[inline(always)]
fn patterns_len(&self) -> usize {
self.pattern_lens.len()
}
#[inline(always)]
fn pattern_len(&self, pid: PatternID) -> usize {
self.pattern_lens[pid].as_usize()
}
#[inline(always)]
fn min_pattern_len(&self) -> usize {
self.min_pattern_len
}
#[inline(always)]
fn max_pattern_len(&self) -> usize {
self.max_pattern_len
}
#[inline(always)]
fn match_len(&self, sid: StateID) -> usize {
debug_assert!(self.is_match(sid));
let offset = (sid.as_usize() >> self.stride2) - 2;
self.matches[offset].len()
}
#[inline(always)]
fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
debug_assert!(self.is_match(sid));
let offset = (sid.as_usize() >> self.stride2) - 2;
self.matches[offset][index]
}
#[inline(always)]
fn memory_usage(&self) -> usize {
use core::mem::size_of;
(self.trans.len() * size_of::<u32>())
+ (self.matches.len() * size_of::<Vec<PatternID>>())
+ self.matches_memory_usage
+ (self.pattern_lens.len() * size_of::<SmallIndex>())
+ self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
}
#[inline(always)]
fn prefilter(&self) -> Option<&Prefilter> {
self.prefilter.as_ref()
}
}
impl core::fmt::Debug for DFA {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
use crate::{
automaton::{fmt_state_indicator, sparse_transitions},
util::debug::DebugByte,
};
writeln!(f, "dfa::DFA(")?;
for index in 0..self.state_len {
let sid = StateID::new_unchecked(index << self.stride2);
// While we do currently include the FAIL state in the transition
// table (to simplify construction), it is never actually used. It
// poses problems with the code below because it gets treated as
// a match state incidentally when it is, of course, not. So we
// special case it. The fail state is always the first state after
// the dead state.
//
// If the construction is changed to remove the fail state (it
// probably should be), then this special case should be updated.
if index == 1 {
writeln!(f, "F {:06}:", sid.as_usize())?;
continue;
}
fmt_state_indicator(f, self, sid)?;
write!(f, "{:06}: ", sid.as_usize())?;
let it = (0..self.byte_classes.alphabet_len()).map(|class| {
(class.as_u8(), self.trans[sid.as_usize() + class])
});
for (i, (start, end, next)) in sparse_transitions(it).enumerate() {
if i > 0 {
write!(f, ", ")?;
}
if start == end {
write!(
f,
"{:?} => {:?}",
DebugByte(start),
next.as_usize()
)?;
} else {
write!(
f,
"{:?}-{:?} => {:?}",
DebugByte(start),
DebugByte(end),
next.as_usize()
)?;
}
}
write!(f, "\n")?;
if self.is_match(sid) {
write!(f, " matches: ")?;
for i in 0..self.match_len(sid) {
if i > 0 {
write!(f, ", ")?;
}
let pid = self.match_pattern(sid, i);
write!(f, "{}", pid.as_usize())?;
}
write!(f, "\n")?;
}
}
writeln!(f, "match kind: {:?}", self.match_kind)?;
writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
writeln!(f, "state length: {:?}", self.state_len)?;
writeln!(f, "pattern length: {:?}", self.patterns_len())?;
writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
writeln!(f, "stride: {:?}", 1 << self.stride2)?;
writeln!(f, "byte classes: {:?}", self.byte_classes)?;
writeln!(f, "memory usage: {:?}", self.memory_usage())?;
writeln!(f, ")")?;
Ok(())
}
}
/// A builder for configuring an Aho-Corasick DFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive(Clone, Debug)]
pub struct Builder {
noncontiguous: noncontiguous::Builder,
start_kind: StartKind,
byte_classes: bool,
}
impl Default for Builder {
fn default() -> Builder {
Builder {
noncontiguous: noncontiguous::Builder::new(),
start_kind: StartKind::Unanchored,
byte_classes: true,
}
}
}
impl Builder {
/// Create a new builder for configuring an Aho-Corasick DFA.
pub fn new() -> Builder {
Builder::default()
}
/// Build an Aho-Corasick DFA from the given iterator of patterns.
///
/// A builder may be reused to create more DFAs.
pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError>
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
let nnfa = self.noncontiguous.build(patterns)?;
self.build_from_noncontiguous(&nnfa)
}
/// Build an Aho-Corasick DFA from the given noncontiguous NFA.
///
/// Note that when this method is used, only the `start_kind` and
/// `byte_classes` settings on this builder are respected. The other
/// settings only apply to the initial construction of the Aho-Corasick
/// automaton. Since using this method requires that initial construction
/// has already completed, all settings impacting only initial construction
/// are no longer relevant.
pub fn build_from_noncontiguous(
&self,
nnfa: &noncontiguous::NFA,
) -> Result<DFA, BuildError> {
debug!("building DFA");
let byte_classes = if self.byte_classes {
nnfa.byte_classes().clone()
} else {
ByteClasses::singletons()
};
let state_len = match self.start_kind {
StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(),
StartKind::Both => {
// These unwraps are OK because we know that the number of
// NFA states is < StateID::LIMIT which is in turn less than
// i32::MAX. Thus, there is always room to multiply by 2.
// Finally, the number of states is always at least 4 in the
// NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the
// subtraction of 4 is okay.
//
// Note that we subtract 4 because the "anchored" part of
// the DFA duplicates the unanchored part (without failure
// transitions), but reuses the DEAD, FAIL and START states.
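//
// For example, an NFA with 10 states yields a DFA with
// 10*2 - 4 = 16 states when both start kinds are supported.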
nnfa.states()
.len()
.checked_mul(2)
.unwrap()
.checked_sub(4)
.unwrap()
}
};
let trans_len =
match state_len.checked_shl(byte_classes.stride2().as_u32()) {
Some(trans_len) => trans_len,
None => {
return Err(BuildError::state_id_overflow(
StateID::MAX.as_u64(),
usize::MAX.as_u64(),
))
}
};
StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap())
.map_err(|e| {
BuildError::state_id_overflow(
StateID::MAX.as_u64(),
e.attempted(),
)
})?;
let num_match_states = match self.start_kind {
StartKind::Unanchored | StartKind::Anchored => {
nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap()
}
StartKind::Both => nnfa
.special()
.max_match_id
.as_usize()
.checked_sub(1)
.unwrap()
.checked_mul(2)
.unwrap(),
};
let mut dfa = DFA {
trans: vec![DFA::DEAD; trans_len],
matches: vec![vec![]; num_match_states],
matches_memory_usage: 0,
pattern_lens: nnfa.pattern_lens_raw().to_vec(),
prefilter: nnfa.prefilter().map(|p| p.clone()),
match_kind: nnfa.match_kind(),
state_len,
alphabet_len: byte_classes.alphabet_len(),
stride2: byte_classes.stride2(),
byte_classes,
min_pattern_len: nnfa.min_pattern_len(),
max_pattern_len: nnfa.max_pattern_len(),
// The special state IDs are set later.
special: Special::zero(),
};
match self.start_kind {
StartKind::Both => {
self.finish_build_both_starts(nnfa, &mut dfa);
}
StartKind::Unanchored => {
self.finish_build_one_start(Anchored::No, nnfa, &mut dfa);
}
StartKind::Anchored => {
self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa)
}
}
debug!(
"DFA built, <states: {:?}, size: {:?}, \
alphabet len: {:?}, stride: {:?}>",
dfa.state_len,
dfa.memory_usage(),
dfa.byte_classes.alphabet_len(),
dfa.byte_classes.stride(),
);
// The vectors can grow ~twice as big during construction because a
// Vec amortizes growth. But here, let's shrink things back down to
// what we actually need since we're never going to add more to it.
dfa.trans.shrink_to_fit();
dfa.pattern_lens.shrink_to_fit();
dfa.matches.shrink_to_fit();
// TODO: We might also want to shrink each Vec inside of `dfa.matches`,
// or even better, convert it to one contiguous allocation. But I think
// I went with nested allocs for good reason (can't remember), so this
// may be tricky to do. I decided not to shrink them here because it
// might require a fair bit of work to do. It's unclear whether it's
// worth it.
Ok(dfa)
}
/// Finishes building a DFA for either unanchored or anchored searches,
/// but NOT both.
fn finish_build_one_start(
&self,
anchored: Anchored,
nnfa: &noncontiguous::NFA,
dfa: &mut DFA,
) {
// This function always succeeds because we check above that all of the
// states in the NFA can be mapped to DFA state IDs.
let stride2 = dfa.stride2;
let old2new = |oldsid: StateID| {
StateID::new_unchecked(oldsid.as_usize() << stride2)
};
for (oldsid, state) in nnfa.states().iter().with_state_ids() {
let newsid = old2new(oldsid);
if state.is_match() {
dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
}
sparse_iter(
nnfa,
oldsid,
&dfa.byte_classes,
|byte, class, mut oldnextsid| {
if oldnextsid == noncontiguous::NFA::FAIL {
if anchored.is_anchored() {
oldnextsid = noncontiguous::NFA::DEAD;
} else if state.fail() == noncontiguous::NFA::DEAD {
// This is a special case that avoids following
// DEAD transitions in a non-contiguous NFA.
// Following these transitions is pretty slow
// because the non-contiguous NFA will always use
// a sparse representation for it (because the
// DEAD state is usually treated as a sentinel).
// The *vast* majority of failure states are DEAD
// states, so this winds up being pretty slow if
// we go through the non-contiguous NFA state
// transition logic. Instead, just do it ourselves.
oldnextsid = noncontiguous::NFA::DEAD;
} else {
oldnextsid = nnfa.next_state(
Anchored::No,
state.fail(),
byte,
);
}
}
dfa.trans[newsid.as_usize() + usize::from(class)] =
old2new(oldnextsid);
},
);
}
// Now that we've remapped all the IDs in our states, all that's left
// is remapping the special state IDs.
let old = nnfa.special();
let new = &mut dfa.special;
new.max_special_id = old2new(old.max_special_id);
new.max_match_id = old2new(old.max_match_id);
if anchored.is_anchored() {
new.start_unanchored_id = DFA::DEAD;
new.start_anchored_id = old2new(old.start_anchored_id);
} else {
new.start_unanchored_id = old2new(old.start_unanchored_id);
new.start_anchored_id = DFA::DEAD;
}
}
/// Finishes building a DFA that supports BOTH unanchored and anchored
/// searches. It works by interleaving unanchored states with anchored
/// states in the same transition table. This way, we avoid needing to
/// re-shuffle states afterward to ensure that our states still look like
/// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ...
///
/// Honestly this is pretty inscrutable... Simplifications are most
/// welcome.
fn finish_build_both_starts(
&self,
nnfa: &noncontiguous::NFA,
dfa: &mut DFA,
) {
let stride2 = dfa.stride2;
let stride = 1 << stride2;
let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()];
let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()];
let mut is_anchored = vec![false; dfa.state_len];
let mut newsid = DFA::DEAD;
let next_dfa_id =
|sid: StateID| StateID::new_unchecked(sid.as_usize() + stride);
for (oldsid, state) in nnfa.states().iter().with_state_ids() {
if oldsid == noncontiguous::NFA::DEAD
|| oldsid == noncontiguous::NFA::FAIL
{
remap_unanchored[oldsid] = newsid;
remap_anchored[oldsid] = newsid;
newsid = next_dfa_id(newsid);
} else if oldsid == nnfa.special().start_unanchored_id
|| oldsid == nnfa.special().start_anchored_id
{
if oldsid == nnfa.special().start_unanchored_id {
remap_unanchored[oldsid] = newsid;
remap_anchored[oldsid] = DFA::DEAD;
} else {
remap_unanchored[oldsid] = DFA::DEAD;
remap_anchored[oldsid] = newsid;
is_anchored[newsid.as_usize() >> stride2] = true;
}
if state.is_match() {
dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
}
sparse_iter(
nnfa,
oldsid,
&dfa.byte_classes,
|_, class, oldnextsid| {
let class = usize::from(class);
if oldnextsid == noncontiguous::NFA::FAIL {
dfa.trans[newsid.as_usize() + class] = DFA::DEAD;
} else {
dfa.trans[newsid.as_usize() + class] = oldnextsid;
}
},
);
newsid = next_dfa_id(newsid);
} else {
let unewsid = newsid;
newsid = next_dfa_id(newsid);
let anewsid = newsid;
newsid = next_dfa_id(newsid);
remap_unanchored[oldsid] = unewsid;
remap_anchored[oldsid] = anewsid;
is_anchored[anewsid.as_usize() >> stride2] = true;
if state.is_match() {
dfa.set_matches(unewsid, nnfa.iter_matches(oldsid));
dfa.set_matches(anewsid, nnfa.iter_matches(oldsid));
}
sparse_iter(
nnfa,
oldsid,
&dfa.byte_classes,
|byte, class, oldnextsid| {
let class = usize::from(class);
if oldnextsid == noncontiguous::NFA::FAIL {
let oldnextsid =
if state.fail() == noncontiguous::NFA::DEAD {
noncontiguous::NFA::DEAD
} else {
nnfa.next_state(
Anchored::No,
state.fail(),
byte,
)
};
dfa.trans[unewsid.as_usize() + class] = oldnextsid;
} else {
dfa.trans[unewsid.as_usize() + class] = oldnextsid;
dfa.trans[anewsid.as_usize() + class] = oldnextsid;
}
},
);
}
}
for i in 0..dfa.state_len {
let sid = i << stride2;
if is_anchored[i] {
for next in dfa.trans[sid..][..stride].iter_mut() {
*next = remap_anchored[*next];
}
} else {
for next in dfa.trans[sid..][..stride].iter_mut() {
*next = remap_unanchored[*next];
}
}
}
// Now that we've remapped all the IDs in our states, all that's left
// is remapping the special state IDs.
let old = nnfa.special();
let new = &mut dfa.special;
new.max_special_id = remap_anchored[old.max_special_id];
new.max_match_id = remap_anchored[old.max_match_id];
new.start_unanchored_id = remap_unanchored[old.start_unanchored_id];
new.start_anchored_id = remap_anchored[old.start_anchored_id];
}
/// Set the desired match semantics.
///
/// This only applies when using [`Builder::build`] and not
/// [`Builder::build_from_noncontiguous`].
///
/// See
/// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
/// for more documentation and examples.
pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
self.noncontiguous.match_kind(kind);
self
}
/// Enable ASCII-aware case insensitive matching.
///
/// This only applies when using [`Builder::build`] and not
/// [`Builder::build_from_noncontiguous`].
///
/// See
/// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
/// for more documentation and examples.
pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
self.noncontiguous.ascii_case_insensitive(yes);
self
}
/// Enable heuristic prefilter optimizations.
///
/// This only applies when using [`Builder::build`] and not
/// [`Builder::build_from_noncontiguous`].
///
/// See
/// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
/// for more documentation and examples.
pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
self.noncontiguous.prefilter(yes);
self
}
/// Sets the starting state configuration for the automaton.
///
/// See
/// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
/// for more documentation and examples.
pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder {
self.start_kind = kind;
self
}
/// A debug setting for whether to attempt to shrink the size of the
/// automaton's alphabet or not.
///
/// This should never be enabled unless you're debugging an automaton.
/// Namely, disabling byte classes makes transitions easier to reason
/// about, since they use the actual bytes instead of equivalence classes.
/// Disabling this confers no performance benefit at search time.
///
/// See
/// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
/// for more documentation and examples.
pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
self.byte_classes = yes;
self
}
}
/// Iterate over all possible equivalence class transitions in this state.
/// The closure is called for all transitions with a distinct equivalence
/// class, even those not explicitly represented in this sparse state. For
/// any implicitly defined transitions, the given closure is called with
/// the fail state ID.
///
/// The closure is guaranteed to be called precisely
/// `byte_classes.alphabet_len()` times, once for every possible class in
/// ascending order.
fn sparse_iter<F: FnMut(u8, u8, StateID)>(
nnfa: &noncontiguous::NFA,
oldsid: StateID,
classes: &ByteClasses,
mut f: F,
) {
let mut prev_class = None;
let mut byte = 0usize;
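// 'byte' tracks the smallest byte value not yet emitted. Every byte below
// the next explicit transition is an implicit failure transition, and it
// is reported once per distinct equivalence class.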
for t in nnfa.iter_trans(oldsid) {
while byte < usize::from(t.byte()) {
let rep = byte.as_u8();
let class = classes.get(rep);
byte += 1;
if prev_class != Some(class) {
f(rep, class, noncontiguous::NFA::FAIL);
prev_class = Some(class);
}
}
let rep = t.byte();
let class = classes.get(rep);
byte += 1;
if prev_class != Some(class) {
f(rep, class, t.next());
prev_class = Some(class);
}
}
for b in byte..=255 {
let rep = b.as_u8();
let class = classes.get(rep);
if prev_class != Some(class) {
f(rep, class, noncontiguous::NFA::FAIL);
prev_class = Some(class);
}
}
}

326
vendor/aho-corasick/src/lib.rs vendored Normal file

@@ -0,0 +1,326 @@
/*!
A library for finding occurrences of many patterns at once. This library
provides multiple pattern search principally through an implementation of the
[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
which builds a fast finite state machine for executing searches in linear time.
Additionally, this library provides a number of configuration options for
building the automaton that permit controlling the space versus time trade
off. Other features include simple ASCII case insensitive matching, finding
overlapping matches, replacements, searching streams and even searching and
replacing text in streams.
Finally, unlike most other Aho-Corasick implementations, this one
supports enabling [leftmost-first](MatchKind::LeftmostFirst) or
[leftmost-longest](MatchKind::LeftmostLongest) match semantics, using a
(seemingly) novel alternative construction algorithm. For more details on what
match semantics means, see the [`MatchKind`] type.
# Overview
This section gives a brief overview of the primary types in this crate:
* [`AhoCorasick`] is the primary type and represents an Aho-Corasick automaton.
This is the type you use to execute searches.
* [`AhoCorasickBuilder`] can be used to build an Aho-Corasick automaton, and
supports configuring a number of options.
* [`Match`] represents a single match reported by an Aho-Corasick automaton.
Each match has two pieces of information: the pattern that matched and the
start and end byte offsets corresponding to the position in the haystack at
which it matched.
# Example: basic searching
This example shows how to search for occurrences of multiple patterns
simultaneously. Each match includes the pattern that matched along with the
byte offsets of the match.
```
use aho_corasick::{AhoCorasick, PatternID};
let patterns = &["apple", "maple", "Snapple"];
let haystack = "Nobody likes maple in their apple flavored Snapple.";
let ac = AhoCorasick::new(patterns).unwrap();
let mut matches = vec![];
for mat in ac.find_iter(haystack) {
matches.push((mat.pattern(), mat.start(), mat.end()));
}
assert_eq!(matches, vec![
(PatternID::must(1), 13, 18),
(PatternID::must(0), 28, 33),
(PatternID::must(2), 43, 50),
]);
```
# Example: case insensitivity
This is like the previous example, but matches `Snapple` case insensitively
using `AhoCorasickBuilder`:
```
use aho_corasick::{AhoCorasick, PatternID};
let patterns = &["apple", "maple", "snapple"];
let haystack = "Nobody likes maple in their apple flavored Snapple.";
let ac = AhoCorasick::builder()
.ascii_case_insensitive(true)
.build(patterns)
.unwrap();
let mut matches = vec![];
for mat in ac.find_iter(haystack) {
matches.push((mat.pattern(), mat.start(), mat.end()));
}
assert_eq!(matches, vec![
(PatternID::must(1), 13, 18),
(PatternID::must(0), 28, 33),
(PatternID::must(2), 43, 50),
]);
```
# Example: replacing matches in a stream
This example shows how to execute a search and replace on a stream without
loading the entire stream into memory first.
```
# #[cfg(feature = "std")] {
use aho_corasick::AhoCorasick;
# fn example() -> Result<(), std::io::Error> {
let patterns = &["fox", "brown", "quick"];
let replace_with = &["sloth", "grey", "slow"];
// In a real example, these might be `std::fs::File`s instead. All you need to
// do is supply a pair of `std::io::Read` and `std::io::Write` implementations.
let rdr = "The quick brown fox.";
let mut wtr = vec![];
let ac = AhoCorasick::new(patterns).unwrap();
ac.try_stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?;
assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
# Ok(()) }; example().unwrap()
# }
```
# Example: finding the leftmost first match
In the textbook description of Aho-Corasick, its formulation is typically
structured such that it reports all possible matches, even when they overlap
with another. In many cases, overlapping matches may not be desired, such as
the case of finding all successive non-overlapping matches like you might with
a standard regular expression.
Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do
this doesn't always work in the expected way, since it will report matches as
soon as they are seen. For example, consider matching the regex `Samwise|Sam`
against the text `Samwise`. Most regex engines (that are Perl-like, or
non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick
algorithm modified for reporting non-overlapping matches will report `Sam`.
A novel contribution of this library is the ability to change the match
semantics of Aho-Corasick (without additional search time overhead) such that
`Samwise` is reported instead. For example, here's the standard approach:
```
use aho_corasick::AhoCorasick;
let patterns = &["Samwise", "Sam"];
let haystack = "Samwise";
let ac = AhoCorasick::new(patterns).unwrap();
let mat = ac.find(haystack).expect("should have a match");
assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
```
And now here's the leftmost-first version, which matches how a Perl-like
regex will work:
```
use aho_corasick::{AhoCorasick, MatchKind};
let patterns = &["Samwise", "Sam"];
let haystack = "Samwise";
let ac = AhoCorasick::builder()
.match_kind(MatchKind::LeftmostFirst)
.build(patterns)
.unwrap();
let mat = ac.find(haystack).expect("should have a match");
assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
```
In addition to leftmost-first semantics, this library also supports
leftmost-longest semantics, which match the POSIX behavior of a regular
expression alternation. See [`MatchKind`] for more details.
# Prefilters
While an Aho-Corasick automaton can perform admirably when compared to more
naive solutions, it is generally slower than more specialized algorithms that
are accelerated using vector instructions such as SIMD.
For that reason, this library will internally use a "prefilter" to attempt
to accelerate searches when possible. Currently, this library has several
different algorithms it might use depending on the patterns provided. Once the
number of patterns gets too big, prefilters are no longer used.
While a prefilter is generally good to have on by default since it works
well in the common case, it can lead to less predictable or even sub-optimal
performance in some cases. For that reason, prefilters can be explicitly
disabled via [`AhoCorasickBuilder::prefilter`].
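For example, the following sketch builds a searcher with the prefilter
disabled; the matches reported are identical either way, only the search
throughput may differ:

```
use aho_corasick::AhoCorasick;

let patterns = &["apple", "maple"];
let ac = AhoCorasick::builder()
.prefilter(false)
.build(patterns)
.unwrap();
assert!(ac.find("an apple a day").is_some());
```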
# Lower level APIs
This crate also provides several sub-modules that collectively expose many of
the implementation details of the main [`AhoCorasick`] type. Most users of this
library can completely ignore the submodules and their contents, but if you
need finer-grained control, some parts of them may be useful to you. Here is
a brief overview of each and why you might want to use them:
* The [`packed`] sub-module contains a lower level API for using fast
vectorized routines for finding a small number of patterns in a haystack.
You might want to use this API when you want to completely side-step using
Aho-Corasick automata. Otherwise, the fast vectorized routines are used
automatically as prefilters for `AhoCorasick` searches whenever possible.
* The [`automaton`] sub-module provides a lower level finite state
machine interface that the various Aho-Corasick implementations in
this crate implement. This sub-module's main contribution is the
[`Automaton`](automaton::Automaton) trait, which permits manually walking the
state transitions of an Aho-Corasick automaton.
* The [`dfa`] and [`nfa`] sub-modules provide DFA and NFA implementations of
the aforementioned `Automaton` trait. The main reason one might want to use
these sub-modules is to get access to a type that implements the `Automaton`
trait. (The top-level `AhoCorasick` type does not implement the `Automaton`
trait.)
As mentioned above, if you aren't sure whether you need these sub-modules,
you should be able to safely ignore them and just focus on the [`AhoCorasick`]
type.
# Crate features
This crate exposes a few features for controlling dependency usage and whether
this crate can be used without the standard library.
* **std** -
Enables support for the standard library. This feature is enabled by
default. When disabled, only `core` and `alloc` are used. At an API
level, enabling `std` enables `std::error::Error` trait impls for the
various error types, and higher level stream search routines such as
[`AhoCorasick::try_stream_find_iter`]. But the `std` feature is also required
to enable vectorized prefilters. Prefilters can greatly accelerate searches,
but generally only apply when the number of patterns is small (less than
~100).
* **perf-literal** -
Enables support for literal prefilters that use vectorized routines from
external crates. This feature is enabled by default. If you're only using
Aho-Corasick for large numbers of patterns or otherwise can abide lower
throughput when searching with a small number of patterns, then it is
reasonable to disable this feature.
* **logging** -
Enables a dependency on the `log` crate and emits messages to aid in
diagnostics. This feature is disabled by default.
*/
#![no_std]
#![deny(missing_docs)]
#![deny(rustdoc::broken_intra_doc_links)]
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
extern crate alloc;
#[cfg(any(test, feature = "std"))]
extern crate std;
#[cfg(doctest)]
doc_comment::doctest!("../README.md");
#[cfg(feature = "std")]
pub use crate::ahocorasick::StreamFindIter;
pub use crate::{
ahocorasick::{
AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, FindIter,
FindOverlappingIter,
},
util::{
error::{BuildError, MatchError, MatchErrorKind},
primitives::{PatternID, PatternIDError},
search::{Anchored, Input, Match, MatchKind, Span, StartKind},
},
};
#[macro_use]
mod macros;
mod ahocorasick;
pub mod automaton;
pub mod dfa;
pub mod nfa;
pub mod packed;
#[cfg(test)]
mod tests;
// I wrote out the module for implementing fst::Automaton only to later realize
// that this would make fst a public dependency and fst is not at 1.0 yet. I
// decided to just keep the code in tree, but build it only during tests.
//
// TODO: I think I've changed my mind again. I'm considering pushing it out
// into either a separate crate or into 'fst' directly as an optional feature.
// #[cfg(test)]
// #[allow(dead_code)]
// mod transducer;
pub(crate) mod util;
#[cfg(test)]
mod testoibits {
use std::panic::{RefUnwindSafe, UnwindSafe};
use super::*;
fn assert_all<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {}
#[test]
fn oibits_main() {
assert_all::<AhoCorasick>();
assert_all::<AhoCorasickBuilder>();
assert_all::<AhoCorasickKind>();
assert_all::<FindIter>();
assert_all::<FindOverlappingIter>();
assert_all::<BuildError>();
assert_all::<MatchError>();
assert_all::<MatchErrorKind>();
assert_all::<Anchored>();
assert_all::<Input>();
assert_all::<Match>();
assert_all::<MatchKind>();
assert_all::<Span>();
assert_all::<StartKind>();
}
#[test]
fn oibits_automaton() {
use crate::{automaton, dfa::DFA};
assert_all::<automaton::FindIter<DFA>>();
assert_all::<automaton::FindOverlappingIter<DFA>>();
#[cfg(feature = "std")]
assert_all::<automaton::StreamFindIter<DFA, std::io::Stdin>>();
assert_all::<automaton::OverlappingState>();
assert_all::<automaton::Prefilter>();
assert_all::<automaton::Candidate>();
}
#[test]
fn oibits_packed() {
use crate::packed;
assert_all::<packed::Config>();
assert_all::<packed::Builder>();
assert_all::<packed::Searcher>();
assert_all::<packed::FindIter>();
assert_all::<packed::MatchKind>();
}
}

18
vendor/aho-corasick/src/macros.rs vendored Normal file

@@ -0,0 +1,18 @@
#![allow(unused_macros)]
macro_rules! log {
($($tt:tt)*) => {
#[cfg(feature = "logging")]
{
$($tt)*
}
}
}
macro_rules! debug {
($($tt:tt)*) => { log!(log::debug!($($tt)*)) }
}
macro_rules! trace {
($($tt:tt)*) => { log!(log::trace!($($tt)*)) }
}
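// Illustrative usage sketch: 'debug!("built {} states", n)' expands to a
// 'log::debug!' call only when the "logging" feature is enabled; otherwise
// the cfg removes the statement entirely (the tokens must still parse).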

1141
vendor/aho-corasick/src/nfa/contiguous.rs vendored Normal file

File diff suppressed because it is too large

40
vendor/aho-corasick/src/nfa/mod.rs vendored Normal file

@@ -0,0 +1,40 @@
/*!
Provides direct access to NFA implementations of Aho-Corasick.
The principal characteristic of an NFA in this crate is that it may
transition through multiple states per byte of haystack. In Aho-Corasick
parlance, NFAs follow failure transitions during a search. In contrast,
a [`DFA`](crate::dfa::DFA) pre-computes all failure transitions during
compilation at the expense of a much bigger memory footprint.
Currently, there are two NFA implementations provided: noncontiguous and
contiguous. The names reflect their internal representation, and consequently,
the trade offs associated with them:
* A [`noncontiguous::NFA`] uses a separate allocation for every NFA state to
represent its transitions in a sparse format. This is ideal for building an
NFA, since it cheaply permits different states to have a different number of
transitions. A noncontiguous NFA is where the main Aho-Corasick construction
algorithm is implemented. All other Aho-Corasick implementations are built by
first constructing a noncontiguous NFA.
* A [`contiguous::NFA`] uses a single allocation to represent all states,
while still encoding most states as sparse states but permitting states near
the starting state to have a dense representation. The dense representation
uses more memory, but permits computing transitions during a search more
quickly. By only making the most active states dense (the states near the
starting state), a contiguous NFA better balances memory usage with search
speed. The single contiguous allocation also uses less overhead per state and
enables compression tricks where most states only use 8 bytes of heap memory.
When given the choice between these two, you almost always want to pick a
contiguous NFA. It takes only a little longer to build, but both its memory
usage and search speed are typically much better than a noncontiguous NFA. A
noncontiguous NFA is useful when prioritizing build times, or when there are
so many patterns that a contiguous NFA could not be built. (Currently, because
of both memory and search speed improvements, a contiguous NFA has a smaller
internal limit on the total number of NFA states it can represent. But you
would likely need to have hundreds of thousands or even millions of patterns
before you hit this limit.)
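A short sketch (mirroring the `DFA` example in the [`dfa`](crate::dfa)
module) of using a contiguous NFA through the
[`Automaton`](crate::automaton::Automaton) trait:

```
use aho_corasick::{automaton::Automaton, nfa::contiguous::NFA, Input, Match};

let nfa = NFA::new(&["b", "abc", "abcd"]).unwrap();
assert_eq!(
Some(Match::must(0, 1..2)),
nfa.try_find(&Input::new("abcd"))?,
);
# Ok::<(), Box<dyn std::error::Error>>(())
```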
*/
pub mod contiguous;
pub mod noncontiguous;

vendor/aho-corasick/src/nfa/noncontiguous.rs vendored Normal file

File diff suppressed because it is too large

687
vendor/aho-corasick/src/packed/api.rs vendored Normal file

@@ -0,0 +1,687 @@
use alloc::sync::Arc;
use crate::{
packed::{pattern::Patterns, rabinkarp::RabinKarp, teddy},
util::search::{Match, Span},
};
/// This is a limit placed on the total number of patterns we're willing to try
/// and match at once. As more sophisticated algorithms are added, this number
/// may be increased.
const PATTERN_LIMIT: usize = 128;
/// A knob for controlling the match semantics of a packed multiple string
/// searcher.
///
/// This differs from the [`MatchKind`](crate::MatchKind) type in the top-level
/// crate module in that it doesn't support "standard" match semantics,
/// and instead only supports leftmost-first or leftmost-longest. Namely,
/// "standard" semantics cannot be easily supported by packed searchers.
///
/// For more information on the distinction between leftmost-first and
/// leftmost-longest, see the docs on the top-level `MatchKind` type.
///
/// Unlike the top-level `MatchKind` type, the default match semantics for this
/// type are leftmost-first.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum MatchKind {
/// Use leftmost-first match semantics, which reports leftmost matches.
/// When there are multiple possible leftmost matches, the match
/// corresponding to the pattern that appeared earlier when constructing
/// the automaton is reported.
///
/// This is the default.
LeftmostFirst,
/// Use leftmost-longest match semantics, which reports leftmost matches.
/// When there are multiple possible leftmost matches, the longest match
/// is chosen.
LeftmostLongest,
}
impl Default for MatchKind {
fn default() -> MatchKind {
MatchKind::LeftmostFirst
}
}
/// The configuration for a packed multiple pattern searcher.
///
/// The configuration is currently limited only to being able to select the
/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
/// future, more knobs may be made available.
///
/// A configuration produces a [`packed::Builder`](Builder), which in turn can
/// be used to construct a [`packed::Searcher`](Searcher) for searching.
///
/// # Example
///
/// This example shows how to use leftmost-longest semantics instead of the
/// default (leftmost-first).
///
/// ```
/// use aho_corasick::{packed::{Config, MatchKind}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Config::new()
/// .match_kind(MatchKind::LeftmostLongest)
/// .builder()
/// .add("foo")
/// .add("foobar")
/// .build()?;
/// let matches: Vec<PatternID> = searcher
/// .find_iter("foobar")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![PatternID::must(1)], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Config {
kind: MatchKind,
force: Option<ForceAlgorithm>,
only_teddy_fat: Option<bool>,
only_teddy_256bit: Option<bool>,
heuristic_pattern_limits: bool,
}
/// An internal option for forcing the use of a particular packed algorithm.
///
/// When an algorithm is forced, if a searcher could not be constructed for it,
/// then no searcher will be returned even if an alternative algorithm would
/// work.
#[derive(Clone, Debug)]
enum ForceAlgorithm {
Teddy,
RabinKarp,
}
impl Default for Config {
fn default() -> Config {
Config::new()
}
}
impl Config {
/// Create a new default configuration. A default configuration uses
/// leftmost-first match semantics.
pub fn new() -> Config {
Config {
kind: MatchKind::LeftmostFirst,
force: None,
only_teddy_fat: None,
only_teddy_256bit: None,
heuristic_pattern_limits: true,
}
}
/// Create a packed builder from this configuration. The builder can be
/// used to accumulate patterns and create a [`Searcher`] from them.
pub fn builder(&self) -> Builder {
Builder::from_config(self.clone())
}
/// Set the match semantics for this configuration.
pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
self.kind = kind;
self
}
/// An undocumented method for forcing the use of the Teddy algorithm.
///
/// This is only exposed for more precise testing and benchmarks. Callers
/// should not use it as it is not part of the API stability guarantees of
/// this crate.
#[doc(hidden)]
pub fn only_teddy(&mut self, yes: bool) -> &mut Config {
if yes {
self.force = Some(ForceAlgorithm::Teddy);
} else {
self.force = None;
}
self
}
/// An undocumented method for forcing the use of the Fat Teddy algorithm.
///
/// This is only exposed for more precise testing and benchmarks. Callers
/// should not use it as it is not part of the API stability guarantees of
/// this crate.
#[doc(hidden)]
pub fn only_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
self.only_teddy_fat = yes;
self
}
/// An undocumented method for forcing the use of SSE (`Some(false)`) or
/// AVX (`Some(true)`) algorithms.
///
/// This is only exposed for more precise testing and benchmarks. Callers
/// should not use it as it is not part of the API stability guarantees of
/// this crate.
#[doc(hidden)]
pub fn only_teddy_256bit(&mut self, yes: Option<bool>) -> &mut Config {
self.only_teddy_256bit = yes;
self
}
/// An undocumented method for forcing the use of the Rabin-Karp algorithm.
///
/// This is only exposed for more precise testing and benchmarks. Callers
/// should not use it as it is not part of the API stability guarantees of
/// this crate.
#[doc(hidden)]
pub fn only_rabin_karp(&mut self, yes: bool) -> &mut Config {
if yes {
self.force = Some(ForceAlgorithm::RabinKarp);
} else {
self.force = None;
}
self
}
/// Request that heuristic limitations on the number of patterns be
/// employed. This is useful to disable for benchmarking, where one wants to
/// explore how Teddy performs on a large number of patterns even if the
/// heuristics would otherwise refuse construction.
///
/// This is enabled by default.
pub fn heuristic_pattern_limits(&mut self, yes: bool) -> &mut Config {
self.heuristic_pattern_limits = yes;
self
}
}
/// A builder for constructing a packed searcher from a collection of patterns.
///
/// # Example
///
/// This example shows how to use a builder to construct a searcher. By
/// default, leftmost-first match semantics are used.
///
/// ```
/// use aho_corasick::{packed::{Builder, MatchKind}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Builder::new()
/// .add("foobar")
/// .add("foo")
/// .build()?;
/// let matches: Vec<PatternID> = searcher
/// .find_iter("foobar")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![PatternID::ZERO], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Builder {
/// The configuration of this builder and subsequent matcher.
config: Config,
/// Set to true if the builder detects that a matcher cannot be built.
inert: bool,
/// The patterns provided by the caller.
patterns: Patterns,
}
impl Builder {
/// Create a new builder for constructing a multi-pattern searcher. This
/// constructor uses the default configuration.
pub fn new() -> Builder {
Builder::from_config(Config::new())
}
fn from_config(config: Config) -> Builder {
Builder { config, inert: false, patterns: Patterns::new() }
}
/// Build a searcher from the patterns added to this builder so far.
pub fn build(&self) -> Option<Searcher> {
if self.inert || self.patterns.is_empty() {
return None;
}
let mut patterns = self.patterns.clone();
patterns.set_match_kind(self.config.kind);
let patterns = Arc::new(patterns);
let rabinkarp = RabinKarp::new(&patterns);
// Effectively, we only want to return a searcher if we can use Teddy,
// since Teddy is our only fast packed searcher at the moment.
// Rabin-Karp is only used when searching haystacks smaller than what
// Teddy can support. Thus, the only way to get a Rabin-Karp searcher
// is to force it using undocumented APIs (for tests/benchmarks).
let (search_kind, minimum_len) = match self.config.force {
None | Some(ForceAlgorithm::Teddy) => {
debug!("trying to build Teddy packed matcher");
let teddy = match self.build_teddy(Arc::clone(&patterns)) {
None => return None,
Some(teddy) => teddy,
};
let minimum_len = teddy.minimum_len();
(SearchKind::Teddy(teddy), minimum_len)
}
Some(ForceAlgorithm::RabinKarp) => {
debug!("using Rabin-Karp packed matcher");
(SearchKind::RabinKarp, 0)
}
};
Some(Searcher { patterns, rabinkarp, search_kind, minimum_len })
}
fn build_teddy(&self, patterns: Arc<Patterns>) -> Option<teddy::Searcher> {
teddy::Builder::new()
.only_256bit(self.config.only_teddy_256bit)
.only_fat(self.config.only_teddy_fat)
.heuristic_pattern_limits(self.config.heuristic_pattern_limits)
.build(patterns)
}
/// Add the given pattern to this set to match.
///
/// The order in which patterns are added is significant. Namely, when
/// using leftmost-first match semantics, then when multiple patterns can
/// match at a particular location, the pattern that was added first is
/// used as the match.
///
/// If the number of patterns added exceeds the amount supported by packed
/// searchers, then the builder will stop accumulating patterns and render
/// itself inert. At this point, constructing a searcher will always return
/// `None`.
pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
if self.inert {
return self;
} else if self.patterns.len() >= PATTERN_LIMIT {
self.inert = true;
self.patterns.reset();
return self;
}
// Just in case PATTERN_LIMIT increases beyond u16::MAX.
assert!(self.patterns.len() <= core::u16::MAX as usize);
let pattern = pattern.as_ref();
if pattern.is_empty() {
self.inert = true;
self.patterns.reset();
return self;
}
self.patterns.add(pattern);
self
}
/// Add the given iterator of patterns to this set to match.
///
/// The iterator must yield elements that can be converted into a `&[u8]`.
///
/// The order in which patterns are added is significant. Namely, when
/// using leftmost-first match semantics, then when multiple patterns can
/// match at a particular location, the pattern that was added first is
/// used as the match.
///
/// If the number of patterns added exceeds the amount supported by packed
/// searchers, then the builder will stop accumulating patterns and render
/// itself inert. At this point, constructing a searcher will always return
/// `None`.
pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
for p in patterns {
self.add(p);
}
self
}
/// Returns the number of patterns added to this builder.
pub fn len(&self) -> usize {
self.patterns.len()
}
/// Returns the length, in bytes, of the shortest pattern added.
pub fn minimum_len(&self) -> usize {
self.patterns.minimum_len()
}
}
impl Default for Builder {
fn default() -> Builder {
Builder::new()
}
}
/// A packed searcher for quickly finding occurrences of multiple patterns.
///
/// If callers need more flexible construction, or if one wants to change the
/// match semantics (either leftmost-first or leftmost-longest), then one can
/// use the [`Config`] and/or [`Builder`] types for more fine grained control.
///
/// # Example
///
/// This example shows how to create a searcher from an iterator of patterns.
/// By default, leftmost-first match semantics are used.
///
/// ```
/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let matches: Vec<PatternID> = searcher
/// .find_iter("foobar")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![PatternID::ZERO], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Searcher {
patterns: Arc<Patterns>,
rabinkarp: RabinKarp,
search_kind: SearchKind,
minimum_len: usize,
}
#[derive(Clone, Debug)]
enum SearchKind {
Teddy(teddy::Searcher),
RabinKarp,
}
impl Searcher {
/// A convenience function for constructing a searcher from an iterator
/// of things that can be converted to a `&[u8]`.
///
/// If a searcher could not be constructed (either because of an
/// unsupported CPU or because there are too many patterns), then `None`
/// is returned.
///
/// # Example
///
/// Basic usage:
///
/// ```
/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let matches: Vec<PatternID> = searcher
/// .find_iter("foobar")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![PatternID::ZERO], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
pub fn new<I, P>(patterns: I) -> Option<Searcher>
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
Builder::new().extend(patterns).build()
}
/// A convenience function for calling `Config::new()`.
///
/// This is useful for avoiding an additional import.
pub fn config() -> Config {
Config::new()
}
/// A convenience function for calling `Builder::new()`.
///
/// This is useful for avoiding an additional import.
pub fn builder() -> Builder {
Builder::new()
}
/// Return the first occurrence of any of the patterns in this searcher,
/// according to its match semantics, in the given haystack. The `Match`
/// returned will include the identifier of the pattern that matched, which
/// corresponds to the index of the pattern (starting from `0`) in which it
/// was added.
///
/// # Example
///
/// Basic usage:
///
/// ```
/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let mat = searcher.find("foobar")?;
/// assert_eq!(PatternID::ZERO, mat.pattern());
/// assert_eq!(0, mat.start());
/// assert_eq!(6, mat.end());
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[inline]
pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
let haystack = haystack.as_ref();
self.find_in(haystack, Span::from(0..haystack.len()))
}
/// Return the first occurrence of any of the patterns in this searcher,
/// according to its match semantics, in the given haystack starting from
/// the given position.
///
/// The `Match` returned will include the identifier of the pattern that
/// matched, which corresponds to the index of the pattern (starting from
/// `0`) in which it was added. The offsets in the `Match` will be relative
/// to the start of `haystack` (and not `at`).
///
/// # Example
///
/// Basic usage:
///
/// ```
/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID, Span};
///
/// # fn example() -> Option<()> {
/// let haystack = "foofoobar";
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let mat = searcher.find_in(haystack, Span::from(3..haystack.len()))?;
/// assert_eq!(PatternID::ZERO, mat.pattern());
/// assert_eq!(3, mat.start());
/// assert_eq!(9, mat.end());
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[inline]
pub fn find_in<B: AsRef<[u8]>>(
&self,
haystack: B,
span: Span,
) -> Option<Match> {
let haystack = haystack.as_ref();
match self.search_kind {
SearchKind::Teddy(ref teddy) => {
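// Teddy cannot search haystacks shorter than its minimum length,
// so defer to the slower, but always applicable, fallback searcher.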
if haystack[span].len() < teddy.minimum_len() {
return self.find_in_slow(haystack, span);
}
teddy.find(&haystack[..span.end], span.start)
}
SearchKind::RabinKarp => {
self.rabinkarp.find_at(&haystack[..span.end], span.start)
}
}
}
/// Return an iterator of non-overlapping occurrences of the patterns in
/// this searcher, according to its match semantics, in the given haystack.
///
/// # Example
///
/// Basic usage:
///
/// ```
/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let matches: Vec<PatternID> = searcher
/// .find_iter("foobar fooba foofoo")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![
/// PatternID::must(0),
/// PatternID::must(1),
/// PatternID::must(1),
/// PatternID::must(1),
/// ], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[inline]
pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
&'a self,
haystack: &'b B,
) -> FindIter<'a, 'b> {
let haystack = haystack.as_ref();
let span = Span::from(0..haystack.len());
FindIter { searcher: self, haystack, span }
}
/// Returns the match kind used by this packed searcher.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use aho_corasick::packed::{MatchKind, Searcher};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// // leftmost-first is the default.
/// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[inline]
pub fn match_kind(&self) -> &MatchKind {
self.patterns.match_kind()
}
/// Returns the minimum length of a haystack that is required in order for
/// packed searching to be effective.
///
/// In some cases, the underlying packed searcher may not be able to search
/// very short haystacks. When that occurs, the implementation will defer
/// to a slower non-packed searcher (which is still generally faster than
/// Aho-Corasick for a small number of patterns). However, callers may
/// want to avoid ever using the slower variant, which one can do by
/// never passing a haystack shorter than the minimum length returned by
/// this method.
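    ///
    /// # Example
    ///
    /// An illustrative sketch of branching on this value (the haystack and
    /// patterns here are arbitrary):
    ///
    /// ```
    /// use aho_corasick::packed::Searcher;
    ///
    /// # fn example() -> Option<()> {
    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
    /// let haystack = "xfoo";
    /// if haystack.len() >= searcher.minimum_len() {
    ///     // Long enough for the packed searcher to run at full speed.
    ///     assert!(searcher.find(haystack).is_some());
    /// } else {
    ///     // A caller that never wants the slower fallback could run its own
    ///     // search here. (`find` still works; it just defers internally.)
    ///     assert!(searcher.find(haystack).is_some());
    /// }
    /// # Some(()) }
    /// # if cfg!(all(feature = "std", any(
    /// #     target_arch = "x86_64", target_arch = "aarch64",
    /// # ))) {
    /// #     example().unwrap()
    /// # } else {
    /// #     assert!(example().is_none());
    /// # }
    /// ```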
#[inline]
pub fn minimum_len(&self) -> usize {
self.minimum_len
}
/// Returns the approximate total amount of heap used by this searcher, in
/// units of bytes.
#[inline]
pub fn memory_usage(&self) -> usize {
self.patterns.memory_usage()
+ self.rabinkarp.memory_usage()
+ self.search_kind.memory_usage()
}
/// Use a slow (non-packed) searcher.
///
/// This is useful when a packed searcher could be constructed, but could
/// not be used to search a specific haystack. For example, if Teddy was
/// built but the haystack is smaller than ~34 bytes, then Teddy might not
/// be able to run.
fn find_in_slow(&self, haystack: &[u8], span: Span) -> Option<Match> {
self.rabinkarp.find_at(&haystack[..span.end], span.start)
}
}
impl SearchKind {
fn memory_usage(&self) -> usize {
match *self {
SearchKind::Teddy(ref ted) => ted.memory_usage(),
SearchKind::RabinKarp => 0,
}
}
}
/// An iterator over non-overlapping matches from a packed searcher.
///
/// The lifetime `'s` refers to the lifetime of the underlying [`Searcher`],
/// while the lifetime `'h` refers to the lifetime of the haystack being
/// searched.
#[derive(Debug)]
pub struct FindIter<'s, 'h> {
searcher: &'s Searcher,
haystack: &'h [u8],
span: Span,
}
impl<'s, 'h> Iterator for FindIter<'s, 'h> {
type Item = Match;
fn next(&mut self) -> Option<Match> {
if self.span.start > self.span.end {
return None;
}
match self.searcher.find_in(&self.haystack, self.span) {
None => None,
Some(m) => {
self.span.start = m.end();
Some(m)
}
}
}
}

39
vendor/aho-corasick/src/packed/ext.rs vendored Normal file

@@ -0,0 +1,39 @@
/// A trait for adding some helper routines to pointers.
pub(crate) trait Pointer {
/// Returns the distance, in units of `T`, between `self` and `origin`.
///
/// # Safety
///
    /// Same requirements as `ptr::offset_from`, in addition to `self >= origin`.
unsafe fn distance(self, origin: Self) -> usize;
/// Casts this pointer to `usize`.
///
/// Callers should not convert the `usize` back to a pointer if at all
/// possible. (And if you believe it's necessary, open an issue to discuss
/// why. Otherwise, it has the potential to violate pointer provenance.)
/// The purpose of this function is just to be able to do arithmetic, i.e.,
/// computing offsets or alignments.
fn as_usize(self) -> usize;
}
impl<T> Pointer for *const T {
unsafe fn distance(self, origin: *const T) -> usize {
// TODO: Replace with `ptr::sub_ptr` once stabilized.
usize::try_from(self.offset_from(origin)).unwrap_unchecked()
}
fn as_usize(self) -> usize {
self as usize
}
}
impl<T> Pointer for *mut T {
unsafe fn distance(self, origin: *mut T) -> usize {
(self as *const T).distance(origin as *const T)
}
fn as_usize(self) -> usize {
(self as *const T).as_usize()
}
}

120
vendor/aho-corasick/src/packed/mod.rs vendored Normal file

@@ -0,0 +1,120 @@
/*!
Provides packed multiple substring search, principally for a small number of
patterns.
This sub-module provides vectorized routines for quickly finding
matches of a small number of patterns. In general, users of this crate
shouldn't need to interface with this module directly, as the primary
[`AhoCorasick`](crate::AhoCorasick) searcher will use these routines
automatically as a prefilter when applicable. However, in some cases, callers
may want to bypass the Aho-Corasick machinery entirely and use this vectorized
searcher directly.
# Overview
The primary types in this sub-module are:
* [`Searcher`] executes the actual search algorithm to report matches in a
haystack.
* [`Builder`] accumulates patterns incrementally and can construct a
`Searcher`.
* [`Config`] permits tuning the searcher, and itself will produce a `Builder`
(which can then be used to build a `Searcher`). Currently, the only tuneable
knob is the match semantics, but this may be expanded in the future.
# Examples
This example shows how to create a searcher from an iterator of patterns.
By default, leftmost-first match semantics are used. (See the top-level
[`MatchKind`] type for more details about match semantics, which apply
similarly to packed substring search.)
```
use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
# fn example() -> Option<()> {
let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
let matches: Vec<PatternID> = searcher
.find_iter("foobar")
.map(|mat| mat.pattern())
.collect();
assert_eq!(vec![PatternID::ZERO], matches);
# Some(()) }
# if cfg!(all(feature = "std", any(
# target_arch = "x86_64", target_arch = "aarch64",
# ))) {
# example().unwrap()
# } else {
# assert!(example().is_none());
# }
```
This example shows how to use [`Config`] to change the match semantics to
leftmost-longest:
```
use aho_corasick::{packed::{Config, MatchKind}, PatternID};
# fn example() -> Option<()> {
let searcher = Config::new()
.match_kind(MatchKind::LeftmostLongest)
.builder()
.add("foo")
.add("foobar")
.build()?;
let matches: Vec<PatternID> = searcher
.find_iter("foobar")
.map(|mat| mat.pattern())
.collect();
assert_eq!(vec![PatternID::must(1)], matches);
# Some(()) }
# if cfg!(all(feature = "std", any(
# target_arch = "x86_64", target_arch = "aarch64",
# ))) {
# example().unwrap()
# } else {
# assert!(example().is_none());
# }
```
# Packed substring searching
Packed substring searching refers to the use of SIMD (Single Instruction,
Multiple Data) to accelerate the detection of matches in a haystack. Unlike
conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring
search tend to do better with a small number of patterns, whereas Aho-Corasick
generally maintains reasonably consistent performance regardless of the number
of patterns you give it. Because of this, the vectorized searcher in this
sub-module cannot be used as a general purpose searcher, since building the
searcher may fail even when given a small number of patterns. However, in
exchange, when searching for a small number of patterns, searching can be quite
a bit faster than Aho-Corasick (sometimes by an order of magnitude).
The key takeaway here is that constructing a searcher from a list of patterns
is a fallible operation with no clear rules for when it will fail. While the
precise conditions under which building a searcher can fail are an
implementation detail, here are some common reasons:
* Too many patterns were given. Typically, the limit is on the order of 100 or
so, but this limit may fluctuate based on available CPU features.
* The available packed algorithms require CPU features that aren't available.
For example, currently, this crate only provides packed algorithms for
`x86_64` and `aarch64`. Therefore, constructing a packed searcher on any
other target will always fail.
* Zero patterns were given, or one of the patterns given was empty. Packed
searchers require at least one pattern and that all patterns are non-empty.
* Something else about the nature of the patterns (typically based on
heuristics) suggests that a packed searcher would perform very poorly, so
no searcher is built.
*/
pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher};
mod api;
mod ext;
mod pattern;
mod rabinkarp;
mod teddy;
#[cfg(all(feature = "std", test))]
mod tests;
mod vector;

480
vendor/aho-corasick/src/packed/pattern.rs vendored Normal file

@@ -0,0 +1,480 @@
use core::{cmp, fmt, mem, u16, usize};
use alloc::{boxed::Box, string::String, vec, vec::Vec};
use crate::{
packed::{api::MatchKind, ext::Pointer},
PatternID,
};
/// A non-empty collection of non-empty patterns to search for.
///
/// This collection of patterns is what is passed around to both execute
/// searches and to construct the searchers themselves. Namely, this permits
/// searches to avoid copying all of the patterns, and allows us to keep only
/// one copy throughout all packed searchers.
///
/// Note that this collection is not a set. The same pattern can appear more
/// than once.
#[derive(Clone, Debug)]
pub(crate) struct Patterns {
/// The match semantics supported by this collection of patterns.
///
/// The match semantics determines the order of the iterator over patterns.
/// For leftmost-first, patterns are provided in the same order as were
/// provided by the caller. For leftmost-longest, patterns are provided in
/// descending order of length, with ties broken by the order in which they
/// were provided by the caller.
kind: MatchKind,
/// The collection of patterns, indexed by their identifier.
by_id: Vec<Vec<u8>>,
/// The order of patterns defined for iteration, given by pattern
/// identifiers. The order of `by_id` and `order` is always the same for
/// leftmost-first semantics, but may be different for leftmost-longest
/// semantics.
order: Vec<PatternID>,
/// The length of the smallest pattern, in bytes.
minimum_len: usize,
/// The total number of pattern bytes across the entire collection. This
/// is used for reporting total heap usage in constant time.
total_pattern_bytes: usize,
}
// BREADCRUMBS: I think we want to experiment with a different bucket
// representation. Basically, each bucket is just a Range<usize> to a single
// contiguous allocation? Maybe length-prefixed patterns or something? The
// idea is to try to get rid of the pointer chasing in verification. I don't
// know that that is the issue, but I suspect it is.
impl Patterns {
    /// Create a new empty collection of patterns.
    ///
    /// The ID of each pattern is the index at which it occurs in the
    /// `by_id` slice. Patterns are added with `add`, which panics if given
    /// an empty pattern, and callers must add at least one pattern before
    /// searching, since the search routines assume a non-empty collection.
pub(crate) fn new() -> Patterns {
Patterns {
kind: MatchKind::default(),
by_id: vec![],
order: vec![],
minimum_len: usize::MAX,
total_pattern_bytes: 0,
}
}
/// Add a pattern to this collection.
///
/// This panics if the pattern given is empty.
pub(crate) fn add(&mut self, bytes: &[u8]) {
assert!(!bytes.is_empty());
assert!(self.by_id.len() <= u16::MAX as usize);
let id = PatternID::new(self.by_id.len()).unwrap();
self.order.push(id);
self.by_id.push(bytes.to_vec());
self.minimum_len = cmp::min(self.minimum_len, bytes.len());
self.total_pattern_bytes += bytes.len();
}
/// Set the match kind semantics for this collection of patterns.
///
/// If the kind is not set, then the default is leftmost-first.
pub(crate) fn set_match_kind(&mut self, kind: MatchKind) {
self.kind = kind;
match self.kind {
MatchKind::LeftmostFirst => {
self.order.sort();
}
MatchKind::LeftmostLongest => {
let (order, by_id) = (&mut self.order, &mut self.by_id);
order.sort_by(|&id1, &id2| {
by_id[id1].len().cmp(&by_id[id2].len()).reverse()
});
}
}
}
/// Return the number of patterns in this collection.
///
/// This is guaranteed to be greater than zero.
pub(crate) fn len(&self) -> usize {
self.by_id.len()
}
/// Returns true if and only if this collection of patterns is empty.
pub(crate) fn is_empty(&self) -> bool {
self.len() == 0
}
/// Returns the approximate total amount of heap used by these patterns, in
/// units of bytes.
pub(crate) fn memory_usage(&self) -> usize {
self.order.len() * mem::size_of::<PatternID>()
+ self.by_id.len() * mem::size_of::<Vec<u8>>()
+ self.total_pattern_bytes
}
/// Clears all heap memory associated with this collection of patterns and
/// resets all state such that it is a valid empty collection.
pub(crate) fn reset(&mut self) {
self.kind = MatchKind::default();
self.by_id.clear();
self.order.clear();
        self.minimum_len = usize::MAX;
        // Also reset the total byte count so `memory_usage` stays accurate.
        self.total_pattern_bytes = 0;
    }
/// Returns the length, in bytes, of the smallest pattern.
///
/// This is guaranteed to be at least one.
pub(crate) fn minimum_len(&self) -> usize {
self.minimum_len
}
/// Returns the match semantics used by these patterns.
pub(crate) fn match_kind(&self) -> &MatchKind {
&self.kind
}
/// Return the pattern with the given identifier. If such a pattern does
/// not exist, then this panics.
pub(crate) fn get(&self, id: PatternID) -> Pattern<'_> {
Pattern(&self.by_id[id])
}
/// Return the pattern with the given identifier without performing bounds
/// checks.
///
/// # Safety
///
/// Callers must ensure that a pattern with the given identifier exists
/// before using this method.
pub(crate) unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> {
Pattern(self.by_id.get_unchecked(id.as_usize()))
}
/// Return an iterator over all the patterns in this collection, in the
/// order in which they should be matched.
///
/// Specifically, in a naive multi-pattern matcher, the following is
/// guaranteed to satisfy the match semantics of this collection of
/// patterns:
///
/// ```ignore
/// for i in 0..haystack.len():
/// for p in patterns.iter():
/// if haystack[i..].starts_with(p.bytes()):
/// return Match(p.id(), i, i + p.bytes().len())
/// ```
///
/// Namely, among the patterns in a collection, if they are matched in
/// the order provided by this iterator, then the result is guaranteed
/// to satisfy the correct match semantics. (Either leftmost-first or
/// leftmost-longest.)
pub(crate) fn iter(&self) -> PatternIter<'_> {
PatternIter { patterns: self, i: 0 }
}
}
/// An iterator over the patterns in the `Patterns` collection.
///
/// The order of the patterns provided by this iterator is consistent with the
/// match semantics of the originating collection of patterns.
///
/// The lifetime `'p` corresponds to the lifetime of the collection of patterns
/// this is iterating over.
#[derive(Debug)]
pub(crate) struct PatternIter<'p> {
patterns: &'p Patterns,
i: usize,
}
impl<'p> Iterator for PatternIter<'p> {
type Item = (PatternID, Pattern<'p>);
fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> {
if self.i >= self.patterns.len() {
return None;
}
let id = self.patterns.order[self.i];
let p = self.patterns.get(id);
self.i += 1;
Some((id, p))
}
}
/// A pattern that is used in packed searching.
#[derive(Clone)]
pub(crate) struct Pattern<'a>(&'a [u8]);
impl<'a> fmt::Debug for Pattern<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Pattern")
.field("lit", &String::from_utf8_lossy(&self.0))
.finish()
}
}
impl<'p> Pattern<'p> {
/// Returns the length of this pattern, in bytes.
pub(crate) fn len(&self) -> usize {
self.0.len()
}
/// Returns the bytes of this pattern.
pub(crate) fn bytes(&self) -> &[u8] {
&self.0
}
/// Returns the first `len` low nybbles from this pattern. If this pattern
/// is shorter than `len`, then this panics.
pub(crate) fn low_nybbles(&self, len: usize) -> Box<[u8]> {
let mut nybs = vec![0; len].into_boxed_slice();
for (i, byte) in self.bytes().iter().take(len).enumerate() {
nybs[i] = byte & 0xF;
}
nybs
}
/// Returns true if this pattern is a prefix of the given bytes.
#[inline(always)]
pub(crate) fn is_prefix(&self, bytes: &[u8]) -> bool {
is_prefix(bytes, self.bytes())
}
/// Returns true if this pattern is a prefix of the haystack given by the
/// raw `start` and `end` pointers.
///
/// # Safety
///
/// * It must be the case that `start < end` and that the distance between
/// them is at least equal to `V::BYTES`. That is, it must always be valid
/// to do at least an unaligned load of `V` at `start`.
/// * Both `start` and `end` must be valid for reads.
/// * Both `start` and `end` must point to an initialized value.
/// * Both `start` and `end` must point to the same allocated object and
/// must either be in bounds or at most one byte past the end of the
/// allocated object.
/// * Both `start` and `end` must be _derived from_ a pointer to the same
/// object.
/// * The distance between `start` and `end` must not overflow `isize`.
/// * The distance being in bounds must not rely on "wrapping around" the
/// address space.
#[inline(always)]
pub(crate) unsafe fn is_prefix_raw(
&self,
start: *const u8,
end: *const u8,
) -> bool {
let patlen = self.bytes().len();
let haylen = end.distance(start);
if patlen > haylen {
return false;
}
// SAFETY: We've checked that the haystack has length at least equal
// to this pattern. All other safety concerns are the responsibility
// of the caller.
is_equal_raw(start, self.bytes().as_ptr(), patlen)
}
}
/// Returns true if and only if `needle` is a prefix of `haystack`.
///
/// This uses a latency optimized variant of `memcmp` internally which *might*
/// make this faster for very short strings.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
#[inline(always)]
fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
if needle.len() > haystack.len() {
return false;
}
// SAFETY: Our pointers are derived directly from borrowed slices which
// uphold all of our safety guarantees except for length. We account for
// length with the check above.
unsafe { is_equal_raw(haystack.as_ptr(), needle.as_ptr(), needle.len()) }
}
/// Compare corresponding bytes in `x` and `y` for equality.
///
/// That is, this returns true if and only if `x.len() == y.len()` and
/// `x[i] == y[i]` for all `0 <= i < x.len()`.
///
/// Note that this isn't used. We only use it in tests as a convenient way
/// of testing `is_equal_raw`.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
///
/// # Motivation
///
/// Why not use slice equality instead? Well, slice equality usually results in
/// a call out to the current platform's `libc` which might not be inlineable
/// or have other overhead. This routine isn't guaranteed to be a win, but it
/// might be in some cases.
#[cfg(test)]
#[inline(always)]
fn is_equal(x: &[u8], y: &[u8]) -> bool {
if x.len() != y.len() {
return false;
}
// SAFETY: Our pointers are derived directly from borrowed slices which
// uphold all of our safety guarantees except for length. We account for
// length with the check above.
unsafe { is_equal_raw(x.as_ptr(), y.as_ptr(), x.len()) }
}
/// Compare `n` bytes at the given pointers for equality.
///
/// This returns true if and only if `*x.add(i) == *y.add(i)` for all
/// `0 <= i < n`.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
///
/// # Motivation
///
/// Why not use slice equality instead? Well, slice equality usually results in
/// a call out to the current platform's `libc` which might not be inlineable
/// or have other overhead. This routine isn't guaranteed to be a win, but it
/// might be in some cases.
///
/// # Safety
///
/// * Both `x` and `y` must be valid for reads of up to `n` bytes.
/// * Both `x` and `y` must point to an initialized value.
/// * Both `x` and `y` must each point to an allocated object and
/// must either be in bounds or at most one byte past the end of the
/// allocated object. `x` and `y` do not need to point to the same allocated
/// object, but they may.
/// * Both `x` and `y` must be _derived from_ a pointer to their respective
/// allocated objects.
/// * The distance between `x` and `x+n` must not overflow `isize`. Similarly
/// for `y` and `y+n`.
/// * The distance being in bounds must not rely on "wrapping around" the
/// address space.
#[inline(always)]
unsafe fn is_equal_raw(mut x: *const u8, mut y: *const u8, n: usize) -> bool {
// If we don't have enough bytes to do 4-byte at a time loads, then
// handle each possible length specially. Note that I used to have a
// byte-at-a-time loop here and that turned out to be quite a bit slower
// for the memmem/pathological/defeat-simple-vector-alphabet benchmark.
if n < 4 {
return match n {
0 => true,
1 => x.read() == y.read(),
2 => {
x.cast::<u16>().read_unaligned()
== y.cast::<u16>().read_unaligned()
}
// I also tried copy_nonoverlapping here and it looks like the
// codegen is the same.
3 => x.cast::<[u8; 3]>().read() == y.cast::<[u8; 3]>().read(),
_ => unreachable!(),
};
}
// When we have 4 or more bytes to compare, then proceed in chunks of 4 at
// a time using unaligned loads.
//
// Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
// that this particular version of memcmp is likely to be called with tiny
// needles. That means that if we do 8 byte loads, then a higher proportion
// of memcmp calls will use the slower variant above. With that said, this
// is a hypothesis and is only loosely supported by benchmarks. There's
// likely some improvement that could be made here. The main thing here
// though is to optimize for latency, not throughput.
// SAFETY: The caller is responsible for ensuring the pointers we get are
// valid and readable for at least `n` bytes. We also do unaligned loads,
// so there's no need to ensure we're aligned. (This is justified by this
// routine being specifically for short strings.)
let xend = x.add(n.wrapping_sub(4));
let yend = y.add(n.wrapping_sub(4));
while x < xend {
let vx = x.cast::<u32>().read_unaligned();
let vy = y.cast::<u32>().read_unaligned();
if vx != vy {
return false;
}
x = x.add(4);
y = y.add(4);
}
let vx = xend.cast::<u32>().read_unaligned();
let vy = yend.cast::<u32>().read_unaligned();
vx == vy
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn equals_different_lengths() {
assert!(!is_equal(b"", b"a"));
assert!(!is_equal(b"a", b""));
assert!(!is_equal(b"ab", b"a"));
assert!(!is_equal(b"a", b"ab"));
}
#[test]
fn equals_mismatch() {
let one_mismatch = [
(&b"a"[..], &b"x"[..]),
(&b"ab"[..], &b"ax"[..]),
(&b"abc"[..], &b"abx"[..]),
(&b"abcd"[..], &b"abcx"[..]),
(&b"abcde"[..], &b"abcdx"[..]),
(&b"abcdef"[..], &b"abcdex"[..]),
(&b"abcdefg"[..], &b"abcdefx"[..]),
(&b"abcdefgh"[..], &b"abcdefgx"[..]),
(&b"abcdefghi"[..], &b"abcdefghx"[..]),
(&b"abcdefghij"[..], &b"abcdefghix"[..]),
(&b"abcdefghijk"[..], &b"abcdefghijx"[..]),
(&b"abcdefghijkl"[..], &b"abcdefghijkx"[..]),
(&b"abcdefghijklm"[..], &b"abcdefghijklx"[..]),
(&b"abcdefghijklmn"[..], &b"abcdefghijklmx"[..]),
];
for (x, y) in one_mismatch {
assert_eq!(x.len(), y.len(), "lengths should match");
assert!(!is_equal(x, y));
assert!(!is_equal(y, x));
}
}
#[test]
fn equals_yes() {
assert!(is_equal(b"", b""));
assert!(is_equal(b"a", b"a"));
assert!(is_equal(b"ab", b"ab"));
assert!(is_equal(b"abc", b"abc"));
assert!(is_equal(b"abcd", b"abcd"));
assert!(is_equal(b"abcde", b"abcde"));
assert!(is_equal(b"abcdef", b"abcdef"));
assert!(is_equal(b"abcdefg", b"abcdefg"));
assert!(is_equal(b"abcdefgh", b"abcdefgh"));
assert!(is_equal(b"abcdefghi", b"abcdefghi"));
}
#[test]
fn prefix() {
assert!(is_prefix(b"", b""));
assert!(is_prefix(b"a", b""));
assert!(is_prefix(b"ab", b""));
assert!(is_prefix(b"foo", b"foo"));
assert!(is_prefix(b"foobar", b"foo"));
assert!(!is_prefix(b"foo", b"fob"));
assert!(!is_prefix(b"foobar", b"fob"));
}
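    // An illustrative addition, not from the upstream crate: a quick check
    // that leftmost-longest semantics reorder iteration by descending
    // pattern length, as documented on `Patterns::set_match_kind`.
    #[test]
    fn leftmost_longest_iteration_order() {
        let mut pats = Patterns::new();
        pats.add(b"foo");
        pats.add(b"foobar");
        pats.set_match_kind(MatchKind::LeftmostLongest);
        let lens: Vec<usize> = pats.iter().map(|(_, p)| p.len()).collect();
        assert_eq!(vec![6, 3], lens);
    }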
}

168
vendor/aho-corasick/src/packed/rabinkarp.rs vendored Normal file

@@ -0,0 +1,168 @@
use alloc::{sync::Arc, vec, vec::Vec};
use crate::{packed::pattern::Patterns, util::search::Match, PatternID};
/// The type of the rolling hash used in the Rabin-Karp algorithm.
type Hash = usize;
/// The number of buckets to store our patterns in. We don't want this to be
/// too big in order to avoid wasting memory, but we don't want it to be too
/// small either to avoid spending too much time confirming literals.
///
/// The number of buckets MUST be a power of two. Otherwise, determining the
/// bucket from a hash will slow down the code considerably. Using a power
/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
/// instruction.
const NUM_BUCKETS: usize = 64;
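// Illustrative note (not upstream code): because NUM_BUCKETS is a power of
// two, `hash % NUM_BUCKETS` is equivalent to `hash & (NUM_BUCKETS - 1)`,
// which the compiler lowers to a single `and` instruction.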
/// An implementation of the Rabin-Karp algorithm. The main idea of this
/// algorithm is to maintain a rolling hash as it moves through the input, and
/// then check whether that hash corresponds to the same hash for any of the
/// patterns we're looking for.
///
/// A drawback of naively scaling Rabin-Karp to multiple patterns is that
/// it requires all of the patterns to be the same length, which in turn
/// corresponds to the number of bytes to hash. We adapt this to work for
/// multiple patterns of varying size by fixing the number of bytes to hash
/// to be the length of the smallest pattern. We also split the patterns into
/// several buckets to hopefully make the confirmation step faster.
///
/// Wikipedia has a decent explanation, if a bit heavy on the theory:
/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
///
/// But ESMAJ provides something a bit more concrete:
/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html
#[derive(Clone, Debug)]
pub(crate) struct RabinKarp {
/// The patterns we're searching for.
patterns: Arc<Patterns>,
/// The order of patterns in each bucket is significant. Namely, they are
/// arranged such that the first one to match is the correct match. This
/// may not necessarily correspond to the order provided by the caller.
/// For example, if leftmost-longest semantics are used, then the patterns
/// are sorted by their length in descending order. If leftmost-first
/// semantics are used, then the patterns are sorted by their pattern ID
/// in ascending order (which corresponds to the caller's order).
buckets: Vec<Vec<(Hash, PatternID)>>,
/// The length of the hashing window. Generally, this corresponds to the
/// length of the smallest pattern.
hash_len: usize,
/// The factor to subtract out of a hash before updating it with a new
/// byte.
hash_2pow: usize,
}
impl RabinKarp {
/// Compile a new Rabin-Karp matcher from the patterns given.
///
/// This panics if any of the patterns in the collection are empty, or if
/// the collection is itself empty.
pub(crate) fn new(patterns: &Arc<Patterns>) -> RabinKarp {
assert!(patterns.len() >= 1);
let hash_len = patterns.minimum_len();
assert!(hash_len >= 1);
let mut hash_2pow = 1usize;
for _ in 1..hash_len {
hash_2pow = hash_2pow.wrapping_shl(1);
}
let mut rk = RabinKarp {
patterns: Arc::clone(patterns),
buckets: vec![vec![]; NUM_BUCKETS],
hash_len,
hash_2pow,
};
for (id, pat) in patterns.iter() {
let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
let bucket = hash % NUM_BUCKETS;
rk.buckets[bucket].push((hash, id));
}
rk
}
    /// Return the first matching pattern in the given haystack, beginning the
/// search at `at`.
pub(crate) fn find_at(
&self,
haystack: &[u8],
mut at: usize,
) -> Option<Match> {
assert_eq!(NUM_BUCKETS, self.buckets.len());
if at + self.hash_len > haystack.len() {
return None;
}
let mut hash = self.hash(&haystack[at..at + self.hash_len]);
loop {
let bucket = &self.buckets[hash % NUM_BUCKETS];
for &(phash, pid) in bucket {
if phash == hash {
if let Some(c) = self.verify(pid, haystack, at) {
return Some(c);
}
}
}
if at + self.hash_len >= haystack.len() {
return None;
}
hash = self.update_hash(
hash,
haystack[at],
haystack[at + self.hash_len],
);
at += 1;
}
}
/// Returns the approximate total amount of heap used by this searcher, in
/// units of bytes.
pub(crate) fn memory_usage(&self) -> usize {
self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>()
+ self.patterns.len() * core::mem::size_of::<(Hash, PatternID)>()
}
/// Verify whether the pattern with the given id matches at
/// `haystack[at..]`.
///
/// We tag this function as `cold` because it helps improve codegen.
/// Intuitively, it would seem like inlining it would be better. However,
    /// the only time this is called and a match is not found is when there
    /// is a hash collision, or when a prefix of a pattern matches but
/// the entire pattern doesn't match. This is hopefully fairly rare, and
/// if it does occur a lot, it's going to be slow no matter what we do.
#[cold]
fn verify(
&self,
id: PatternID,
haystack: &[u8],
at: usize,
) -> Option<Match> {
let pat = self.patterns.get(id);
if pat.is_prefix(&haystack[at..]) {
Some(Match::new(id, at..at + pat.len()))
} else {
None
}
}
/// Hash the given bytes.
fn hash(&self, bytes: &[u8]) -> Hash {
assert_eq!(self.hash_len, bytes.len());
let mut hash = 0usize;
for &b in bytes {
hash = hash.wrapping_shl(1).wrapping_add(b as usize);
}
hash
}
/// Update the hash given based on removing `old_byte` at the beginning
/// of some byte string, and appending `new_byte` to the end of that same
/// byte string.
fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
.wrapping_shl(1)
.wrapping_add(new_byte as usize)
}
}
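// A minimal sanity check of the rolling hash identity used above: removing
// the old byte and appending the new byte must agree with hashing the new
// window from scratch. (This test module is an illustrative addition, not
// part of the upstream crate.)
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn rolling_hash_matches_fresh_hash() {
        // Both patterns have length 2, so the hash window is 2 bytes and
        // `hash_2pow` is 2.
        let mut pats = Patterns::new();
        pats.add(b"ab");
        pats.add(b"xy");
        let rk = RabinKarp::new(&Arc::new(pats));
        // Slide the window of "abc" from "ab" to "bc".
        let rolled = rk.update_hash(rk.hash(b"ab"), b'a', b'c');
        assert_eq!(rk.hash(b"bc"), rolled);
    }
}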

386
vendor/aho-corasick/src/packed/teddy/README.md vendored Normal file

@@ -0,0 +1,386 @@
Teddy is a SIMD accelerated multiple substring matching algorithm. The name
and the core ideas in the algorithm were learned from the [Hyperscan][1_u]
project. The implementation in this repository was mostly motivated for use in
accelerating regex searches by searching for small sets of required literals
extracted from the regex.
# Background
The key idea of Teddy is to do *packed* substring matching. In the literature,
packed substring matching is the idea of examining multiple bytes in a haystack
at a time to detect matches. Implementations of, for example, memchr (which
detects matches of a single byte) have been doing this for years. Only
recently, with the introduction of various SIMD instructions, has this been
extended to substring matching. The PCMPESTRI instruction (and its relatives),
for example, implements substring matching in hardware. It is, however, limited
to substrings of length 16 bytes or fewer, but this restriction is fine in a
regex engine, since we rarely care about the performance difference between
searching for a 16 byte literal and a 16 + N literal; 16 is already long
enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs
at least, is its latency and throughput. As a result, it is often faster to
do substring search with a Boyer-Moore (or Two-Way) variant and a well placed
memchr to quickly skip through the haystack.
There are fewer results from the literature on packed substring matching,
and even fewer for packed multiple substring matching. Ben-Kiki et al. [2]
describes use of PCMPESTRI for substring matching, but is mostly theoretical
and hand-waves performance. There is other theoretical work done by Bille [3]
as well.
The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci
and is generally focused on multiple pattern search. Their first paper [4a]
introduces the concept of a fingerprint, which is computed for every block of
N bytes in every pattern. The haystack is then scanned N bytes at a time and
a fingerprint is computed in the same way it was computed for blocks in the
patterns. If the fingerprint corresponds to one that was found in a pattern,
then a verification step follows to confirm that one of the substrings with the
corresponding fingerprint actually matches at the current location. Various
implementation tricks are employed to make sure the fingerprint lookup is fast;
typically by truncating the fingerprint. (This may, of course, provoke more
steps in the verification process, so a balance must be struck.)
The main downside of [4a] is that the minimum substring length is 32 bytes,
presumably because of how the algorithm uses certain SIMD instructions. This
essentially makes it useless for general purpose regex matching, where a small
number of short patterns is far more likely.
Faro and Kulekci published another paper [4b] that is conceptually very similar
to [4a]. The key difference is that it uses the CRC32 instruction (introduced
as part of SSE 4.2) to compute fingerprint values. This also enables the
algorithm to work effectively on substrings as short as 7 bytes with 4 byte
windows. 7 bytes is unfortunately still too long. The window could technically
be shrunk to 2 bytes, thereby reducing the minimum length to 3, but such a
small window ends up negating most of the performance benefits, and short
patterns are likely the common case in a general purpose regex engine.
Faro and Kulekci also published [4c], which appears to be intended as a
replacement for using PCMPESTRI. In particular, it is specifically motivated by
the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD
instructions that are faster. While this approach works for short substrings,
I personally couldn't see a way to generalize it to multiple substring search.
Faro and Kulekci have another paper [4d] that I haven't been able to read
because it is behind a paywall.
# Teddy
Finally, we get to Teddy. If the above literature review is complete, then it
appears that Teddy is a novel algorithm. More than that, in my experience, it
completely blows away the competition for short substrings, which is exactly
what we want in a general purpose regex engine. Again, the algorithm appears
to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced
late 2015, and no earlier history could be found. Therefore, tracking the exact
provenance of the algorithm with respect to the published literature seems
difficult.
At a high level, Teddy works somewhat similarly to the fingerprint algorithms
published by Faro and Kulekci, but Teddy does it in a way that scales a bit
better. Namely:
1. Teddy's core algorithm scans the haystack in 16 (for SSE, or 32 for AVX)
byte chunks. 16 (or 32) is significant because it corresponds to the number
of bytes in a SIMD vector.
2. Bitwise operations are performed on each chunk to discover if any region of
it matches a set of precomputed fingerprints from the patterns. If there are
matches, then a verification step is performed. In this implementation, our
verification step is naive. This can be improved upon.
The details to make this work are quite clever. First, we must choose how to
pick our fingerprints. In Hyperscan's implementation, I *believe* they use the
last N bytes of each substring, where N must be at least the minimum length of
any substring in the set being searched. In this implementation, we use the
first N bytes of each substring. (The tradeoffs between these choices aren't
yet clear to me.) We then must figure out how to quickly test whether an
occurrence of any fingerprint from the set of patterns appears in a 16 byte
block from the haystack. To keep things simple, let's assume N = 1 and examine
some examples to motivate the approach. Here are our patterns:
```ignore
foo
bar
baz
```
The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set
our 16 byte block to:
```ignore
bat cat foo bump
xxxxxxxxxxxxxxxx
```
To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates
a mask that allows us to quickly compute membership of a fingerprint in a 16
byte block that also tells which pattern the fingerprint corresponds to. In
this case, our fingerprint is a single byte, so an appropriate abstraction is
a map from a single byte to a list of patterns that contain that fingerprint:
```ignore
f |--> foo
b |--> bar, baz
```
Now, all we need to do is figure out how to represent this map in vector space
and use normal SIMD operations to perform a lookup. The first simplification
we can make is to represent our patterns as bit fields occupying a single
byte. This is important, because a single SIMD vector can store 16 bytes.
```ignore
f |--> 00000001
b |--> 00000010, 00000100
```
How do we perform lookup though? It turns out that SSSE3 introduced a very cool
instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`,
and returns a third vector `C`. All vectors are treated as 16 8-bit integers.
`C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true
for the purposes of this algorithm. For full details, see [Intel's Intrinsics
Guide][5_u].) This essentially lets us use the values in `B` to lookup values
in `A`.
If we could somehow cause `B` to contain our 16 byte block from the haystack,
and if `A` could contain our bitmasks, then we'd end up with something like
this for `A`:
```ignore
    0x00 0x01 ...     0x62 ...     0x66 ... 0xFF
A =    0    0     00000110     00000001       0
```
And if `B` contains our window from our haystack, we could use shuffle to take
the values from `B` and use them to look up our bitsets in `A`. But of course,
we can't do this because `A` in the above example contains 256 bytes, which
is much larger than the size of a SIMD vector.
Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of
our bitsets, we can use two masks, where one mask corresponds to the lower four
bits of our fingerprint and the other mask corresponds to the upper four bits.
So our map now looks like:
```ignore
'f' & 0xF = 0x6 |--> 00000001
'f' >> 4 = 0x6 |--> 00000111
'b' & 0xF = 0x2 |--> 00000110
'b' >> 4 = 0x6 |--> 00000111
```
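As a quick, standalone check of the nybble arithmetic above (this snippet is
illustrative and not part of the crate):
```rust
fn main() {
    // 'f' is 0x66 and 'b' is 0x62, so their nybbles are:
    assert_eq!(b'f' & 0xF, 0x6); // lower nybble of 'f'
    assert_eq!(b'f' >> 4, 0x6); // upper nybble of 'f'
    assert_eq!(b'b' & 0xF, 0x2); // lower nybble of 'b'
    assert_eq!(b'b' >> 4, 0x6); // upper nybble of 'b'
}
```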
Notice that the bitsets for each nybble correspond to the union of all
fingerprints that contain that nybble. For example, both `f` and `b` have the
same upper 4 bits but differ on the lower 4 bits. Putting this together, we
have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is
our mask for the upper nybble and `B` is our 16 byte block from the haystack:
```ignore
     0x00  0x01  0x02      0x03  ...  0x06      ...  0xF
A0 =    0     0  00000110     0       00000001       0
A1 =    0     0  0            0       00000111       0
B  =    b     a  t            _  ...  t              p
B  = 0x62  0x61  0x74      0x20       0x74           0x70
```
But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits,
and we need indexes that are at most 4 bits (corresponding to one of 16
values). We can apply the same transformation to split `B` into lower and upper
nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and
`B1` corresponds to the upper nybbles:
```ignore
        b    a    t    _    c    a    t    _    f    o    o    _    b    u    m    p
B0 =  0x2  0x1  0x4  0x0  0x3  0x1  0x4  0x0  0x6  0xF  0xF  0x0  0x2  0x5  0xD  0x0
B1 =  0x6  0x6  0x7  0x2  0x6  0x6  0x7  0x2  0x6  0x6  0x6  0x2  0x6  0x7  0x6  0x7
```
And now we have a nice correspondence. `B0` can index `A0` and `B1` can index
`A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`:
```ignore
     b         a         ...  f         o         ...  p
     A0[0x2]   A0[0x1]        A0[0x6]   A0[0xF]        A0[0x0]
C0 = 00000110  0              00000001  0              0
```
And `C1 = PSHUFB(A1, B1)`:
```ignore
     b         a         ...  f         o         ...  p
     A1[0x6]   A1[0x6]        A1[0x6]   A1[0x6]        A1[0x7]
C1 = 00000111  00000111       00000111  00000111       0
```
Notice how neither one of `C0` or `C1` is guaranteed to report fully correct
results all on its own. For example, `C1` claims that `b` is a fingerprint for
the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint
for all of our patterns. But if we combined `C0` and `C1` with an `AND`
operation:
```ignore
    b         a         ...  f         o         ...  p
C = 00000110  0              00000001  0              0
```
Then we now have that `C[i]` contains a bitset corresponding to the matching
fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that
block.
Once we have that, we can look for the position of the least significant bit
in `C`. (Least significant because we only target little endian here. Thus,
the least significant bytes correspond to bytes in our haystack at a lower
address.) That position, modulo `8`, gives us the pattern that the fingerprint
matches. That position, integer divided by `8`, also gives us the byte offset
that the fingerprint occurs in inside the 16 byte haystack block. Using those
two pieces of information, we can run a verification procedure that tries
to match all substrings containing that fingerprint at that position in the
haystack.
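As a sketch of that candidate decoding (illustrative Rust, not the crate's
actual implementation, which works on SIMD vectors):
```rust
fn main() {
    // Pretend C is the combined bitset for a 16 byte block, viewed as a
    // little-endian 128-bit integer, with a candidate bitset at byte 2.
    let c: u128 = 0b00000110 << (2 * 8);
    let bit = c.trailing_zeros() as usize;
    let offset = bit / 8; // byte offset of the candidate in the block
    let pattern = bit % 8; // which bit (pattern/bucket) in the bitset matched
    assert_eq!((2, 1), (offset, pattern));
}
```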
# Implementation notes
The problem with the algorithm as described above is that it uses a single byte
for a fingerprint. This will work well if the fingerprints are rare in the
haystack (e.g., capital letters or special characters in normal English text),
but if the fingerprints are common, you'll wind up spending too much time in
the verification step, which effectively negates the performance benefits of
scanning 16 bytes at a time. Remember, the key to the performance of this
algorithm is to do as little work as possible per 16 (or 32) bytes.
This algorithm can be extrapolated in a relatively straightforward way to use
larger fingerprints. That is, instead of a single byte prefix, we might use a
two, three or four byte prefix. The implementation here implements N = {1, 2, 3, 4}
and always picks the largest N possible. The rationale is that the bigger the
fingerprint, the fewer verification steps we'll do. Of course, if N is too
large, then we'll end up doing too much on each step.
The way to extend it is:
1. Add a mask for each byte in the fingerprint. (Remember that each mask is
composed of two SIMD vectors.) This results in a value of `C` for each byte
in the fingerprint while searching.
2. When testing each 16 (or 32) byte block, each value of `C` must be shifted
so that they are aligned. Once aligned, they should all be `AND`'d together.
This will give you only the bitsets corresponding to the full match of the
fingerprint. To do this, one needs to save the last byte (for N=2) or last
two bytes (for N=3) from the previous iteration, and then line them up with
the first one or two bytes of the next iteration, as sketched below.
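The sketch: a 64-bit integer stands in for a SIMD vector (illustrative only;
the real code shifts whole vectors):
```rust
fn main() {
    // One byte of bitset per haystack offset, little-endian integer view.
    // `c0` has candidates for the first fingerprint byte at offset 3, and
    // `c1` has candidates for the second fingerprint byte at offset 4.
    let c0: u64 = 0xFF << (3 * 8);
    let c1: u64 = 0xFF << (4 * 8);
    // Shift `c1` down one byte so both refer to the fingerprint's starting
    // offset, then AND them: only a full 2-byte fingerprint match survives.
    let c = c0 & (c1 >> 8);
    assert_eq!(0xFF << (3 * 8), c);
}
```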
## Verification
Verification generally follows the procedure outlined above. The tricky parts
are in the right formulation of operations to get our bits out of our vectors.
We have a limited set of operations available to us on SIMD vectors as 128-bit
or 256-bit numbers, so we wind up needing to rip out 2 (or 4) 64-bit integers
from our vectors, and then run our verification step on each of those. The
verification step looks at the least significant bit set, and from its
position, we can derive the byte offset and bucket. (Again, as described
above.) Once we know the bucket, we do a fairly naive exhaustive search for
every literal in that bucket. (Hyperscan is a bit smarter here and uses a hash
table, but I haven't had time to thoroughly explore that. A few initial
half-hearted attempts resulted in worse performance.)
## AVX
The AVX version of Teddy extrapolates almost perfectly from the SSE version.
The only hiccup is that PALIGNR is used to align chunks in the 128-bit version,
and there is no equivalent instruction in AVX. AVX does have VPALIGNR, but it
only works within 128-bit lanes. So there's a bit of tomfoolery to get around
this by shuffling the vectors before calling VPALIGNR.
The only other aspect to AVX is that since our masks are still fundamentally
16-bytes (0x0-0xF), they are duplicated to 32-bytes, so that they can apply to
32-byte chunks.
## Fat Teddy
In the version of Teddy described above, 8 buckets are used to group patterns
that we want to search for. However, when AVX is available, we can extend the
number of buckets to 16 by permitting each byte in our masks to use 16-bits
instead of 8-bits to represent the buckets it belongs to. (This variant is also
in Hyperscan.) However, what we give up is the ability to scan 32 bytes at a
time, even though we're using AVX. Instead, we have to scan 16 bytes at a time.
What we gain, though, is (hopefully) less work in our verification routine.
If patterns are spread out across more buckets, then there should be fewer
false positives overall. In general, Fat Teddy permits us to grow our capacity
a bit and search for more literals before Teddy gets overwhelmed.
The tricky part of Fat Teddy is in how we adjust our masks and our verification
procedure. For the masks, we simply represent the first 8 buckets in each of
the low 16 bytes, and then the second 8 buckets in each of the high 16 bytes.
Then, in the search loop, instead of loading 32 bytes from the haystack, we
load the same 16 bytes from the haystack into both the low and high 16 byte
portions of our 256-bit vector. So for example, a mask might look like this:
    bits:    00100001 00000000 ... 11000000 00000000 00000001 ... 00000000
    byte:          31       30           16       15       14            0
    offset:        15       14            0       15       14            0
    buckets:     8-15     8-15         8-15      0-7      0-7          0-7
Where `byte` is the position in the vector (higher numbers corresponding to
more significant bits), `offset` is the corresponding position in the haystack
chunk, and `buckets` corresponds to the bucket assignments for that particular
byte.
In particular, notice that the bucket assignments for offset `0` are spread
out between bytes `0` and `16`. This works well for the chunk-by-chunk search
procedure, but verification really wants to process all bucket assignments for
each offset at once. Otherwise, we might wind up finding a match at offset
`1` in one of the first 8 buckets, when we really should have reported a match
at offset `0` in one of the second 8 buckets. (Because we want the leftmost
match.)
Thus, for verification, we rearrange the above vector such that it is a
sequence of 16-bit integers, where the least significant 16-bit integer
corresponds to all of the bucket assignments for offset `0`. So with the
above vector, the least significant 16-bit integer would be
11000000 00000000
which was taken from bytes `16` and `0`. Then the verification step pretty much
runs as described, except with 16 buckets instead of 8.
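A tiny sketch of that rearrangement for offset `0` (illustrative only):
```rust
fn main() {
    // For offset 0, buckets 0-7 live in byte 0 of the vector and buckets
    // 8-15 live in byte 16. Verification glues them into a single u16.
    let low: u8 = 0b0000_0000; // byte 0: no candidates in buckets 0-7
    let high: u8 = 0b1100_0000; // byte 16: candidates in buckets 14 and 15
    let v = (u16::from(high) << 8) | u16::from(low);
    assert_eq!(0b11000000_00000000, v);
}
```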
# References
- **[1]** [Hyperscan on GitHub](https://github.com/intel/hyperscan),
[webpage](https://www.hyperscan.io/)
- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R.,
& Weimann, O. (2011).
_Optimal packed string matching_.
In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13).
Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik.
DOI: 10.4230/LIPIcs.FSTTCS.2011.423.
[PDF](https://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf).
- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R.,
& Weimann, O. (2014).
_Towards optimal packed string matching_.
Theoretical Computer Science, 525, 111-129.
DOI: 10.1016/j.tcs.2013.06.013.
[PDF](https://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf).
- **[3]** Bille, P. (2011).
_Fast searching in packed strings_.
Journal of Discrete Algorithms, 9(1), 49-56.
DOI: 10.1016/j.jda.2010.09.003.
[PDF](https://www.sciencedirect.com/science/article/pii/S1570866710000353).
- **[4a]** Faro, S., & Külekci, M. O. (2012, October).
_Fast multiple string matching using streaming SIMD extensions technology_.
In String Processing and Information Retrieval (pp. 217-228).
Springer Berlin Heidelberg.
DOI: 10.1007/978-3-642-34109-0_23.
[PDF](https://www.dmi.unict.it/faro/papers/conference/faro32.pdf).
- **[4b]** Faro, S., & Külekci, M. O. (2013, September).
_Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_.
In Stringology (pp. 78-91).
[PDF](https://www.dmi.unict.it/faro/papers/conference/faro36.pdf).
- **[4c]** Faro, S., & Külekci, M. O. (2013, January).
_Fast packed string matching for short patterns_.
  In Proceedings of the Meeting on Algorithm Engineering & Experiments
(pp. 113-121).
Society for Industrial and Applied Mathematics.
[PDF](https://arxiv.org/pdf/1209.6449.pdf).
- **[4d]** Faro, S., & Külekci, M. O. (2014).
_Fast and flexible packed string matching_.
Journal of Discrete Algorithms, 28, 61-72.
DOI: 10.1016/j.jda.2014.07.003.
[1_u]: https://github.com/intel/hyperscan
[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide

792
vendor/aho-corasick/src/packed/teddy/builder.rs vendored Normal file

@@ -0,0 +1,792 @@
use core::{
fmt::Debug,
panic::{RefUnwindSafe, UnwindSafe},
};
use alloc::sync::Arc;
use crate::packed::{ext::Pointer, pattern::Patterns, teddy::generic::Match};
/// A builder for constructing a Teddy matcher.
///
/// The builder primarily permits fine grained configuration of the Teddy
/// matcher. Most options are made only available for testing/benchmarking
/// purposes. In reality, options are automatically determined by the nature
/// and number of patterns given to the builder.
#[derive(Clone, Debug)]
pub(crate) struct Builder {
/// When none, this is automatically determined. Otherwise, `false` means
/// slim Teddy is used (8 buckets) and `true` means fat Teddy is used
/// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't
/// available and Fat Teddy was requested, no matcher will be built.
only_fat: Option<bool>,
/// When none, this is automatically determined. Otherwise, `false` means
    /// that 128-bit vectors will be used (up to SSSE3 instructions) whereas
/// `true` means that 256-bit vectors will be used. As with `fat`, if
/// 256-bit vectors are requested and they aren't available, then a
/// searcher will not be built.
only_256bit: Option<bool>,
/// When true (the default), the number of patterns will be used as a
/// heuristic for refusing construction of a Teddy searcher. The point here
/// is that too many patterns can overwhelm Teddy. But this can be disabled
/// in cases where the caller knows better.
heuristic_pattern_limits: bool,
}
impl Default for Builder {
fn default() -> Builder {
Builder::new()
}
}
impl Builder {
/// Create a new builder for configuring a Teddy matcher.
pub(crate) fn new() -> Builder {
Builder {
only_fat: None,
only_256bit: None,
heuristic_pattern_limits: true,
}
}
/// Build a matcher for the set of patterns given. If a matcher could not
/// be built, then `None` is returned.
///
/// Generally, a matcher isn't built if the necessary CPU features aren't
    /// available, if the target is unsupported, or if the searcher is
    /// believed to be slower than standard techniques (i.e., if there are
    /// too many literals).
pub(crate) fn build(&self, patterns: Arc<Patterns>) -> Option<Searcher> {
self.build_imp(patterns)
}
/// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses
    /// 16 buckets whereas Slim Teddy uses 8 buckets. More buckets are useful
/// for a larger set of literals.
///
/// `None` is the default, which results in an automatic selection based
/// on the number of literals and available CPU features.
pub(crate) fn only_fat(&mut self, yes: Option<bool>) -> &mut Builder {
self.only_fat = yes;
self
}
/// Request the use of 256-bit vectors (true) or 128-bit vectors (false).
/// Generally, a larger vector size is better since it either permits
/// matching more patterns or matching more bytes in the haystack at once.
///
/// `None` is the default, which results in an automatic selection based on
/// the number of literals and available CPU features.
pub(crate) fn only_256bit(&mut self, yes: Option<bool>) -> &mut Builder {
self.only_256bit = yes;
self
}
/// Request that heuristic limitations on the number of patterns be
    /// employed. This is useful to disable for benchmarking, where one wants
    /// to explore how Teddy performs on a large number of patterns even if the
/// heuristics would otherwise refuse construction.
///
/// This is enabled by default.
pub(crate) fn heuristic_pattern_limits(
&mut self,
yes: bool,
) -> &mut Builder {
self.heuristic_pattern_limits = yes;
self
}
fn build_imp(&self, patterns: Arc<Patterns>) -> Option<Searcher> {
let patlimit = self.heuristic_pattern_limits;
// There's no particular reason why we limit ourselves to little endian
// here, but it seems likely that some parts of Teddy as they are
// currently written (e.g., the uses of `trailing_zeros`) are likely
// wrong on non-little-endian targets. Such things are likely easy to
// fix, but at the time of writing (2023/09/18), I actually do not know
// how to test this code on a big-endian target. So for now, we're
// conservative and just bail out.
if !cfg!(target_endian = "little") {
debug!("skipping Teddy because target isn't little endian");
return None;
}
// Too many patterns will overwhelm Teddy and likely lead to slow
// downs, typically in the verification step.
if patlimit && patterns.len() > 64 {
debug!("skipping Teddy because of too many patterns");
return None;
}
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
{
use self::x86_64::{FatAVX2, SlimAVX2, SlimSSSE3};
let mask_len = core::cmp::min(4, patterns.minimum_len());
let beefy = patterns.len() > 32;
let has_avx2 = self::x86_64::is_available_avx2();
let has_ssse3 = has_avx2 || self::x86_64::is_available_ssse3();
let use_avx2 = if self.only_256bit == Some(true) {
if !has_avx2 {
debug!(
"skipping Teddy because avx2 was demanded but unavailable"
);
return None;
}
true
} else if self.only_256bit == Some(false) {
if !has_ssse3 {
debug!(
"skipping Teddy because ssse3 was demanded but unavailable"
);
return None;
}
false
} else if !has_ssse3 && !has_avx2 {
debug!(
"skipping Teddy because ssse3 and avx2 are unavailable"
);
return None;
} else {
has_avx2
};
let fat = match self.only_fat {
None => use_avx2 && beefy,
Some(false) => false,
Some(true) if !use_avx2 => {
debug!(
"skipping Teddy because fat was demanded, but fat \
Teddy requires avx2 which is unavailable"
);
return None;
}
Some(true) => true,
};
// Just like for aarch64, it's possible that too many patterns will
            // overwhelm Teddy. Unlike aarch64 though, we have Fat Teddy which
// helps things scale a bit more by spreading patterns over more
// buckets.
//
// These thresholds were determined by looking at the measurements
// for the rust/aho-corasick/packed/leftmost-first and
// rust/aho-corasick/dfa/leftmost-first engines on the `teddy/`
// benchmarks.
if patlimit && mask_len == 1 && patterns.len() > 16 {
debug!(
"skipping Teddy (mask len: 1) because there are \
too many patterns",
);
return None;
}
match (mask_len, use_avx2, fat) {
(1, false, _) => {
debug!("Teddy choice: 128-bit slim, 1 byte");
SlimSSSE3::<1>::new(&patterns)
}
(1, true, false) => {
debug!("Teddy choice: 256-bit slim, 1 byte");
SlimAVX2::<1>::new(&patterns)
}
(1, true, true) => {
debug!("Teddy choice: 256-bit fat, 1 byte");
FatAVX2::<1>::new(&patterns)
}
(2, false, _) => {
debug!("Teddy choice: 128-bit slim, 2 bytes");
SlimSSSE3::<2>::new(&patterns)
}
(2, true, false) => {
debug!("Teddy choice: 256-bit slim, 2 bytes");
SlimAVX2::<2>::new(&patterns)
}
(2, true, true) => {
debug!("Teddy choice: 256-bit fat, 2 bytes");
FatAVX2::<2>::new(&patterns)
}
(3, false, _) => {
debug!("Teddy choice: 128-bit slim, 3 bytes");
SlimSSSE3::<3>::new(&patterns)
}
(3, true, false) => {
debug!("Teddy choice: 256-bit slim, 3 bytes");
SlimAVX2::<3>::new(&patterns)
}
(3, true, true) => {
debug!("Teddy choice: 256-bit fat, 3 bytes");
FatAVX2::<3>::new(&patterns)
}
(4, false, _) => {
debug!("Teddy choice: 128-bit slim, 4 bytes");
SlimSSSE3::<4>::new(&patterns)
}
(4, true, false) => {
debug!("Teddy choice: 256-bit slim, 4 bytes");
SlimAVX2::<4>::new(&patterns)
}
(4, true, true) => {
debug!("Teddy choice: 256-bit fat, 4 bytes");
FatAVX2::<4>::new(&patterns)
}
_ => {
debug!("no supported Teddy configuration found");
None
}
}
}
#[cfg(all(
target_arch = "aarch64",
target_feature = "neon",
target_endian = "little"
))]
{
use self::aarch64::SlimNeon;
let mask_len = core::cmp::min(4, patterns.minimum_len());
if self.only_256bit == Some(true) {
debug!(
"skipping Teddy because 256-bits were demanded \
but unavailable"
);
return None;
}
if self.only_fat == Some(true) {
debug!(
"skipping Teddy because fat was demanded but unavailable"
                );
                return None;
            }
// Since we don't have Fat teddy in aarch64 (I think we'd want at
// least 256-bit vectors for that), we need to be careful not to
// allow too many patterns as it might overwhelm Teddy. Generally
// speaking, as the mask length goes up, the more patterns we can
// handle because the mask length results in fewer candidates
// generated.
//
// These thresholds were determined by looking at the measurements
// for the rust/aho-corasick/packed/leftmost-first and
// rust/aho-corasick/dfa/leftmost-first engines on the `teddy/`
// benchmarks.
match mask_len {
1 => {
                if patlimit && patterns.len() > 16 {
                    debug!(
                        "skipping Teddy (mask len: 1) because there are \
                         too many patterns",
                    );
                    return None;
                }
debug!("Teddy choice: 128-bit slim, 1 byte");
SlimNeon::<1>::new(&patterns)
}
2 => {
                if patlimit && patterns.len() > 32 {
                    debug!(
                        "skipping Teddy (mask len: 2) because there are \
                         too many patterns",
                    );
                    return None;
                }
debug!("Teddy choice: 128-bit slim, 2 bytes");
SlimNeon::<2>::new(&patterns)
}
3 => {
                if patlimit && patterns.len() > 48 {
                    debug!(
                        "skipping Teddy (mask len: 3) because there are \
                         too many patterns",
                    );
                    return None;
                }
debug!("Teddy choice: 128-bit slim, 3 bytes");
SlimNeon::<3>::new(&patterns)
}
4 => {
debug!("Teddy choice: 128-bit slim, 4 bytes");
SlimNeon::<4>::new(&patterns)
}
_ => {
debug!("no supported Teddy configuration found");
None
}
}
}
#[cfg(not(any(
all(target_arch = "x86_64", target_feature = "sse2"),
all(
target_arch = "aarch64",
target_feature = "neon",
target_endian = "little"
)
)))]
{
None
}
}
}
/// A searcher that dispatches to one of several possible Teddy variants.
#[derive(Clone, Debug)]
pub(crate) struct Searcher {
/// The Teddy variant we use. We use dynamic dispatch under the theory that
    /// it results in better codegen than an enum, although this is a specious
/// claim.
///
/// This `Searcher` is essentially a wrapper for a `SearcherT` trait
/// object. We just make `memory_usage` and `minimum_len` available without
/// going through dynamic dispatch.
imp: Arc<dyn SearcherT>,
/// Total heap memory used by the Teddy variant.
memory_usage: usize,
/// The minimum haystack length this searcher can handle. It is intended
/// for callers to use some other search routine (such as Rabin-Karp) in
    /// cases where the haystack (or remainder of the haystack) is too short.
minimum_len: usize,
}
impl Searcher {
/// Look for the leftmost occurrence of any pattern in this search in the
/// given haystack starting at the given position.
///
/// # Panics
///
/// This panics when `haystack[at..].len()` is less than the minimum length
/// for this haystack.
#[inline(always)]
pub(crate) fn find(
&self,
haystack: &[u8],
at: usize,
) -> Option<crate::Match> {
// SAFETY: The Teddy implementations all require a minimum haystack
// length, and this is required for safety. Therefore, we assert it
// here in order to make this method sound.
assert!(haystack[at..].len() >= self.minimum_len);
let hayptr = haystack.as_ptr();
// SAFETY: Construction of the searcher guarantees that we are able
// to run it in the current environment (i.e., we won't get an AVX2
        // searcher on an x86-64 CPU without AVX2 support). Also, the pointers
// are valid as they are derived directly from a borrowed slice.
let teddym = unsafe {
self.imp.find(hayptr.add(at), hayptr.add(haystack.len()))?
};
let start = teddym.start().as_usize().wrapping_sub(hayptr.as_usize());
let end = teddym.end().as_usize().wrapping_sub(hayptr.as_usize());
let span = crate::Span { start, end };
// OK because we won't permit the construction of a searcher that
// could report a pattern ID bigger than what can fit in the crate-wide
// PatternID type.
let pid = crate::PatternID::new_unchecked(teddym.pattern().as_usize());
let m = crate::Match::new(pid, span);
Some(m)
}
/// Returns the approximate total amount of heap used by this type, in
/// units of bytes.
#[inline(always)]
pub(crate) fn memory_usage(&self) -> usize {
self.memory_usage
}
/// Returns the minimum length, in bytes, that a haystack must be in order
/// to use it with this searcher.
#[inline(always)]
pub(crate) fn minimum_len(&self) -> usize {
self.minimum_len
}
}
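// An illustrative sketch (not part of the vendored source) of how a caller
// is expected to honor `minimum_len` before calling `find`, since `find`
// panics on haystacks shorter than the minimum. `rabin_karp_find_at` is a
// hypothetical fallback routine:
//
//     if haystack[at..].len() >= teddy.minimum_len() {
//         teddy.find(haystack, at)
//     } else {
//         rabin_karp_find_at(haystack, at)
//     }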
/// A trait that provides dynamic dispatch over the different possible Teddy
/// variants on the same algorithm.
///
/// On `x86_64` for example, it isn't known until runtime which of 12 possible
/// variants will be used. One might use one of the four slim 128-bit vector
/// variants, or one of the four 256-bit vector variants or even one of the
/// four fat 256-bit vector variants.
///
/// Since this choice is generally made when the Teddy searcher is constructed
/// and this choice is based on the patterns given and what the current CPU
/// supports, it follows that there must be some kind of indirection at search
/// time that "selects" the variant chosen at build time.
///
/// There are a few different ways to go about this. One approach is to use an
/// enum. It works fine, but in my experiments, this generally results in worse
/// codegen. Another approach, which is what we use here, is dynamic dispatch
/// via a trait object. We basically implement this trait for each possible
/// variant, select the variant we want at build time and convert it to a
/// trait object for use at search time.
///
/// Another approach is to use function pointers and stick each of the possible
/// variants into a union. This is essentially isomorphic to the dynamic
/// dispatch approach, but doesn't require any allocations. Since this crate
/// requires `alloc`, there's no real reason (AFAIK) to go down this path. (The
/// `memchr` crate does this.)
trait SearcherT:
Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static
{
/// Execute a search on the given haystack (identified by `start` and `end`
/// raw pointers).
///
/// # Safety
///
/// Essentially, the `start` and `end` pointers must be valid and point
/// to a haystack one can read. As long as you derive them from, for
/// example, a `&[u8]`, they should automatically satisfy all of the safety
/// obligations:
///
/// * Both `start` and `end` must be valid for reads.
/// * Both `start` and `end` must point to an initialized value.
/// * Both `start` and `end` must point to the same allocated object and
/// must either be in bounds or at most one byte past the end of the
/// allocated object.
/// * Both `start` and `end` must be _derived from_ a pointer to the same
/// object.
/// * The distance between `start` and `end` must not overflow `isize`.
/// * The distance being in bounds must not rely on "wrapping around" the
/// address space.
/// * It must be the case that `start <= end`.
    /// * `end - start` must be greater than or equal to the minimum length
    ///   for this searcher.
///
/// Also, it is expected that implementations of this trait will tag this
/// method with a `target_feature` attribute. Callers must ensure that
/// they are executing this method in an environment where that attribute
/// is valid.
unsafe fn find(&self, start: *const u8, end: *const u8) -> Option<Match>;
}
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
mod x86_64 {
use core::arch::x86_64::{__m128i, __m256i};
use alloc::sync::Arc;
use crate::packed::{
ext::Pointer,
pattern::Patterns,
teddy::generic::{self, Match},
};
use super::{Searcher, SearcherT};
#[derive(Clone, Debug)]
pub(super) struct SlimSSSE3<const BYTES: usize> {
slim128: generic::Slim<__m128i, BYTES>,
}
// Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes.
macro_rules! slim_ssse3 {
($len:expr) => {
impl SlimSSSE3<$len> {
/// Creates a new searcher using "slim" Teddy with 128-bit
/// vectors. If SSSE3 is not available in the current
/// environment, then this returns `None`.
pub(super) fn new(
patterns: &Arc<Patterns>,
) -> Option<Searcher> {
if !is_available_ssse3() {
return None;
}
Some(unsafe { SlimSSSE3::<$len>::new_unchecked(patterns) })
}
/// Creates a new searcher using "slim" Teddy with 256-bit
/// vectors without checking whether SSSE3 is available or not.
///
/// # Safety
///
/// Callers must ensure that SSSE3 is available in the current
/// environment.
#[target_feature(enable = "ssse3")]
unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
let slim128 = generic::Slim::<__m128i, $len>::new(
Arc::clone(patterns),
);
let memory_usage = slim128.memory_usage();
let minimum_len = slim128.minimum_len();
let imp = Arc::new(SlimSSSE3 { slim128 });
Searcher { imp, memory_usage, minimum_len }
}
}
impl SearcherT for SlimSSSE3<$len> {
#[target_feature(enable = "ssse3")]
#[inline]
unsafe fn find(
&self,
start: *const u8,
end: *const u8,
) -> Option<Match> {
// SAFETY: All obligations except for `target_feature` are
// passed to the caller. Our use of `target_feature` is
// safe because construction of this type requires that the
// requisite target features are available.
self.slim128.find(start, end)
}
}
};
}
slim_ssse3!(1);
slim_ssse3!(2);
slim_ssse3!(3);
slim_ssse3!(4);
#[derive(Clone, Debug)]
pub(super) struct SlimAVX2<const BYTES: usize> {
slim128: generic::Slim<__m128i, BYTES>,
slim256: generic::Slim<__m256i, BYTES>,
}
// Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes.
macro_rules! slim_avx2 {
($len:expr) => {
impl SlimAVX2<$len> {
/// Creates a new searcher using "slim" Teddy with 256-bit
/// vectors. If AVX2 is not available in the current
/// environment, then this returns `None`.
pub(super) fn new(
patterns: &Arc<Patterns>,
) -> Option<Searcher> {
if !is_available_avx2() {
return None;
}
Some(unsafe { SlimAVX2::<$len>::new_unchecked(patterns) })
}
/// Creates a new searcher using "slim" Teddy with 256-bit
/// vectors without checking whether AVX2 is available or not.
///
/// # Safety
///
/// Callers must ensure that AVX2 is available in the current
/// environment.
#[target_feature(enable = "avx2")]
unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
let slim128 = generic::Slim::<__m128i, $len>::new(
Arc::clone(&patterns),
);
let slim256 = generic::Slim::<__m256i, $len>::new(
Arc::clone(&patterns),
);
let memory_usage =
slim128.memory_usage() + slim256.memory_usage();
let minimum_len = slim128.minimum_len();
let imp = Arc::new(SlimAVX2 { slim128, slim256 });
Searcher { imp, memory_usage, minimum_len }
}
}
impl SearcherT for SlimAVX2<$len> {
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn find(
&self,
start: *const u8,
end: *const u8,
) -> Option<Match> {
// SAFETY: All obligations except for `target_feature` are
// passed to the caller. Our use of `target_feature` is
// safe because construction of this type requires that the
// requisite target features are available.
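                    // Added note: haystacks shorter than the 256-bit
                    // variant's minimum length are routed to the 128-bit
                    // variant, which tolerates shorter inputs.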
let len = end.distance(start);
if len < self.slim256.minimum_len() {
self.slim128.find(start, end)
} else {
self.slim256.find(start, end)
}
}
}
};
}
slim_avx2!(1);
slim_avx2!(2);
slim_avx2!(3);
slim_avx2!(4);
#[derive(Clone, Debug)]
pub(super) struct FatAVX2<const BYTES: usize> {
fat256: generic::Fat<__m256i, BYTES>,
}
    // Defines FatAVX2 wrapper functions for 1, 2, 3 and 4 bytes.
macro_rules! fat_avx2 {
($len:expr) => {
impl FatAVX2<$len> {
/// Creates a new searcher using "slim" Teddy with 256-bit
/// vectors. If AVX2 is not available in the current
/// environment, then this returns `None`.
pub(super) fn new(
patterns: &Arc<Patterns>,
) -> Option<Searcher> {
if !is_available_avx2() {
return None;
}
Some(unsafe { FatAVX2::<$len>::new_unchecked(patterns) })
}
/// Creates a new searcher using "slim" Teddy with 256-bit
/// vectors without checking whether AVX2 is available or not.
///
/// # Safety
///
/// Callers must ensure that AVX2 is available in the current
/// environment.
#[target_feature(enable = "avx2")]
unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
let fat256 = generic::Fat::<__m256i, $len>::new(
Arc::clone(&patterns),
);
let memory_usage = fat256.memory_usage();
let minimum_len = fat256.minimum_len();
let imp = Arc::new(FatAVX2 { fat256 });
Searcher { imp, memory_usage, minimum_len }
}
}
impl SearcherT for FatAVX2<$len> {
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn find(
&self,
start: *const u8,
end: *const u8,
) -> Option<Match> {
// SAFETY: All obligations except for `target_feature` are
// passed to the caller. Our use of `target_feature` is
// safe because construction of this type requires that the
// requisite target features are available.
self.fat256.find(start, end)
}
}
};
}
fat_avx2!(1);
fat_avx2!(2);
fat_avx2!(3);
fat_avx2!(4);
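    // Added note: the detection helpers below use a three-tier scheme: a
    // compile-time `target_feature` makes the check free, otherwise runtime
    // detection is used when `std` is available, and without `std` the
    // feature is conservatively reported as unavailable.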
#[inline]
pub(super) fn is_available_ssse3() -> bool {
#[cfg(not(target_feature = "sse2"))]
{
false
}
#[cfg(target_feature = "sse2")]
{
#[cfg(target_feature = "ssse3")]
{
true
}
#[cfg(not(target_feature = "ssse3"))]
{
#[cfg(feature = "std")]
{
std::is_x86_feature_detected!("ssse3")
}
#[cfg(not(feature = "std"))]
{
false
}
}
}
}
#[inline]
pub(super) fn is_available_avx2() -> bool {
#[cfg(not(target_feature = "sse2"))]
{
false
}
#[cfg(target_feature = "sse2")]
{
#[cfg(target_feature = "avx2")]
{
true
}
#[cfg(not(target_feature = "avx2"))]
{
#[cfg(feature = "std")]
{
std::is_x86_feature_detected!("avx2")
}
#[cfg(not(feature = "std"))]
{
false
}
}
}
}
}
#[cfg(all(
target_arch = "aarch64",
target_feature = "neon",
target_endian = "little"
))]
mod aarch64 {
use core::arch::aarch64::uint8x16_t;
use alloc::sync::Arc;
use crate::packed::{
pattern::Patterns,
teddy::generic::{self, Match},
};
use super::{Searcher, SearcherT};
#[derive(Clone, Debug)]
pub(super) struct SlimNeon<const BYTES: usize> {
slim128: generic::Slim<uint8x16_t, BYTES>,
}
    // Defines SlimNeon wrapper functions for 1, 2, 3 and 4 bytes.
macro_rules! slim_neon {
($len:expr) => {
impl SlimNeon<$len> {
/// Creates a new searcher using "slim" Teddy with 128-bit
            /// vectors. If NEON is not available in the current
/// environment, then this returns `None`.
pub(super) fn new(
patterns: &Arc<Patterns>,
) -> Option<Searcher> {
Some(unsafe { SlimNeon::<$len>::new_unchecked(patterns) })
}
/// Creates a new searcher using "slim" Teddy with 256-bit
/// vectors without checking whether SSSE3 is available or not.
///
/// # Safety
///
/// Callers must ensure that SSSE3 is available in the current
/// environment.
#[target_feature(enable = "neon")]
unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
let slim128 = generic::Slim::<uint8x16_t, $len>::new(
Arc::clone(patterns),
);
let memory_usage = slim128.memory_usage();
let minimum_len = slim128.minimum_len();
let imp = Arc::new(SlimNeon { slim128 });
Searcher { imp, memory_usage, minimum_len }
}
}
impl SearcherT for SlimNeon<$len> {
#[target_feature(enable = "neon")]
#[inline]
unsafe fn find(
&self,
start: *const u8,
end: *const u8,
) -> Option<Match> {
// SAFETY: All obligations except for `target_feature` are
// passed to the caller. Our use of `target_feature` is
// safe because construction of this type requires that the
// requisite target features are available.
self.slim128.find(start, end)
}
}
};
}
slim_neon!(1);
slim_neon!(2);
slim_neon!(3);
slim_neon!(4);
}

File diff suppressed because it is too large Load Diff

9
vendor/aho-corasick/src/packed/teddy/mod.rs vendored Normal file
View File

@@ -0,0 +1,9 @@
// Regrettable, but Teddy stuff just isn't used on all targets. And for some
// targets, like aarch64, only "slim" Teddy is used and so "fat" Teddy gets a
// bunch of dead-code warnings. Just not worth trying to squash them. Blech.
#![allow(dead_code)]
pub(crate) use self::builder::{Builder, Searcher};
mod builder;
mod generic;

583
vendor/aho-corasick/src/packed/tests.rs vendored Normal file
View File

@@ -0,0 +1,583 @@
use std::collections::HashMap;
use alloc::{
format,
string::{String, ToString},
vec,
vec::Vec,
};
use crate::{
packed::{Config, MatchKind},
util::search::Match,
};
/// A description of a single test against a multi-pattern searcher.
///
/// A single test may not necessarily pass on every configuration of a
/// searcher. The tests are categorized and grouped appropriately below.
#[derive(Clone, Debug, Eq, PartialEq)]
struct SearchTest {
/// The name of this test, for debugging.
name: &'static str,
/// The patterns to search for.
patterns: &'static [&'static str],
/// The text to search.
haystack: &'static str,
/// Each match is a triple of (pattern_index, start, end), where
/// pattern_index is an index into `patterns` and `start`/`end` are indices
/// into `haystack`.
matches: &'static [(usize, usize, usize)],
}
struct SearchTestOwned {
offset: usize,
name: String,
patterns: Vec<String>,
haystack: String,
matches: Vec<(usize, usize, usize)>,
}
impl SearchTest {
fn variations(&self) -> Vec<SearchTestOwned> {
let count = if cfg!(miri) { 1 } else { 261 };
let mut tests = vec![];
for i in 0..count {
tests.push(self.offset_prefix(i));
tests.push(self.offset_suffix(i));
tests.push(self.offset_both(i));
}
tests
}
fn offset_both(&self, off: usize) -> SearchTestOwned {
SearchTestOwned {
offset: off,
name: self.name.to_string(),
patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
haystack: format!(
"{}{}{}",
"Z".repeat(off),
self.haystack,
"Z".repeat(off)
),
matches: self
.matches
.iter()
.map(|&(id, s, e)| (id, s + off, e + off))
.collect(),
}
}
fn offset_prefix(&self, off: usize) -> SearchTestOwned {
SearchTestOwned {
offset: off,
name: self.name.to_string(),
patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
haystack: format!("{}{}", "Z".repeat(off), self.haystack),
matches: self
.matches
.iter()
.map(|&(id, s, e)| (id, s + off, e + off))
.collect(),
}
}
fn offset_suffix(&self, off: usize) -> SearchTestOwned {
SearchTestOwned {
offset: off,
name: self.name.to_string(),
patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
haystack: format!("{}{}", self.haystack, "Z".repeat(off)),
matches: self.matches.to_vec(),
}
}
}
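// An illustrative worked example (not part of the vendored source): for
// `offset_both(2)`, a test with haystack "abc" and match (0, 0, 3) becomes
// haystack "ZZabcZZ" with match (0, 2, 5). Both ends are padded with 'Z' and
// the match offsets shift by the length of the prefix padding.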
/// Short-hand constructor for SearchTest. We use it a lot below.
macro_rules! t {
($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => {
SearchTest {
name: stringify!($name),
patterns: $patterns,
haystack: $haystack,
matches: $matches,
}
};
}
/// A collection of test groups.
type TestCollection = &'static [&'static [SearchTest]];
// Define several collections corresponding to the different type of match
// semantics supported. These collections have some overlap, but each
// collection should have some tests that no other collection has.
/// Tests for leftmost-first match semantics.
const PACKED_LEFTMOST_FIRST: TestCollection =
&[BASICS, LEFTMOST, LEFTMOST_FIRST, REGRESSION, TEDDY];
/// Tests for leftmost-longest match semantics.
const PACKED_LEFTMOST_LONGEST: TestCollection =
&[BASICS, LEFTMOST, LEFTMOST_LONGEST, REGRESSION, TEDDY];
// Now define the individual tests that make up the collections above.
/// A collection of tests that should always be true regardless of
/// match semantics. That is, all combinations of leftmost-{first, longest}
/// should produce the same answer.
const BASICS: &'static [SearchTest] = &[
t!(basic001, &["a"], "", &[]),
t!(basic010, &["a"], "a", &[(0, 0, 1)]),
t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]),
t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]),
t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]),
t!(basic050, &["a"], "bba", &[(0, 2, 3)]),
t!(basic060, &["a"], "bbb", &[]),
t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]),
t!(basic100, &["aa"], "", &[]),
t!(basic110, &["aa"], "aa", &[(0, 0, 2)]),
t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]),
t!(basic130, &["aa"], "abbab", &[]),
t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]),
t!(basic150, &["aaa"], "aaa", &[(0, 0, 3)]),
t!(basic200, &["abc"], "abc", &[(0, 0, 3)]),
t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]),
t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]),
t!(basic230, &["abcd"], "abcd", &[(0, 0, 4)]),
t!(basic240, &["abcd"], "zazabzabcdz", &[(0, 6, 10)]),
t!(basic250, &["abcd"], "zazabcdzabcdz", &[(0, 3, 7), (0, 8, 12)]),
t!(basic300, &["a", "b"], "", &[]),
t!(basic310, &["a", "b"], "z", &[]),
t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]),
t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]),
t!(
basic340,
&["a", "b"],
"abba",
&[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),]
),
t!(
basic350,
&["b", "a"],
"abba",
&[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),]
),
t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]),
t!(basic400, &["foo", "bar"], "", &[]),
t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]),
t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]),
t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]),
t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]),
t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]),
t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]),
t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]),
t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]),
t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]),
t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]),
t!(
basic720,
&["yabcdef", "bcdeyabc", "abcdezghi"],
"yabcdezghi",
&[(2, 1, 10),]
),
t!(basic810, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
t!(basic820, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
t!(basic830, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]),
t!(
basic840,
&["ab", "ba"],
"abababa",
&[(0, 0, 2), (0, 2, 4), (0, 4, 6),]
),
t!(basic850, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]),
];
/// Tests for leftmost match semantics. These should pass for both
/// leftmost-first and leftmost-longest match kinds. Stated differently, among
/// ambiguous matches, the longest match and the match that appeared first when
/// constructing the automaton should always be the same.
const LEFTMOST: &'static [SearchTest] = &[
t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]),
t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]),
t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]),
t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]),
t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]),
t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]),
t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]),
t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]),
t!(
leftmost360,
&["abcdefghi", "hz", "abcdefgh"],
"abcdefghz",
&[(2, 0, 8),]
),
t!(
leftmost370,
&["abcdefghi", "cde", "hz", "abcdefgh"],
"abcdefghz",
&[(3, 0, 8),]
),
t!(
leftmost380,
&["abcdefghi", "hz", "abcdefgh", "a"],
"abcdefghz",
&[(2, 0, 8),]
),
t!(
leftmost390,
&["b", "abcdefghi", "hz", "abcdefgh"],
"abcdefghz",
&[(3, 0, 8),]
),
t!(
leftmost400,
&["h", "abcdefghi", "hz", "abcdefgh"],
"abcdefghz",
&[(3, 0, 8),]
),
t!(
leftmost410,
&["z", "abcdefghi", "hz", "abcdefgh"],
"abcdefghz",
&[(3, 0, 8), (0, 8, 9),]
),
];
/// Tests for non-overlapping leftmost-first match semantics. These tests
/// should generally be specific to leftmost-first, which means they should
/// generally fail under leftmost-longest semantics.
const LEFTMOST_FIRST: &'static [SearchTest] = &[
t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]),
t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]),
t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]),
t!(
leftfirst310,
&["abcd", "b", "bce", "ce"],
"abce",
&[(1, 1, 2), (3, 2, 4),]
),
t!(
leftfirst320,
&["a", "abcdefghi", "hz", "abcdefgh"],
"abcdefghz",
&[(0, 0, 1), (2, 7, 9),]
),
t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]),
t!(
leftfirst340,
&["abcdef", "x", "x", "x", "x", "x", "x", "abcde"],
"abcdef",
&[(0, 0, 6)]
),
];
/// Tests for non-overlapping leftmost-longest match semantics. These tests
/// should generally be specific to leftmost-longest, which means they should
/// generally fail under leftmost-first semantics.
const LEFTMOST_LONGEST: &'static [SearchTest] = &[
t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]),
t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]),
t!(
leftlong310,
&["a", "abcdefghi", "hz", "abcdefgh"],
"abcdefghz",
&[(3, 0, 8),]
),
t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]),
t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]),
];
/// Regression tests that are applied to all combinations.
///
/// If regression tests are needed for specific match semantics, then add them
/// to the appropriate group above.
const REGRESSION: &'static [SearchTest] = &[
t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]),
t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]),
t!(
regression030,
&["libcore/", "libstd/"],
"libcore/char/methods.rs",
&[(0, 0, 8),]
),
t!(
regression040,
&["libstd/", "libcore/"],
"libcore/char/methods.rs",
&[(1, 0, 8),]
),
t!(
regression050,
&["\x00\x00\x01", "\x00\x00\x00"],
"\x00\x00\x00",
&[(1, 0, 3),]
),
t!(
regression060,
&["\x00\x00\x00", "\x00\x00\x01"],
"\x00\x00\x00",
&[(0, 0, 3),]
),
];
const TEDDY: &'static [SearchTest] = &[
t!(
teddy010,
&["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
"abcdefghijk",
&[
(0, 0, 1),
(1, 1, 2),
(2, 2, 3),
(3, 3, 4),
(4, 4, 5),
(5, 5, 6),
(6, 6, 7),
(7, 7, 8),
(8, 8, 9),
(9, 9, 10),
(10, 10, 11)
]
),
t!(
teddy020,
&["ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl"],
"abcdefghijk",
&[(0, 0, 2), (2, 2, 4), (4, 4, 6), (6, 6, 8), (8, 8, 10),]
),
t!(
teddy030,
&["abc"],
"abcdefghijklmnopqrstuvwxyzabcdefghijk",
&[(0, 0, 3), (0, 26, 29)]
),
];
// Now define a test for each combination of things above that we want to run.
// Since there are a few different combinations for each collection of tests,
// we define a macro to avoid repetition drudgery. The testconfig macro
// constructs the automaton from a given match kind, and runs the search
// tests one-by-one over the given collection. The `with` parameter allows one
// to configure the config with additional parameters. The invocations below
// use testconfig in precisely this way: each one sets up a test that turns a
// different knob on Config.
macro_rules! testconfig {
($name:ident, $collection:expr, $with:expr) => {
#[test]
fn $name() {
run_search_tests($collection, |test| {
let mut config = Config::new();
$with(&mut config);
let mut builder = config.builder();
builder.extend(test.patterns.iter().map(|p| p.as_bytes()));
let searcher = match builder.build() {
Some(searcher) => searcher,
None => {
// For x86-64 and aarch64, not building a searcher is
// probably a bug, so be loud.
if cfg!(any(
target_arch = "x86_64",
target_arch = "aarch64"
)) {
panic!("failed to build packed searcher")
}
return None;
}
};
Some(searcher.find_iter(&test.haystack).collect())
});
}
};
}
testconfig!(
search_default_leftmost_first,
PACKED_LEFTMOST_FIRST,
|_: &mut Config| {}
);
testconfig!(
search_default_leftmost_longest,
PACKED_LEFTMOST_LONGEST,
|c: &mut Config| {
c.match_kind(MatchKind::LeftmostLongest);
}
);
testconfig!(
search_teddy_leftmost_first,
PACKED_LEFTMOST_FIRST,
|c: &mut Config| {
c.only_teddy(true);
}
);
testconfig!(
search_teddy_leftmost_longest,
PACKED_LEFTMOST_LONGEST,
|c: &mut Config| {
c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
}
);
testconfig!(
search_teddy_ssse3_leftmost_first,
PACKED_LEFTMOST_FIRST,
|c: &mut Config| {
c.only_teddy(true);
#[cfg(target_arch = "x86_64")]
if std::is_x86_feature_detected!("ssse3") {
c.only_teddy_256bit(Some(false));
}
}
);
testconfig!(
search_teddy_ssse3_leftmost_longest,
PACKED_LEFTMOST_LONGEST,
|c: &mut Config| {
c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
#[cfg(target_arch = "x86_64")]
if std::is_x86_feature_detected!("ssse3") {
c.only_teddy_256bit(Some(false));
}
}
);
testconfig!(
search_teddy_avx2_leftmost_first,
PACKED_LEFTMOST_FIRST,
|c: &mut Config| {
c.only_teddy(true);
#[cfg(target_arch = "x86_64")]
if std::is_x86_feature_detected!("avx2") {
c.only_teddy_256bit(Some(true));
}
}
);
testconfig!(
search_teddy_avx2_leftmost_longest,
PACKED_LEFTMOST_LONGEST,
|c: &mut Config| {
c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
#[cfg(target_arch = "x86_64")]
if std::is_x86_feature_detected!("avx2") {
c.only_teddy_256bit(Some(true));
}
}
);
testconfig!(
search_teddy_fat_leftmost_first,
PACKED_LEFTMOST_FIRST,
|c: &mut Config| {
c.only_teddy(true);
#[cfg(target_arch = "x86_64")]
if std::is_x86_feature_detected!("avx2") {
c.only_teddy_fat(Some(true));
}
}
);
testconfig!(
search_teddy_fat_leftmost_longest,
PACKED_LEFTMOST_LONGEST,
|c: &mut Config| {
c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
#[cfg(target_arch = "x86_64")]
if std::is_x86_feature_detected!("avx2") {
c.only_teddy_fat(Some(true));
}
}
);
testconfig!(
search_rabinkarp_leftmost_first,
PACKED_LEFTMOST_FIRST,
|c: &mut Config| {
c.only_rabin_karp(true);
}
);
testconfig!(
search_rabinkarp_leftmost_longest,
PACKED_LEFTMOST_LONGEST,
|c: &mut Config| {
c.only_rabin_karp(true).match_kind(MatchKind::LeftmostLongest);
}
);
#[test]
fn search_tests_have_unique_names() {
let assert = |constname, tests: &[SearchTest]| {
let mut seen = HashMap::new(); // map from test name to position
for (i, test) in tests.iter().enumerate() {
if !seen.contains_key(test.name) {
seen.insert(test.name, i);
} else {
let last = seen[test.name];
panic!(
"{} tests have duplicate names at positions {} and {}",
constname, last, i
);
}
}
};
assert("BASICS", BASICS);
assert("LEFTMOST", LEFTMOST);
assert("LEFTMOST_FIRST", LEFTMOST_FIRST);
assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST);
assert("REGRESSION", REGRESSION);
assert("TEDDY", TEDDY);
}
fn run_search_tests<F: FnMut(&SearchTestOwned) -> Option<Vec<Match>>>(
which: TestCollection,
mut f: F,
) {
let get_match_triples =
|matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
matches
.into_iter()
.map(|m| (m.pattern().as_usize(), m.start(), m.end()))
.collect()
};
for &tests in which {
for spec in tests {
for test in spec.variations() {
let results = match f(&test) {
None => continue,
Some(results) => results,
};
assert_eq!(
test.matches,
get_match_triples(results).as_slice(),
"test: {}, patterns: {:?}, haystack(len={:?}): {:?}, \
offset: {:?}",
test.name,
test.patterns,
test.haystack.len(),
test.haystack,
test.offset,
);
}
}
}
}

1757
vendor/aho-corasick/src/packed/vector.rs vendored Normal file

File diff suppressed because it is too large Load Diff

1664
vendor/aho-corasick/src/tests.rs vendored Normal file

File diff suppressed because it is too large Load Diff

270
vendor/aho-corasick/src/transducer.rs vendored Normal file
View File

@@ -0,0 +1,270 @@
/*!
Provides implementations of `fst::Automaton` for Aho-Corasick automata.
This works by providing two wrapper types, [`Anchored`] and [`Unanchored`].
The former executes an anchored search on an FST while the latter executes
an unanchored search. Building these wrappers is fallible and will fail if
the underlying Aho-Corasick automaton does not support the type of search it
represents.
*/
use crate::{
automaton::{Automaton, StateID},
Anchored as AcAnchored, Input, MatchError,
};
/// Represents an unanchored Aho-Corasick search of a finite state transducer.
///
/// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the
/// underlying automaton does not support unanchored searches.
///
/// # Example
///
/// This shows how to build an FST of keys and then run an unanchored search on
/// those keys using an Aho-Corasick automaton.
///
/// ```
/// use aho_corasick::{nfa::contiguous::NFA, transducer::Unanchored};
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
///
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
/// let nfa = NFA::new(&["bcd", "x"]).unwrap();
/// // NFAs always support both unanchored and anchored searches.
/// let searcher = Unanchored::new(&nfa).unwrap();
///
/// let mut stream = set.search(searcher).into_stream();
/// let mut results = vec![];
/// while let Some(key) = stream.next() {
/// results.push(std::str::from_utf8(key).unwrap().to_string());
/// }
/// assert_eq!(vec!["abcd", "bcd", "xyz"], results);
/// ```
#[derive(Clone, Debug)]
pub struct Unanchored<A>(A);
impl<A: Automaton> Unanchored<A> {
/// Create a new `Unanchored` implementation of the `fst::Automaton` trait.
///
/// If the given Aho-Corasick automaton does not support unanchored
/// searches, then this returns an error.
pub fn new(aut: A) -> Result<Unanchored<A>, MatchError> {
let input = Input::new("").anchored(AcAnchored::No);
let _ = aut.start_state(&input)?;
Ok(Unanchored(aut))
}
/// Returns a borrow to the underlying automaton.
pub fn as_ref(&self) -> &A {
&self.0
}
/// Unwrap this value and return the inner automaton.
pub fn into_inner(self) -> A {
self.0
}
}
impl<A: Automaton> fst::Automaton for Unanchored<A> {
type State = StateID;
#[inline]
fn start(&self) -> StateID {
let input = Input::new("").anchored(AcAnchored::No);
self.0.start_state(&input).expect("support for unanchored searches")
}
#[inline]
fn is_match(&self, state: &StateID) -> bool {
self.0.is_match(*state)
}
#[inline]
fn accept(&self, state: &StateID, byte: u8) -> StateID {
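        // Added note: match states are treated as final. Returning the
        // current state on a match makes the automaton "stick" once any
        // pattern has been seen, so every key containing a match anywhere
        // is accepted.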
if fst::Automaton::is_match(self, state) {
return *state;
}
self.0.next_state(AcAnchored::No, *state, byte)
}
#[inline]
fn can_match(&self, state: &StateID) -> bool {
!self.0.is_dead(*state)
}
}
/// Represents an anchored Aho-Corasick search of a finite state transducer.
///
/// Wrapping an Aho-Corasick automaton in `Anchored` will fail if the
/// underlying automaton does not support anchored searches.
///
/// # Example
///
/// This shows how to build an FST of keys and then run an anchored search on
/// those keys using an Aho-Corasick automaton.
///
/// ```
/// use aho_corasick::{nfa::contiguous::NFA, transducer::Anchored};
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
///
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
/// let nfa = NFA::new(&["bcd", "x"]).unwrap();
/// // NFAs always support both unanchored and anchored searches.
/// let searcher = Anchored::new(&nfa).unwrap();
///
/// let mut stream = set.search(searcher).into_stream();
/// let mut results = vec![];
/// while let Some(key) = stream.next() {
/// results.push(std::str::from_utf8(key).unwrap().to_string());
/// }
/// assert_eq!(vec!["bcd", "xyz"], results);
/// ```
///
/// This is like the example above, except we use an Aho-Corasick DFA, which
/// requires explicitly configuring it to support anchored searches. (NFAs
/// unconditionally support both unanchored and anchored searches.)
///
/// ```
/// use aho_corasick::{dfa::DFA, transducer::Anchored, StartKind};
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
///
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
/// let dfa = DFA::builder()
/// .start_kind(StartKind::Anchored)
/// .build(&["bcd", "x"])
/// .unwrap();
/// // We've explicitly configured our DFA to support anchored searches.
/// let searcher = Anchored::new(&dfa).unwrap();
///
/// let mut stream = set.search(searcher).into_stream();
/// let mut results = vec![];
/// while let Some(key) = stream.next() {
/// results.push(std::str::from_utf8(key).unwrap().to_string());
/// }
/// assert_eq!(vec!["bcd", "xyz"], results);
/// ```
#[derive(Clone, Debug)]
pub struct Anchored<A>(A);
impl<A: Automaton> Anchored<A> {
/// Create a new `Anchored` implementation of the `fst::Automaton` trait.
///
/// If the given Aho-Corasick automaton does not support anchored searches,
/// then this returns an error.
pub fn new(aut: A) -> Result<Anchored<A>, MatchError> {
let input = Input::new("").anchored(AcAnchored::Yes);
let _ = aut.start_state(&input)?;
Ok(Anchored(aut))
}
/// Returns a borrow to the underlying automaton.
pub fn as_ref(&self) -> &A {
&self.0
}
/// Unwrap this value and return the inner automaton.
pub fn into_inner(self) -> A {
self.0
}
}
impl<A: Automaton> fst::Automaton for Anchored<A> {
type State = StateID;
#[inline]
fn start(&self) -> StateID {
let input = Input::new("").anchored(AcAnchored::Yes);
self.0.start_state(&input).expect("support for unanchored searches")
}
#[inline]
fn is_match(&self, state: &StateID) -> bool {
self.0.is_match(*state)
}
#[inline]
fn accept(&self, state: &StateID, byte: u8) -> StateID {
if fst::Automaton::is_match(self, state) {
return *state;
}
self.0.next_state(AcAnchored::Yes, *state, byte)
}
#[inline]
fn can_match(&self, state: &StateID) -> bool {
!self.0.is_dead(*state)
}
}
#[cfg(test)]
mod tests {
use alloc::{string::String, vec, vec::Vec};
use fst::{Automaton, IntoStreamer, Set, Streamer};
use crate::{
dfa::DFA,
nfa::{contiguous, noncontiguous},
StartKind,
};
use super::*;
fn search<A: Automaton, D: AsRef<[u8]>>(
set: &Set<D>,
aut: A,
) -> Vec<String> {
let mut stream = set.search(aut).into_stream();
let mut results = vec![];
while let Some(key) = stream.next() {
results.push(String::from(core::str::from_utf8(key).unwrap()));
}
results
}
#[test]
fn unanchored() {
let set =
Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
.unwrap();
let patterns = vec!["baz", "bax"];
let expected = vec!["baz", "xbax"];
let aut = Unanchored(noncontiguous::NFA::new(&patterns).unwrap());
let got = search(&set, &aut);
assert_eq!(got, expected);
let aut = Unanchored(contiguous::NFA::new(&patterns).unwrap());
let got = search(&set, &aut);
assert_eq!(got, expected);
let aut = Unanchored(DFA::new(&patterns).unwrap());
let got = search(&set, &aut);
assert_eq!(got, expected);
}
#[test]
fn anchored() {
let set =
Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
.unwrap();
let patterns = vec!["baz", "bax"];
let expected = vec!["baz"];
let aut = Anchored(noncontiguous::NFA::new(&patterns).unwrap());
let got = search(&set, &aut);
assert_eq!(got, expected);
let aut = Anchored(contiguous::NFA::new(&patterns).unwrap());
let got = search(&set, &aut);
assert_eq!(got, expected);
let aut = Anchored(
DFA::builder()
.start_kind(StartKind::Anchored)
.build(&patterns)
.unwrap(),
);
let got = search(&set, &aut);
assert_eq!(got, expected);
}
}

409
vendor/aho-corasick/src/util/alphabet.rs vendored Normal file
View File

@@ -0,0 +1,409 @@
use crate::util::int::Usize;
/// A representation of byte oriented equivalence classes.
///
/// This is used in finite state machines to reduce the size of the transition
/// table. This can have a particularly large impact not only on the total size
/// of an FSM, but also on FSM build times because it reduces the number of
/// transitions that need to be visited/set.
#[derive(Clone, Copy)]
pub(crate) struct ByteClasses([u8; 256]);
impl ByteClasses {
/// Creates a new set of equivalence classes where all bytes are mapped to
/// the same class.
pub(crate) fn empty() -> ByteClasses {
ByteClasses([0; 256])
}
/// Creates a new set of equivalence classes where each byte belongs to
/// its own equivalence class.
pub(crate) fn singletons() -> ByteClasses {
let mut classes = ByteClasses::empty();
for b in 0..=255 {
classes.set(b, b);
}
classes
}
/// Set the equivalence class for the given byte.
#[inline]
pub(crate) fn set(&mut self, byte: u8, class: u8) {
self.0[usize::from(byte)] = class;
}
/// Get the equivalence class for the given byte.
#[inline]
pub(crate) fn get(&self, byte: u8) -> u8 {
self.0[usize::from(byte)]
}
/// Return the total number of elements in the alphabet represented by
/// these equivalence classes. Equivalently, this returns the total number
/// of equivalence classes.
#[inline]
pub(crate) fn alphabet_len(&self) -> usize {
        // Add one since the number of equivalence classes is one more than
        // the largest class identifier, which is stored for byte 255.
usize::from(self.0[255]) + 1
}
/// Returns the stride, as a base-2 exponent, required for these
/// equivalence classes.
///
/// The stride is always the smallest power of 2 that is greater than or
/// equal to the alphabet length. This is done so that converting between
/// state IDs and indices can be done with shifts alone, which is much
/// faster than integer division. The "stride2" is the exponent. i.e.,
/// `2^stride2 = stride`.
pub(crate) fn stride2(&self) -> usize {
let zeros = self.alphabet_len().next_power_of_two().trailing_zeros();
usize::try_from(zeros).unwrap()
}
/// Returns the stride for these equivalence classes, which corresponds
/// to the smallest power of 2 greater than or equal to the number of
/// equivalence classes.
pub(crate) fn stride(&self) -> usize {
1 << self.stride2()
}
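    // Added worked example: with 7 equivalence classes, `alphabet_len()` is
    // 7, the next power of two is 8, so `stride2()` is 3 and `stride()` is
    // 8. A dense table can then find the transition row of state index `s`
    // with `s << 3` (a shift) instead of a multiply or divide by the exact
    // alphabet length.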
/// Returns true if and only if every byte in this class maps to its own
    /// equivalence class. Equivalently, there are 256 equivalence classes
    /// and each class contains exactly one byte.
#[inline]
pub(crate) fn is_singleton(&self) -> bool {
self.alphabet_len() == 256
}
/// Returns an iterator over all equivalence classes in this set.
pub(crate) fn iter(&self) -> ByteClassIter {
ByteClassIter { it: 0..self.alphabet_len() }
}
/// Returns an iterator of the bytes in the given equivalence class.
pub(crate) fn elements(&self, class: u8) -> ByteClassElements {
ByteClassElements { classes: self, class, bytes: 0..=255 }
}
/// Returns an iterator of byte ranges in the given equivalence class.
///
/// That is, a sequence of contiguous ranges are returned. Typically, every
/// class maps to a single contiguous range.
fn element_ranges(&self, class: u8) -> ByteClassElementRanges {
ByteClassElementRanges { elements: self.elements(class), range: None }
}
}
impl core::fmt::Debug for ByteClasses {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
if self.is_singleton() {
write!(f, "ByteClasses(<one-class-per-byte>)")
} else {
write!(f, "ByteClasses(")?;
for (i, class) in self.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{:?} => [", class)?;
for (start, end) in self.element_ranges(class) {
if start == end {
write!(f, "{:?}", start)?;
} else {
write!(f, "{:?}-{:?}", start, end)?;
}
}
write!(f, "]")?;
}
write!(f, ")")
}
}
}
/// An iterator over each equivalence class.
#[derive(Debug)]
pub(crate) struct ByteClassIter {
it: core::ops::Range<usize>,
}
impl Iterator for ByteClassIter {
type Item = u8;
fn next(&mut self) -> Option<u8> {
self.it.next().map(|class| class.as_u8())
}
}
/// An iterator over all elements in a specific equivalence class.
#[derive(Debug)]
pub(crate) struct ByteClassElements<'a> {
classes: &'a ByteClasses,
class: u8,
bytes: core::ops::RangeInclusive<u8>,
}
impl<'a> Iterator for ByteClassElements<'a> {
type Item = u8;
fn next(&mut self) -> Option<u8> {
while let Some(byte) = self.bytes.next() {
if self.class == self.classes.get(byte) {
return Some(byte);
}
}
None
}
}
/// An iterator over all elements in an equivalence class expressed as a
/// sequence of contiguous ranges.
#[derive(Debug)]
pub(crate) struct ByteClassElementRanges<'a> {
elements: ByteClassElements<'a>,
range: Option<(u8, u8)>,
}
impl<'a> Iterator for ByteClassElementRanges<'a> {
type Item = (u8, u8);
fn next(&mut self) -> Option<(u8, u8)> {
loop {
let element = match self.elements.next() {
None => return self.range.take(),
Some(element) => element,
};
match self.range.take() {
None => {
self.range = Some((element, element));
}
Some((start, end)) => {
if usize::from(end) + 1 != usize::from(element) {
self.range = Some((element, element));
return Some((start, end));
}
self.range = Some((start, element));
}
}
}
}
}
/// A partitioning of bytes into equivalence classes.
///
/// A byte class set keeps track of an *approximation* of equivalence classes
/// of bytes during NFA construction. That is, bytes within the same
/// equivalence class cannot discriminate between a match and a non-match.
///
/// Note that this may not compute the minimal set of equivalence classes.
/// Basically, any byte in a pattern given to the noncontiguous NFA builder
/// will automatically be treated as its own equivalence class. All other
/// bytes---any byte not in any pattern---will be treated as their own
/// equivalence classes. In theory, all bytes not in any pattern should
/// be part of a single equivalence class, but in practice, we only treat
/// contiguous ranges of bytes as an equivalence class. So the number of
/// classes computed may be bigger than necessary. This usually doesn't make
/// much of a difference, and keeps the implementation simple.
#[derive(Clone, Debug)]
pub(crate) struct ByteClassSet(ByteSet);
impl Default for ByteClassSet {
fn default() -> ByteClassSet {
ByteClassSet::empty()
}
}
impl ByteClassSet {
/// Create a new set of byte classes where all bytes are part of the same
/// equivalence class.
pub(crate) fn empty() -> Self {
ByteClassSet(ByteSet::empty())
}
    /// Indicate that the given range of bytes (inclusive) can discriminate a
/// match between it and all other bytes outside of the range.
pub(crate) fn set_range(&mut self, start: u8, end: u8) {
debug_assert!(start <= end);
if start > 0 {
self.0.add(start - 1);
}
self.0.add(end);
}
/// Convert this boolean set to a map that maps all byte values to their
/// corresponding equivalence class. The last mapping indicates the largest
/// equivalence class identifier (which is never bigger than 255).
pub(crate) fn byte_classes(&self) -> ByteClasses {
let mut classes = ByteClasses::empty();
let mut class = 0u8;
let mut b = 0u8;
loop {
classes.set(b, class);
if b == 255 {
break;
}
if self.0.contains(b) {
class = class.checked_add(1).unwrap();
}
b = b.checked_add(1).unwrap();
}
classes
}
}
/// A simple set of bytes that is reasonably cheap to copy and allocation free.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub(crate) struct ByteSet {
bits: BitSet,
}
/// The representation of a byte set. Split out so that we can define a
/// convenient Debug impl for it while keeping "ByteSet" in the output.
#[derive(Clone, Copy, Default, Eq, PartialEq)]
struct BitSet([u128; 2]);
impl ByteSet {
/// Create an empty set of bytes.
pub(crate) fn empty() -> ByteSet {
ByteSet { bits: BitSet([0; 2]) }
}
/// Add a byte to this set.
///
/// If the given byte already belongs to this set, then this is a no-op.
pub(crate) fn add(&mut self, byte: u8) {
let bucket = byte / 128;
let bit = byte % 128;
self.bits.0[usize::from(bucket)] |= 1 << bit;
}
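    // Added worked example: byte 200 maps to bucket 200 / 128 == 1 and bit
    // 200 % 128 == 72, i.e. bit 72 of `bits.0[1]`.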
/// Return true if and only if the given byte is in this set.
pub(crate) fn contains(&self, byte: u8) -> bool {
let bucket = byte / 128;
let bit = byte % 128;
self.bits.0[usize::from(bucket)] & (1 << bit) > 0
}
}
impl core::fmt::Debug for BitSet {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let mut fmtd = f.debug_set();
for b in 0u8..=255 {
if (ByteSet { bits: *self }).contains(b) {
fmtd.entry(&b);
}
}
fmtd.finish()
}
}
#[cfg(test)]
mod tests {
use alloc::{vec, vec::Vec};
use super::*;
#[test]
fn byte_classes() {
let mut set = ByteClassSet::empty();
set.set_range(b'a', b'z');
let classes = set.byte_classes();
assert_eq!(classes.get(0), 0);
assert_eq!(classes.get(1), 0);
assert_eq!(classes.get(2), 0);
assert_eq!(classes.get(b'a' - 1), 0);
assert_eq!(classes.get(b'a'), 1);
assert_eq!(classes.get(b'm'), 1);
assert_eq!(classes.get(b'z'), 1);
assert_eq!(classes.get(b'z' + 1), 2);
assert_eq!(classes.get(254), 2);
assert_eq!(classes.get(255), 2);
let mut set = ByteClassSet::empty();
set.set_range(0, 2);
set.set_range(4, 6);
let classes = set.byte_classes();
assert_eq!(classes.get(0), 0);
assert_eq!(classes.get(1), 0);
assert_eq!(classes.get(2), 0);
assert_eq!(classes.get(3), 1);
assert_eq!(classes.get(4), 2);
assert_eq!(classes.get(5), 2);
assert_eq!(classes.get(6), 2);
assert_eq!(classes.get(7), 3);
assert_eq!(classes.get(255), 3);
}
#[test]
fn full_byte_classes() {
let mut set = ByteClassSet::empty();
for b in 0u8..=255 {
set.set_range(b, b);
}
assert_eq!(set.byte_classes().alphabet_len(), 256);
}
#[test]
fn elements_typical() {
let mut set = ByteClassSet::empty();
set.set_range(b'b', b'd');
set.set_range(b'g', b'm');
set.set_range(b'z', b'z');
let classes = set.byte_classes();
// class 0: \x00-a
// class 1: b-d
// class 2: e-f
// class 3: g-m
// class 4: n-y
// class 5: z-z
// class 6: \x7B-\xFF
assert_eq!(classes.alphabet_len(), 7);
let elements = classes.elements(0).collect::<Vec<_>>();
assert_eq!(elements.len(), 98);
assert_eq!(elements[0], b'\x00');
assert_eq!(elements[97], b'a');
let elements = classes.elements(1).collect::<Vec<_>>();
assert_eq!(elements, vec![b'b', b'c', b'd'],);
let elements = classes.elements(2).collect::<Vec<_>>();
assert_eq!(elements, vec![b'e', b'f'],);
let elements = classes.elements(3).collect::<Vec<_>>();
assert_eq!(elements, vec![b'g', b'h', b'i', b'j', b'k', b'l', b'm',],);
let elements = classes.elements(4).collect::<Vec<_>>();
assert_eq!(elements.len(), 12);
assert_eq!(elements[0], b'n');
assert_eq!(elements[11], b'y');
let elements = classes.elements(5).collect::<Vec<_>>();
assert_eq!(elements, vec![b'z']);
let elements = classes.elements(6).collect::<Vec<_>>();
assert_eq!(elements.len(), 133);
assert_eq!(elements[0], b'\x7B');
assert_eq!(elements[132], b'\xFF');
}
#[test]
fn elements_singletons() {
let classes = ByteClasses::singletons();
assert_eq!(classes.alphabet_len(), 256);
let elements = classes.elements(b'a').collect::<Vec<_>>();
assert_eq!(elements, vec![b'a']);
}
#[test]
fn elements_empty() {
let classes = ByteClasses::empty();
assert_eq!(classes.alphabet_len(), 1);
let elements = classes.elements(0).collect::<Vec<_>>();
assert_eq!(elements.len(), 256);
assert_eq!(elements[0], b'\x00');
assert_eq!(elements[255], b'\xFF');
}
}

124
vendor/aho-corasick/src/util/buffer.rs vendored Normal file
View File

@@ -0,0 +1,124 @@
use alloc::{vec, vec::Vec};
/// The default buffer capacity that we use for the stream buffer.
const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB
/// A fairly simple roll buffer for supporting stream searches.
///
/// This buffer acts as a temporary place to store a fixed amount of data when
/// reading from a stream. Its central purpose is to allow "rolling" some
/// suffix of the data to the beginning of the buffer before refilling it with
/// more data from the stream. For example, let's say we are trying to match
/// "foobar" on a stream. When we report the match, we'd like to not only
/// report the correct offsets at which the match occurs, but also the matching
/// bytes themselves. So let's say our stream is a file with the following
/// contents: `test test foobar test test`. Now assume that we happen to read
/// the aforementioned file in two chunks: `test test foo` and `bar test test`.
/// Naively, it would not be possible to report a single contiguous `foobar`
/// match, but this roll buffer allows us to do that. Namely, after the second
/// read, the contents of the buffer should be `st foobar test test`, where the
/// search should ultimately resume immediately after `foo`. (The prefix `st `
/// is included because the roll buffer saves N bytes at the end of the buffer,
/// where N is the maximum possible length of a match.)
///
/// A lot of the logic for dealing with this is unfortunately split out between
/// this roll buffer and the `StreamChunkIter`.
///
/// Note also that this buffer is not actually required to just report
/// matches, because a `Match` is just some offsets. But it *is* required for
/// supporting things like `try_stream_replace_all` because that needs some
/// mechanism for
/// knowing which bytes in the stream correspond to a match and which don't. So
/// when a match occurs across two `read` calls, *something* needs to retain
/// the bytes from the previous `read` call because you don't know before the
/// second read call whether a match exists or not.
#[derive(Debug)]
pub(crate) struct Buffer {
/// The raw buffer contents. This has a fixed size and never increases.
buf: Vec<u8>,
/// The minimum size of the buffer, which is equivalent to the maximum
/// possible length of a match. This corresponds to the amount that we
    /// roll the buffer by.
min: usize,
/// The end of the contents of this buffer.
end: usize,
}
impl Buffer {
/// Create a new buffer for stream searching. The minimum buffer length
/// given should be the size of the maximum possible match length.
pub(crate) fn new(min_buffer_len: usize) -> Buffer {
let min = core::cmp::max(1, min_buffer_len);
// The minimum buffer amount is also the amount that we roll our
// buffer in order to support incremental searching. To this end,
// our actual capacity needs to be at least 1 byte bigger than our
// minimum amount, otherwise we won't have any overlap. In actuality,
// we want our buffer to be a bit bigger than that for performance
// reasons, so we set a lower bound of `8 * min`.
//
// TODO: It would be good to find a way to test the streaming
// implementation with the minimal buffer size. For now, we just
        // uncomment the next line and comment out the subsequent line.
// let capacity = 1 + min;
let capacity = core::cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
Buffer { buf: vec![0; capacity], min, end: 0 }
}
/// Return the contents of this buffer.
#[inline]
pub(crate) fn buffer(&self) -> &[u8] {
&self.buf[..self.end]
}
/// Return the minimum size of the buffer. The only way a buffer may be
/// smaller than this is if the stream itself contains less than the
/// minimum buffer amount.
#[inline]
pub(crate) fn min_buffer_len(&self) -> usize {
self.min
}
/// Return all free capacity in this buffer.
fn free_buffer(&mut self) -> &mut [u8] {
&mut self.buf[self.end..]
}
/// Refill the contents of this buffer by reading as much as possible into
/// this buffer's free capacity. If no more bytes could be read, then this
/// returns false. Otherwise, this reads until it has filled the buffer
/// past the minimum amount.
pub(crate) fn fill<R: std::io::Read>(
&mut self,
mut rdr: R,
) -> std::io::Result<bool> {
let mut readany = false;
loop {
let readlen = rdr.read(self.free_buffer())?;
if readlen == 0 {
return Ok(readany);
}
readany = true;
self.end += readlen;
if self.buffer().len() >= self.min {
return Ok(true);
}
}
}
/// Roll the contents of the buffer so that the suffix of this buffer is
/// moved to the front and all other contents are dropped. The size of the
/// suffix corresponds precisely to the minimum buffer length.
///
/// This should only be called when the entire contents of this buffer have
/// been searched.
pub(crate) fn roll(&mut self) {
let roll_start = self
.end
.checked_sub(self.min)
.expect("buffer capacity should be bigger than minimum amount");
let roll_end = roll_start + self.min;
assert!(roll_end <= self.end);
self.buf.copy_within(roll_start..roll_end, 0);
self.end = self.min;
}
}
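// An illustrative stream-search loop (not part of the vendored source;
// `search` is a hypothetical routine, and real callers such as the stream
// iterator in this crate handle short reads and the `min` bytes of overlap
// between chunks more carefully):
//
//     let mut buf = Buffer::new(max_pattern_len);
//     while buf.fill(&mut rdr)? {
//         search(buf.buffer());
//         buf.roll();
//     }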

258
vendor/aho-corasick/src/util/byte_frequencies.rs vendored Normal file
View File

@@ -0,0 +1,258 @@
pub const BYTE_FREQUENCIES: [u8; 256] = [
55, // '\x00'
52, // '\x01'
51, // '\x02'
50, // '\x03'
49, // '\x04'
48, // '\x05'
47, // '\x06'
46, // '\x07'
45, // '\x08'
103, // '\t'
242, // '\n'
66, // '\x0b'
67, // '\x0c'
229, // '\r'
44, // '\x0e'
43, // '\x0f'
42, // '\x10'
41, // '\x11'
40, // '\x12'
39, // '\x13'
38, // '\x14'
37, // '\x15'
36, // '\x16'
35, // '\x17'
34, // '\x18'
33, // '\x19'
56, // '\x1a'
32, // '\x1b'
31, // '\x1c'
30, // '\x1d'
29, // '\x1e'
28, // '\x1f'
255, // ' '
148, // '!'
164, // '"'
149, // '#'
136, // '$'
160, // '%'
155, // '&'
173, // "'"
221, // '('
222, // ')'
134, // '*'
122, // '+'
232, // ','
202, // '-'
215, // '.'
224, // '/'
208, // '0'
220, // '1'
204, // '2'
187, // '3'
183, // '4'
179, // '5'
177, // '6'
168, // '7'
178, // '8'
200, // '9'
226, // ':'
195, // ';'
154, // '<'
184, // '='
174, // '>'
126, // '?'
120, // '@'
191, // 'A'
157, // 'B'
194, // 'C'
170, // 'D'
189, // 'E'
162, // 'F'
161, // 'G'
150, // 'H'
193, // 'I'
142, // 'J'
137, // 'K'
171, // 'L'
176, // 'M'
185, // 'N'
167, // 'O'
186, // 'P'
112, // 'Q'
175, // 'R'
192, // 'S'
188, // 'T'
156, // 'U'
140, // 'V'
143, // 'W'
123, // 'X'
133, // 'Y'
128, // 'Z'
147, // '['
138, // '\\'
146, // ']'
114, // '^'
223, // '_'
151, // '`'
249, // 'a'
216, // 'b'
238, // 'c'
236, // 'd'
253, // 'e'
227, // 'f'
218, // 'g'
230, // 'h'
247, // 'i'
135, // 'j'
180, // 'k'
241, // 'l'
233, // 'm'
246, // 'n'
244, // 'o'
231, // 'p'
139, // 'q'
245, // 'r'
243, // 's'
251, // 't'
235, // 'u'
201, // 'v'
196, // 'w'
240, // 'x'
214, // 'y'
152, // 'z'
182, // '{'
205, // '|'
181, // '}'
127, // '~'
27, // '\x7f'
212, // '\x80'
211, // '\x81'
210, // '\x82'
213, // '\x83'
228, // '\x84'
197, // '\x85'
169, // '\x86'
159, // '\x87'
131, // '\x88'
172, // '\x89'
105, // '\x8a'
80, // '\x8b'
98, // '\x8c'
96, // '\x8d'
97, // '\x8e'
81, // '\x8f'
207, // '\x90'
145, // '\x91'
116, // '\x92'
115, // '\x93'
144, // '\x94'
130, // '\x95'
153, // '\x96'
121, // '\x97'
107, // '\x98'
132, // '\x99'
109, // '\x9a'
110, // '\x9b'
124, // '\x9c'
111, // '\x9d'
82, // '\x9e'
108, // '\x9f'
118, // '\xa0'
141, // '¡'
113, // '¢'
129, // '£'
119, // '¤'
125, // '¥'
165, // '¦'
117, // '§'
92, // '¨'
106, // '©'
83, // 'ª'
72, // '«'
99, // '¬'
93, // '\xad'
65, // '®'
79, // '¯'
166, // '°'
237, // '±'
163, // '²'
199, // '³'
190, // '´'
225, // 'µ'
209, // '¶'
203, // '·'
198, // '¸'
217, // '¹'
219, // 'º'
206, // '»'
234, // '¼'
248, // '½'
158, // '¾'
239, // '¿'
255, // 'À'
255, // 'Á'
255, // 'Â'
255, // 'Ã'
255, // 'Ä'
255, // 'Å'
255, // 'Æ'
255, // 'Ç'
255, // 'È'
255, // 'É'
255, // 'Ê'
255, // 'Ë'
255, // 'Ì'
255, // 'Í'
255, // 'Î'
255, // 'Ï'
255, // 'Ð'
255, // 'Ñ'
255, // 'Ò'
255, // 'Ó'
255, // 'Ô'
255, // 'Õ'
255, // 'Ö'
255, // '×'
255, // 'Ø'
255, // 'Ù'
255, // 'Ú'
255, // 'Û'
255, // 'Ü'
255, // 'Ý'
255, // 'Þ'
255, // 'ß'
255, // 'à'
255, // 'á'
255, // 'â'
255, // 'ã'
255, // 'ä'
255, // 'å'
255, // 'æ'
255, // 'ç'
255, // 'è'
255, // 'é'
255, // 'ê'
255, // 'ë'
255, // 'ì'
255, // 'í'
255, // 'î'
255, // 'ï'
255, // 'ð'
255, // 'ñ'
255, // 'ò'
255, // 'ó'
255, // 'ô'
255, // 'õ'
255, // 'ö'
255, // '÷'
255, // 'ø'
255, // 'ù'
255, // 'ú'
255, // 'û'
255, // 'ü'
255, // 'ý'
255, // 'þ'
255, // 'ÿ'
];

26
vendor/aho-corasick/src/util/debug.rs vendored Normal file
View File

@@ -0,0 +1,26 @@
/// A type that wraps a single byte with a convenient fmt::Debug impl that
/// escapes the byte.
pub(crate) struct DebugByte(pub(crate) u8);
impl core::fmt::Debug for DebugByte {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
// Special case ASCII space. It's too hard to read otherwise, so
// put quotes around it. I sometimes wonder whether just '\x20' would
// be better...
if self.0 == b' ' {
return write!(f, "' '");
}
// 10 bytes is enough to cover any output from ascii::escape_default.
let mut bytes = [0u8; 10];
let mut len = 0;
for (i, mut b) in core::ascii::escape_default(self.0).enumerate() {
// capitalize \xab to \xAB
if i >= 2 && b'a' <= b && b <= b'f' {
b -= 32;
}
bytes[len] = b;
len += 1;
}
write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap())
}
}
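// A quick sketch of the output this impl produces (illustrative, not part
// of the original file):
//
//     assert_eq!(format!("{:?}", DebugByte(b' ')), "' '");
//     assert_eq!(format!("{:?}", DebugByte(b'a')), "a");
//     assert_eq!(format!("{:?}", DebugByte(0xAB)), r"\xAB");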

259
vendor/aho-corasick/src/util/error.rs vendored Normal file
View File

@@ -0,0 +1,259 @@
use crate::util::{
primitives::{PatternID, SmallIndex},
search::MatchKind,
};
/// An error that occurred during the construction of an Aho-Corasick
/// automaton.
///
/// Build errors occur when some kind of limit has been exceeded, either in the
/// number of states, the number of patterns, or the length of a pattern. These
/// limits aren't part of the public API, but they should generally be large
/// enough to handle most use cases.
///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
#[derive(Clone, Debug)]
pub struct BuildError {
kind: ErrorKind,
}
/// The kind of error that occurred.
#[derive(Clone, Debug)]
enum ErrorKind {
/// An error that occurs when allocating a new state would result in an
/// identifier that exceeds the capacity of a `StateID`.
StateIDOverflow {
/// The maximum possible id.
max: u64,
/// The maximum ID requested.
requested_max: u64,
},
/// An error that occurs when adding a pattern to an Aho-Corasick
/// automaton would result in an identifier that exceeds the capacity of a
/// `PatternID`.
PatternIDOverflow {
/// The maximum possible id.
max: u64,
/// The maximum ID requested.
requested_max: u64,
},
/// Occurs when a pattern string is given to the Aho-Corasick constructor
/// that is too long.
PatternTooLong {
/// The ID of the pattern that was too long.
pattern: PatternID,
/// The length that was too long.
len: usize,
},
}
impl BuildError {
pub(crate) fn state_id_overflow(
max: u64,
requested_max: u64,
) -> BuildError {
BuildError { kind: ErrorKind::StateIDOverflow { max, requested_max } }
}
pub(crate) fn pattern_id_overflow(
max: u64,
requested_max: u64,
) -> BuildError {
BuildError {
kind: ErrorKind::PatternIDOverflow { max, requested_max },
}
}
pub(crate) fn pattern_too_long(
pattern: PatternID,
len: usize,
) -> BuildError {
BuildError { kind: ErrorKind::PatternTooLong { pattern, len } }
}
}
#[cfg(feature = "std")]
impl std::error::Error for BuildError {}
impl core::fmt::Display for BuildError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self.kind {
ErrorKind::StateIDOverflow { max, requested_max } => {
write!(
f,
"state identifier overflow: failed to create state ID \
from {}, which exceeds the max of {}",
requested_max, max,
)
}
ErrorKind::PatternIDOverflow { max, requested_max } => {
write!(
f,
"pattern identifier overflow: failed to create pattern ID \
from {}, which exceeds the max of {}",
requested_max, max,
)
}
ErrorKind::PatternTooLong { pattern, len } => {
write!(
f,
"pattern {} with length {} exceeds \
the maximum pattern length of {}",
pattern.as_usize(),
len,
SmallIndex::MAX.as_usize(),
)
}
}
}
}
/// An error that occurred during an Aho-Corasick search.
///
/// An error that occurs during a search is limited to some kind of
/// misconfiguration that resulted in an illegal call. Stated differently,
/// whether an error occurs is not dependent on the specific bytes in the
/// haystack.
///
/// Examples of misconfiguration:
///
/// * Executing a stream or overlapping search on a searcher that was built
/// with something other than [`MatchKind::Standard`](crate::MatchKind::Standard)
/// semantics.
/// * Requested an anchored or an unanchored search on a searcher that doesn't
/// support unanchored or anchored searches, respectively.
///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct MatchError(alloc::boxed::Box<MatchErrorKind>);
impl MatchError {
/// Create a new error value with the given kind.
///
/// This is a more verbose version of the kind-specific constructors, e.g.,
/// `MatchError::unsupported_stream`.
pub fn new(kind: MatchErrorKind) -> MatchError {
MatchError(alloc::boxed::Box::new(kind))
}
/// Returns a reference to the underlying error kind.
pub fn kind(&self) -> &MatchErrorKind {
&self.0
}
/// Create a new "invalid anchored search" error. This occurs when the
/// caller requests an anchored search but where anchored searches aren't
/// supported.
///
/// This is the same as calling `MatchError::new` with a
/// [`MatchErrorKind::InvalidInputAnchored`] kind.
pub fn invalid_input_anchored() -> MatchError {
MatchError::new(MatchErrorKind::InvalidInputAnchored)
}
/// Create a new "invalid unanchored search" error. This occurs when the
/// caller requests an unanchored search but where unanchored searches
/// aren't supported.
///
/// This is the same as calling `MatchError::new` with a
/// [`MatchErrorKind::InvalidInputUnanchored`] kind.
pub fn invalid_input_unanchored() -> MatchError {
MatchError::new(MatchErrorKind::InvalidInputUnanchored)
}
/// Create a new "unsupported stream search" error. This occurs when the
/// caller requests a stream search while using an Aho-Corasick automaton
/// with a match kind other than [`MatchKind::Standard`].
///
/// The match kind given should be the match kind of the automaton. It
/// should never be `MatchKind::Standard`.
pub fn unsupported_stream(got: MatchKind) -> MatchError {
MatchError::new(MatchErrorKind::UnsupportedStream { got })
}
/// Create a new "unsupported overlapping search" error. This occurs when
/// the caller requests an overlapping search while using an Aho-Corasick
/// automaton with a match kind other than [`MatchKind::Standard`].
///
/// The match kind given should be the match kind of the automaton. It
/// should never be `MatchKind::Standard`.
pub fn unsupported_overlapping(got: MatchKind) -> MatchError {
MatchError::new(MatchErrorKind::UnsupportedOverlapping { got })
}
/// Create a new "unsupported empty pattern" error. This occurs when the
/// caller requests a search for which matching an automaton that contains
/// an empty pattern string is not supported.
pub fn unsupported_empty() -> MatchError {
MatchError::new(MatchErrorKind::UnsupportedEmpty)
}
}
/// The underlying kind of a [`MatchError`].
///
/// This is a **non-exhaustive** enum. That means new variants may be added in
/// a semver-compatible release.
#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum MatchErrorKind {
/// An error indicating that an anchored search was requested, but from a
/// searcher that was built without anchored support.
InvalidInputAnchored,
/// An error indicating that an unanchored search was requested, but from a
/// searcher that was built without unanchored support.
InvalidInputUnanchored,
/// An error indicating that a stream search was attempted on an
/// Aho-Corasick automaton with an unsupported `MatchKind`.
UnsupportedStream {
/// The match semantics for the automaton that was used.
got: MatchKind,
},
/// An error indicating that an overlapping search was attempted on an
/// Aho-Corasick automaton with an unsupported `MatchKind`.
UnsupportedOverlapping {
/// The match semantics for the automaton that was used.
got: MatchKind,
},
/// An error indicating that the operation requested doesn't support
/// automatons that contain an empty pattern string.
UnsupportedEmpty,
}
#[cfg(feature = "std")]
impl std::error::Error for MatchError {}
impl core::fmt::Display for MatchError {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
match *self.kind() {
MatchErrorKind::InvalidInputAnchored => {
write!(f, "anchored searches are not supported or enabled")
}
MatchErrorKind::InvalidInputUnanchored => {
write!(f, "unanchored searches are not supported or enabled")
}
MatchErrorKind::UnsupportedStream { got } => {
write!(
f,
"match kind {:?} does not support stream searching",
got,
)
}
MatchErrorKind::UnsupportedOverlapping { got } => {
write!(
f,
"match kind {:?} does not support overlapping searches",
got,
)
}
MatchErrorKind::UnsupportedEmpty => {
write!(
f,
"matching with an empty pattern string is not \
supported for this operation",
)
}
}
}
}
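// A short sketch of how a caller might construct and classify a search
// error using the API above (illustrative only):
//
//     let err = MatchError::unsupported_stream(MatchKind::LeftmostFirst);
//     match *err.kind() {
//         MatchErrorKind::UnsupportedStream { got } => {
//             assert_eq!(got, MatchKind::LeftmostFirst);
//         }
//         _ => unreachable!(),
//     }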

278
vendor/aho-corasick/src/util/int.rs vendored Normal file
View File

@@ -0,0 +1,278 @@
/*!
This module provides several integer oriented traits for converting between
both fixed size integers and integers whose size varies based on the target
(like `usize`).
The main design principle for this module is to centralize all uses of `as`.
The thinking here is that `as` makes it very easy to perform accidental lossy
conversions, and if we centralize all its uses here under more descriptive
higher level operations, its use and correctness becomes easier to audit.
This was copied mostly wholesale from `regex-automata`.
NOTE: for simplicity, we don't take target pointer width into account here for
`usize` conversions. Since we currently only panic in debug mode, skipping the
check when it can be proven it isn't needed at compile time doesn't really
matter. Now, if we wind up wanting to do as many checks as possible in release
mode, then we would want to skip those when we know the conversions are always
non-lossy.
*/
// We define a little more than what we need, but I'd rather just have
// everything via a consistent and uniform API than have holes.
#![allow(dead_code)]
pub(crate) trait U8 {
fn as_usize(self) -> usize;
}
impl U8 for u8 {
fn as_usize(self) -> usize {
usize::from(self)
}
}
pub(crate) trait U16 {
fn as_usize(self) -> usize;
fn low_u8(self) -> u8;
fn high_u8(self) -> u8;
}
impl U16 for u16 {
fn as_usize(self) -> usize {
usize::from(self)
}
fn low_u8(self) -> u8 {
self as u8
}
fn high_u8(self) -> u8 {
(self >> 8) as u8
}
}
pub(crate) trait U32 {
fn as_usize(self) -> usize;
fn low_u8(self) -> u8;
fn low_u16(self) -> u16;
fn high_u16(self) -> u16;
}
impl U32 for u32 {
#[inline]
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("u32 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn low_u8(self) -> u8 {
self as u8
}
fn low_u16(self) -> u16 {
self as u16
}
fn high_u16(self) -> u16 {
(self >> 16) as u16
}
}
pub(crate) trait U64 {
fn as_usize(self) -> usize;
fn low_u8(self) -> u8;
fn low_u16(self) -> u16;
fn low_u32(self) -> u32;
fn high_u32(self) -> u32;
}
impl U64 for u64 {
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("u64 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn low_u8(self) -> u8 {
self as u8
}
fn low_u16(self) -> u16 {
self as u16
}
fn low_u32(self) -> u32 {
self as u32
}
fn high_u32(self) -> u32 {
(self >> 32) as u32
}
}
pub(crate) trait I8 {
fn as_usize(self) -> usize;
fn to_bits(self) -> u8;
fn from_bits(n: u8) -> i8;
}
impl I8 for i8 {
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("i8 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn to_bits(self) -> u8 {
self as u8
}
fn from_bits(n: u8) -> i8 {
n as i8
}
}
pub(crate) trait I32 {
fn as_usize(self) -> usize;
fn to_bits(self) -> u32;
fn from_bits(n: u32) -> i32;
}
impl I32 for i32 {
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("i32 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn to_bits(self) -> u32 {
self as u32
}
fn from_bits(n: u32) -> i32 {
n as i32
}
}
pub(crate) trait I64 {
fn as_usize(self) -> usize;
fn to_bits(self) -> u64;
fn from_bits(n: u64) -> i64;
}
impl I64 for i64 {
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("i64 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn to_bits(self) -> u64 {
self as u64
}
fn from_bits(n: u64) -> i64 {
n as i64
}
}
pub(crate) trait Usize {
fn as_u8(self) -> u8;
fn as_u16(self) -> u16;
fn as_u32(self) -> u32;
fn as_u64(self) -> u64;
}
impl Usize for usize {
fn as_u8(self) -> u8 {
#[cfg(debug_assertions)]
{
u8::try_from(self).expect("usize overflowed u8")
}
#[cfg(not(debug_assertions))]
{
self as u8
}
}
fn as_u16(self) -> u16 {
#[cfg(debug_assertions)]
{
u16::try_from(self).expect("usize overflowed u16")
}
#[cfg(not(debug_assertions))]
{
self as u16
}
}
fn as_u32(self) -> u32 {
#[cfg(debug_assertions)]
{
u32::try_from(self).expect("usize overflowed u32")
}
#[cfg(not(debug_assertions))]
{
self as u32
}
}
fn as_u64(self) -> u64 {
#[cfg(debug_assertions)]
{
u64::try_from(self).expect("usize overflowed u64")
}
#[cfg(not(debug_assertions))]
{
self as u64
}
}
}
// Pointers aren't integers, but we convert pointers to integers to perform
// offset arithmetic in some places. (And no, we don't convert the integers
// back to pointers.) So add 'as_usize' conversions here too for completeness.
//
// These 'as' casts are actually okay because they're always non-lossy. But the
// idea here is to just try and remove as much 'as' as possible, particularly
// in this crate where we are being really paranoid about offsets and making
// sure we don't panic on inputs that might be untrusted. This way, the 'as'
// casts become easier to audit if they're all in one place, even when some of
// them are actually okay 100% of the time.
pub(crate) trait Pointer {
fn as_usize(self) -> usize;
}
impl<T> Pointer for *const T {
fn as_usize(self) -> usize {
self as usize
}
}
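// A sketch of the intended call sites for these traits: replacing bare
// `as` casts with named conversions that are checked in debug builds
// (illustrative only):
//
//     use crate::util::int::{Usize, U32};
//
//     let v = vec![0u8; 10];
//     let i: u32 = 3;
//     let byte = v[i.as_usize()];   // panics in debug builds if lossy
//     let len32 = v.len().as_u32(); // likewise checked in debug builds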

12
vendor/aho-corasick/src/util/mod.rs vendored Normal file
View File

@@ -0,0 +1,12 @@
pub(crate) mod alphabet;
#[cfg(feature = "std")]
pub(crate) mod buffer;
pub(crate) mod byte_frequencies;
pub(crate) mod debug;
pub(crate) mod error;
pub(crate) mod int;
pub(crate) mod prefilter;
pub(crate) mod primitives;
pub(crate) mod remapper;
pub(crate) mod search;
pub(crate) mod special;

924
vendor/aho-corasick/src/util/prefilter.rs vendored Normal file
View File

@@ -0,0 +1,924 @@
use core::{
cmp,
fmt::Debug,
panic::{RefUnwindSafe, UnwindSafe},
u8,
};
use alloc::{sync::Arc, vec, vec::Vec};
use crate::{
packed,
util::{
alphabet::ByteSet,
search::{Match, MatchKind, Span},
},
};
/// A prefilter for accelerating a search.
///
/// This crate uses prefilters in the core search implementations to accelerate
/// common cases. They typically only apply to cases where there are a small
/// number of patterns (less than 100 or so), but when they do, throughput can
/// be boosted considerably, perhaps by an order of magnitude. When a prefilter
/// is active, it is used whenever a search enters an automaton's start state.
///
/// Currently, prefilters cannot be constructed by
/// callers. A `Prefilter` can only be accessed via the
/// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter)
/// method and used to execute a search. In other words, a prefilter can be
/// used to optimize your own search implementation if necessary, but cannot do
/// much else. If you have a use case for more APIs, please submit an issue.
#[derive(Clone, Debug)]
pub struct Prefilter {
finder: Arc<dyn PrefilterI>,
memory_usage: usize,
}
impl Prefilter {
/// Execute a search in the haystack within the span given. If a match or
/// a possible match is returned, then it is guaranteed to occur within
/// the bounds of the span.
///
/// If the span provided is invalid for the given haystack, then behavior
/// is unspecified.
#[inline]
pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
self.finder.find_in(haystack, span)
}
#[inline]
pub(crate) fn memory_usage(&self) -> usize {
self.memory_usage
}
}
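// A sketch of how a search loop might drive `find_in` (illustrative; the
// resume step is elided, and the names follow this module's API as defined
// below):
//
//     match pre.find_in(haystack, span) {
//         Candidate::None => return None,         // no match anywhere in span
//         Candidate::Match(m) => return Some(m),  // already confirmed
//         Candidate::PossibleStartOfMatch(at) => {
//             // Restart the automaton at `at` to confirm or reject.
//         }
//     }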
/// A candidate is the result of running a prefilter on a haystack at a
/// particular position.
///
/// The result is either no match, a confirmed match or a possible match.
///
/// When no match is returned, the prefilter is guaranteeing that no possible
/// match can be found in the haystack, and the caller may trust this. That is,
/// all correct prefilters must never report false negatives.
///
/// In some cases, a prefilter can confirm a match very quickly, in which case,
/// the caller may use this to stop what it's doing and report the match. In
/// this case, prefilter implementations must never report a false positive.
/// In other cases, the prefilter can only report a potential match, in which
/// case the callers must attempt to confirm the match. In this case, prefilter
/// implementations are permitted to return false positives.
#[derive(Clone, Debug)]
pub enum Candidate {
/// No match was found. Since false negatives are not possible, this means
/// the search can quit as it is guaranteed not to find another match.
None,
/// A confirmed match was found. Callers do not need to confirm it.
Match(Match),
/// The start of a possible match was found. Callers must confirm it before
/// reporting it as a match.
PossibleStartOfMatch(usize),
}
impl Candidate {
/// Convert this candidate into an option. This is useful when callers
/// do not distinguish between true positives and false positives (i.e.,
/// the caller must always confirm the match).
pub fn into_option(self) -> Option<usize> {
match self {
Candidate::None => None,
Candidate::Match(ref m) => Some(m.start()),
Candidate::PossibleStartOfMatch(start) => Some(start),
}
}
}
/// A prefilter describes the behavior of fast literal scanners for quickly
/// skipping past bytes in the haystack that we know cannot possibly
/// participate in a match.
trait PrefilterI:
Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static
{
/// Returns the next possible match candidate. This may yield false
/// positives, so callers must confirm a match starting at the position
/// returned. This, however, must never produce false negatives. That is,
/// this must, at minimum, return the starting position of the next match
/// in the given haystack after or at the given position.
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate;
}
impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
#[inline(always)]
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
(**self).find_in(haystack, span)
}
}
/// A builder for constructing the best possible prefilter. When constructed,
/// this builder will heuristically select the best prefilter it can build,
/// if any, and discard the rest.
#[derive(Debug)]
pub(crate) struct Builder {
count: usize,
ascii_case_insensitive: bool,
start_bytes: StartBytesBuilder,
rare_bytes: RareBytesBuilder,
memmem: MemmemBuilder,
packed: Option<packed::Builder>,
// If we run across a condition that suggests we shouldn't use a prefilter
// at all (like an empty pattern), then disable prefilters entirely.
enabled: bool,
}
impl Builder {
/// Create a new builder for constructing the best possible prefilter.
pub(crate) fn new(kind: MatchKind) -> Builder {
let pbuilder = kind
.as_packed()
.map(|kind| packed::Config::new().match_kind(kind).builder());
Builder {
count: 0,
ascii_case_insensitive: false,
start_bytes: StartBytesBuilder::new(),
rare_bytes: RareBytesBuilder::new(),
memmem: MemmemBuilder::default(),
packed: pbuilder,
enabled: true,
}
}
/// Enable ASCII case insensitivity. When set, byte strings added to this
/// builder will be interpreted without respect to ASCII case.
pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
self.ascii_case_insensitive = yes;
self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
self
}
/// Return a prefilter suitable for quickly finding potential matches.
///
/// All patterns added to an Aho-Corasick automaton should be added to this
/// builder before attempting to construct the prefilter.
pub(crate) fn build(&self) -> Option<Prefilter> {
if !self.enabled {
debug!("prefilter not enabled, skipping");
return None;
}
// If we only have one pattern, then deferring to memmem is always
// the best choice. This is kind of a weird case, because, well, why
// use Aho-Corasick if you only have one pattern? But maybe you don't
// know exactly how many patterns you'll get up front, and you need to
// support the option of multiple patterns. So instead of relying on
// the caller to branch and use memmem explicitly, we just do it for
// them.
if !self.ascii_case_insensitive {
if let Some(pre) = self.memmem.build() {
debug!("using memmem prefilter");
return Some(pre);
}
}
let (packed, patlen, minlen) = if self.ascii_case_insensitive {
(None, usize::MAX, 0)
} else {
let patlen = self.packed.as_ref().map_or(usize::MAX, |p| p.len());
let minlen = self.packed.as_ref().map_or(0, |p| p.minimum_len());
let packed =
self.packed.as_ref().and_then(|b| b.build()).map(|s| {
let memory_usage = s.memory_usage();
debug!(
"built packed prefilter (len: {}, \
minimum pattern len: {}, memory usage: {}) \
for consideration",
patlen, minlen, memory_usage,
);
Prefilter { finder: Arc::new(Packed(s)), memory_usage }
});
(packed, patlen, minlen)
};
match (self.start_bytes.build(), self.rare_bytes.build()) {
// If we could build both start and rare prefilters, then there are
// a few cases in which we'd want to use the start-byte prefilter
// over the rare-byte prefilter, since the former has lower
// overhead.
(prestart @ Some(_), prerare @ Some(_)) => {
debug!(
"both start (len={}, rank={}) and \
rare (len={}, rank={}) byte prefilters \
are available",
self.start_bytes.count,
self.start_bytes.rank_sum,
self.rare_bytes.count,
self.rare_bytes.rank_sum,
);
if patlen <= 16
&& minlen >= 2
&& self.start_bytes.count >= 3
&& self.rare_bytes.count >= 3
{
debug!(
"start and rare byte prefilters available, but \
they're probably slower than packed so using \
packed"
);
return packed;
}
// If the start-byte prefilter can scan for a smaller number
// of bytes than the rare-byte prefilter, then it's probably
// faster.
let has_fewer_bytes =
self.start_bytes.count < self.rare_bytes.count;
// Otherwise, if the combined frequency rank of the detected
// bytes in the start-byte prefilter is "close" to the combined
// frequency rank of the rare-byte prefilter, then we pick
// the start-byte prefilter even if the rare-byte prefilter
// heuristically searches for rare bytes. This is because the
// rare-byte prefilter has higher constant costs, so we tend to
// prefer the start-byte prefilter when we can.
let has_rarer_bytes =
self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
if has_fewer_bytes {
debug!(
"using start byte prefilter because it has fewer
bytes to search for than the rare byte prefilter",
);
prestart
} else if has_rarer_bytes {
debug!(
"using start byte prefilter because its byte \
frequency rank was determined to be \
\"good enough\" relative to the rare byte prefilter \
byte frequency rank",
);
prestart
} else {
debug!("using rare byte prefilter");
prerare
}
}
(prestart @ Some(_), None) => {
if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 {
debug!(
"start byte prefilter available, but \
it's probably slower than packed so using \
packed"
);
return packed;
}
debug!(
"have start byte prefilter but not rare byte prefilter, \
so using start byte prefilter",
);
prestart
}
(None, prerare @ Some(_)) => {
if patlen <= 16 && minlen >= 2 && self.rare_bytes.count >= 3 {
debug!(
"rare byte prefilter available, but \
it's probably slower than packed so using \
packed"
);
return packed;
}
debug!(
"have rare byte prefilter but not start byte prefilter, \
so using rare byte prefilter",
);
prerare
}
(None, None) if self.ascii_case_insensitive => {
debug!(
"no start or rare byte prefilter and ASCII case \
insensitivity was enabled, so skipping prefilter",
);
None
}
(None, None) => {
if packed.is_some() {
debug!("falling back to packed prefilter");
} else {
debug!("no prefilter available");
}
packed
}
}
}
/// Add a literal string to this prefilter builder.
pub(crate) fn add(&mut self, bytes: &[u8]) {
if bytes.is_empty() {
self.enabled = false;
}
if !self.enabled {
return;
}
self.count += 1;
self.start_bytes.add(bytes);
self.rare_bytes.add(bytes);
self.memmem.add(bytes);
if let Some(ref mut pbuilder) = self.packed {
pbuilder.add(bytes);
}
}
}
/// A type that wraps a packed searcher and implements the `Prefilter`
/// interface.
#[derive(Clone, Debug)]
struct Packed(packed::Searcher);
impl PrefilterI for Packed {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
self.0
.find_in(&haystack, span)
.map_or(Candidate::None, Candidate::Match)
}
}
/// A builder for constructing a prefilter that uses memmem.
#[derive(Debug, Default)]
struct MemmemBuilder {
/// The number of patterns that have been added.
count: usize,
/// The singular pattern to search for. This is only set when count==1.
one: Option<Vec<u8>>,
}
impl MemmemBuilder {
fn build(&self) -> Option<Prefilter> {
#[cfg(all(feature = "std", feature = "perf-literal"))]
fn imp(builder: &MemmemBuilder) -> Option<Prefilter> {
let pattern = builder.one.as_ref()?;
assert_eq!(1, builder.count);
let finder = Arc::new(Memmem(
memchr::memmem::Finder::new(pattern).into_owned(),
));
let memory_usage = pattern.len();
Some(Prefilter { finder, memory_usage })
}
#[cfg(not(all(feature = "std", feature = "perf-literal")))]
fn imp(_: &MemmemBuilder) -> Option<Prefilter> {
None
}
imp(self)
}
fn add(&mut self, bytes: &[u8]) {
self.count += 1;
if self.count == 1 {
self.one = Some(bytes.to_vec());
} else {
self.one = None;
}
}
}
/// A type that wraps a SIMD accelerated single substring search from the
/// `memchr` crate for use as a prefilter.
///
/// Currently, this prefilter is only active for Aho-Corasick searchers with
/// a single pattern. In theory, this could be extended to support searchers
/// that have a common prefix of more than one byte (for one byte, we would use
/// memchr), but it's not clear if it's worth it or not.
///
/// Also, unfortunately, this currently also requires the 'std' feature to
/// be enabled. That's because memchr doesn't have a no-std-but-with-alloc
/// mode, and so APIs like Finder::into_owned aren't available when 'std' is
/// disabled. But there should be an 'alloc' feature that brings in APIs like
/// Finder::into_owned but doesn't use std-only features like runtime CPU
/// feature detection.
#[cfg(all(feature = "std", feature = "perf-literal"))]
#[derive(Clone, Debug)]
struct Memmem(memchr::memmem::Finder<'static>);
#[cfg(all(feature = "std", feature = "perf-literal"))]
impl PrefilterI for Memmem {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
use crate::util::primitives::PatternID;
self.0.find(&haystack[span]).map_or(Candidate::None, |i| {
let start = span.start + i;
let end = start + self.0.needle().len();
// N.B. We can declare a match and use a fixed pattern ID here
// because a Memmem prefilter is only ever created for searchers
// with exactly one pattern. Thus, every match is always a match
// and it is always for the first and only pattern.
Candidate::Match(Match::new(PatternID::ZERO, start..end))
})
}
}
/// A builder for constructing a rare byte prefilter.
///
/// A rare byte prefilter attempts to pick out a small set of rare bytes that
/// occur in the patterns, and then quickly scan for occurrences of those rare
/// bytes.
#[derive(Clone, Debug)]
struct RareBytesBuilder {
/// Whether this prefilter should account for ASCII case insensitivity or
/// not.
ascii_case_insensitive: bool,
/// A set of rare bytes, indexed by byte value.
rare_set: ByteSet,
/// A set of byte offsets associated with bytes in a pattern. An entry
/// corresponds to a particular byte (its index) and is only non-zero if
/// the byte occurred at an offset greater than 0 in at least one pattern.
///
/// If a byte's offset is not representable in 8 bits, then the rare bytes
/// prefilter becomes inert.
byte_offsets: RareByteOffsets,
/// Whether this is available as a prefilter or not. This can be set to
/// false during construction if a condition is seen that invalidates the
/// use of the rare-byte prefilter.
available: bool,
/// The number of bytes set to an active value in `byte_offsets`.
count: usize,
/// The sum of frequency ranks for the rare bytes detected. This is
/// intended to give a heuristic notion of how rare the bytes are.
rank_sum: u16,
}
/// A set of byte offsets, keyed by byte.
#[derive(Clone, Copy)]
struct RareByteOffsets {
/// Each entry corresponds to the maximum offset of the corresponding
/// byte across all patterns seen.
set: [RareByteOffset; 256],
}
impl RareByteOffsets {
/// Create a new empty set of rare byte offsets.
pub(crate) fn empty() -> RareByteOffsets {
RareByteOffsets { set: [RareByteOffset::default(); 256] }
}
/// Add the given offset for the given byte to this set. If the given
/// offset is greater than the offset already recorded for this byte,
/// then it overwrites the previous value. Otherwise, the existing
/// (larger) offset is kept.
pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) {
self.set[byte as usize].max =
cmp::max(self.set[byte as usize].max, off.max);
}
}
impl core::fmt::Debug for RareByteOffsets {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let mut offsets = vec![];
for off in self.set.iter() {
if off.max > 0 {
offsets.push(off);
}
}
f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
}
}
/// Offsets associated with an occurrence of a "rare" byte in any of the
/// patterns used to construct a single Aho-Corasick automaton.
#[derive(Clone, Copy, Debug)]
struct RareByteOffset {
/// The maximum offset at which a particular byte occurs from the start
/// of any pattern. This is used as a shift amount. That is, when an
/// occurrence of this byte is found, the candidate position reported by
/// the prefilter is `position_of_byte - max`, such that the automaton
/// will begin its search at a position that is guaranteed to observe a
/// match.
///
/// To avoid accidentally quadratic behavior, a prefilter is considered
/// ineffective when it is asked to start scanning from a position that it
/// has already scanned past.
///
/// Using a `u8` here means that if we ever see a pattern that's longer
/// than 255 bytes, then the entire rare byte prefilter is disabled.
max: u8,
}
impl Default for RareByteOffset {
fn default() -> RareByteOffset {
RareByteOffset { max: 0 }
}
}
impl RareByteOffset {
/// Create a new rare byte offset. If the given offset is too big, then
/// None is returned. In that case, callers should render the rare bytes
/// prefilter inert.
fn new(max: usize) -> Option<RareByteOffset> {
if max > u8::MAX as usize {
None
} else {
Some(RareByteOffset { max: max as u8 })
}
}
}
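// A worked example of the shift described above (illustrative): for the
// pattern "lockjaw", the rare byte b'k' occurs at offset 3, giving
// RareByteOffset { max: 3 }. If a scan finds b'k' at haystack position 10,
// the candidate reported is 10 - 3 = 7, so an occurrence of "lockjaw"
// whose 'k' lands at position 10 cannot start before the reported position.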
impl RareBytesBuilder {
/// Create a new builder for constructing a rare byte prefilter.
fn new() -> RareBytesBuilder {
RareBytesBuilder {
ascii_case_insensitive: false,
rare_set: ByteSet::empty(),
byte_offsets: RareByteOffsets::empty(),
available: true,
count: 0,
rank_sum: 0,
}
}
/// Enable ASCII case insensitivity. When set, byte strings added to this
/// builder will be interpreted without respect to ASCII case.
fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
self.ascii_case_insensitive = yes;
self
}
/// Build the rare bytes prefilter.
///
/// If there are more than 3 distinct rare bytes found, or if heuristics
/// otherwise determine that this prefilter should not be used, then `None`
/// is returned.
fn build(&self) -> Option<Prefilter> {
#[cfg(feature = "perf-literal")]
fn imp(builder: &RareBytesBuilder) -> Option<Prefilter> {
if !builder.available || builder.count > 3 {
return None;
}
let (mut bytes, mut len) = ([0; 3], 0);
for b in 0..=255 {
if builder.rare_set.contains(b) {
bytes[len] = b as u8;
len += 1;
}
}
let finder: Arc<dyn PrefilterI> = match len {
0 => return None,
1 => Arc::new(RareBytesOne {
byte1: bytes[0],
offset: builder.byte_offsets.set[bytes[0] as usize],
}),
2 => Arc::new(RareBytesTwo {
offsets: builder.byte_offsets,
byte1: bytes[0],
byte2: bytes[1],
}),
3 => Arc::new(RareBytesThree {
offsets: builder.byte_offsets,
byte1: bytes[0],
byte2: bytes[1],
byte3: bytes[2],
}),
_ => unreachable!(),
};
Some(Prefilter { finder, memory_usage: 0 })
}
#[cfg(not(feature = "perf-literal"))]
fn imp(_: &RareBytesBuilder) -> Option<Prefilter> {
None
}
imp(self)
}
/// Add a byte string to this builder.
///
/// All patterns added to an Aho-Corasick automaton should be added to this
/// builder before attempting to construct the prefilter.
fn add(&mut self, bytes: &[u8]) {
// If we've already given up, then do nothing.
if !self.available {
return;
}
// If we've already blown our budget, then don't waste time looking
// for more rare bytes.
if self.count > 3 {
self.available = false;
return;
}
// If the pattern is too long, then our offset table is bunk, so
// give up.
if bytes.len() >= 256 {
self.available = false;
return;
}
let mut rarest = match bytes.get(0) {
None => return,
Some(&b) => (b, freq_rank(b)),
};
// The idea here is to look for the rarest byte in each pattern, and
// add that to our set. As a special exception, if we see a byte that
// we've already added, then we immediately stop and choose that byte,
// even if there's another rare byte in the pattern. This helps us
// apply the rare byte optimization in more cases by attempting to pick
// bytes that are in common between patterns. So for example, if we
// were searching for `Sherlock` and `lockjaw`, then this would pick
// `k` for both patterns, resulting in the use of `memchr` instead of
// `memchr2` for `k` and `j`.
let mut found = false;
for (pos, &b) in bytes.iter().enumerate() {
self.set_offset(pos, b);
if found {
continue;
}
if self.rare_set.contains(b) {
found = true;
continue;
}
let rank = freq_rank(b);
if rank < rarest.1 {
rarest = (b, rank);
}
}
if !found {
self.add_rare_byte(rarest.0);
}
}
fn set_offset(&mut self, pos: usize, byte: u8) {
// This unwrap is OK because pos is never bigger than our max.
let offset = RareByteOffset::new(pos).unwrap();
self.byte_offsets.set(byte, offset);
if self.ascii_case_insensitive {
self.byte_offsets.set(opposite_ascii_case(byte), offset);
}
}
fn add_rare_byte(&mut self, byte: u8) {
self.add_one_rare_byte(byte);
if self.ascii_case_insensitive {
self.add_one_rare_byte(opposite_ascii_case(byte));
}
}
fn add_one_rare_byte(&mut self, byte: u8) {
if !self.rare_set.contains(byte) {
self.rare_set.add(byte);
self.count += 1;
self.rank_sum += freq_rank(byte) as u16;
}
}
}
/// A prefilter for scanning for a single "rare" byte.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct RareBytesOne {
byte1: u8,
offset: RareByteOffset,
}
#[cfg(feature = "perf-literal")]
impl PrefilterI for RareBytesOne {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
memchr::memchr(self.byte1, &haystack[span])
.map(|i| {
let pos = span.start + i;
cmp::max(
span.start,
pos.saturating_sub(usize::from(self.offset.max)),
)
})
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
}
/// A prefilter for scanning for two "rare" bytes.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct RareBytesTwo {
offsets: RareByteOffsets,
byte1: u8,
byte2: u8,
}
#[cfg(feature = "perf-literal")]
impl PrefilterI for RareBytesTwo {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
memchr::memchr2(self.byte1, self.byte2, &haystack[span])
.map(|i| {
let pos = span.start + i;
let offset = self.offsets.set[usize::from(haystack[pos])].max;
cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
})
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
}
/// A prefilter for scanning for three "rare" bytes.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct RareBytesThree {
offsets: RareByteOffsets,
byte1: u8,
byte2: u8,
byte3: u8,
}
#[cfg(feature = "perf-literal")]
impl PrefilterI for RareBytesThree {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
.map(|i| {
let pos = span.start + i;
let offset = self.offsets.set[usize::from(haystack[pos])].max;
cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
})
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
}
/// A builder for constructing a starting byte prefilter.
///
/// A starting byte prefilter is a simplistic prefilter that looks for possible
/// matches by reporting all positions corresponding to a particular byte. This
/// generally only takes effect when there are at most 3 distinct possible
/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
/// distinct starting bytes (`f` and `b`), and this prefilter returns all
/// occurrences of either `f` or `b`.
///
/// In some cases, a heuristic frequency analysis may determine that it would
/// be better not to use this prefilter even when there are 3 or fewer distinct
/// starting bytes.
#[derive(Clone, Debug)]
struct StartBytesBuilder {
/// Whether this prefilter should account for ASCII case insensitivity or
/// not.
ascii_case_insensitive: bool,
/// The set of starting bytes observed.
byteset: Vec<bool>,
/// The number of bytes set to true in `byteset`.
count: usize,
/// The sum of frequency ranks for the rare bytes detected. This is
/// intended to give a heuristic notion of how rare the bytes are.
rank_sum: u16,
}
impl StartBytesBuilder {
/// Create a new builder for constructing a start byte prefilter.
fn new() -> StartBytesBuilder {
StartBytesBuilder {
ascii_case_insensitive: false,
byteset: vec![false; 256],
count: 0,
rank_sum: 0,
}
}
/// Enable ASCII case insensitivity. When set, byte strings added to this
/// builder will be interpreted without respect to ASCII case.
fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
self.ascii_case_insensitive = yes;
self
}
/// Build the starting bytes prefilter.
///
/// If there are more than 3 distinct starting bytes, or if heuristics
/// otherwise determine that this prefilter should not be used, then `None`
/// is returned.
fn build(&self) -> Option<Prefilter> {
#[cfg(feature = "perf-literal")]
fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> {
if builder.count > 3 {
return None;
}
let (mut bytes, mut len) = ([0; 3], 0);
for b in 0..256 {
if !builder.byteset[b] {
continue;
}
// We don't handle non-ASCII bytes for now. Getting non-ASCII
// bytes right is trickier, since we generally don't want to put
// a leading UTF-8 code unit into a prefilter, since such bytes
// tend to occur very frequently. Instead, it would be better to
// use a continuation byte, but this requires more sophisticated
// analysis of the automaton and a richer prefilter API.
if b > 0x7F {
return None;
}
bytes[len] = b as u8;
len += 1;
}
let finder: Arc<dyn PrefilterI> = match len {
0 => return None,
1 => Arc::new(StartBytesOne { byte1: bytes[0] }),
2 => Arc::new(StartBytesTwo {
byte1: bytes[0],
byte2: bytes[1],
}),
3 => Arc::new(StartBytesThree {
byte1: bytes[0],
byte2: bytes[1],
byte3: bytes[2],
}),
_ => unreachable!(),
};
Some(Prefilter { finder, memory_usage: 0 })
}
#[cfg(not(feature = "perf-literal"))]
fn imp(_: &StartBytesBuilder) -> Option<Prefilter> {
None
}
imp(self)
}
/// Add a byte string to this builder.
///
/// All patterns added to an Aho-Corasick automaton should be added to this
/// builder before attempting to construct the prefilter.
fn add(&mut self, bytes: &[u8]) {
if self.count > 3 {
return;
}
if let Some(&byte) = bytes.get(0) {
self.add_one_byte(byte);
if self.ascii_case_insensitive {
self.add_one_byte(opposite_ascii_case(byte));
}
}
}
fn add_one_byte(&mut self, byte: u8) {
if !self.byteset[byte as usize] {
self.byteset[byte as usize] = true;
self.count += 1;
self.rank_sum += freq_rank(byte) as u16;
}
}
}
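// Tying the doc example above to this builder (illustrative): for the
// patterns "foo", "bar" and "baz", `add` records the starting bytes
// {b'f', b'b'} (count == 2), so `build` selects the `StartBytesTwo`
// finder below, which scans with memchr2.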
/// A prefilter for scanning for a single starting byte.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct StartBytesOne {
byte1: u8,
}
#[cfg(feature = "perf-literal")]
impl PrefilterI for StartBytesOne {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
memchr::memchr(self.byte1, &haystack[span])
.map(|i| span.start + i)
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
}
/// A prefilter for scanning for two starting bytes.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct StartBytesTwo {
byte1: u8,
byte2: u8,
}
#[cfg(feature = "perf-literal")]
impl PrefilterI for StartBytesTwo {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
memchr::memchr2(self.byte1, self.byte2, &haystack[span])
.map(|i| span.start + i)
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
}
/// A prefilter for scanning for three starting bytes.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct StartBytesThree {
byte1: u8,
byte2: u8,
byte3: u8,
}
#[cfg(feature = "perf-literal")]
impl PrefilterI for StartBytesThree {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
.map(|i| span.start + i)
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
}
/// If the given byte is an ASCII letter, then return it in the opposite case.
/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
/// `b'A'`. If the given byte is not an ASCII letter, then it is returned
/// unchanged.
pub(crate) fn opposite_ascii_case(b: u8) -> u8 {
if b'A' <= b && b <= b'Z' {
b.to_ascii_lowercase()
} else if b'a' <= b && b <= b'z' {
b.to_ascii_uppercase()
} else {
b
}
}
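// Quick checks of the mapping (illustrative):
//
//     assert_eq!(opposite_ascii_case(b'A'), b'a');
//     assert_eq!(opposite_ascii_case(b'z'), b'Z');
//     assert_eq!(opposite_ascii_case(b'3'), b'3');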
/// Return the frequency rank of the given byte. The higher the rank, the more
/// common the byte (heuristically speaking).
fn freq_rank(b: u8) -> u8 {
use crate::util::byte_frequencies::BYTE_FREQUENCIES;
BYTE_FREQUENCIES[b as usize]
}

759
vendor/aho-corasick/src/util/primitives.rs vendored Normal file
View File

@@ -0,0 +1,759 @@
/*!
Lower level primitive types that are useful in a variety of circumstances.
# Overview
This list represents the principal types in this module and briefly describes
when you might want to use them.
* [`PatternID`] - A type that represents the identifier of a regex pattern.
This is probably the most widely used type in this module (which is why it's
also re-exported in the crate root).
* [`StateID`] - A type that represents the identifier of a finite automaton
state. This is used for both NFAs and DFAs, with the notable exception of
the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state
identifier.)
* [`SmallIndex`] - The internal representation of both a `PatternID` and a
`StateID`. Its purpose is to serve as a type that can index memory without
being as big as a `usize` on 64-bit targets. The main idea behind this type
is that there are many things in regex engines that will, in practice, never
overflow a 32-bit integer. (For example, like the number of patterns in a regex
or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index
memory without peppering `as` casts everywhere. Moreover, it forces callers
to handle errors in the case where, somehow, the value would otherwise overflow
either a 32-bit integer or a `usize` (e.g., on 16-bit targets).
*/
// The macro we use to define some types below adds methods that we don't
// use on some of the types. There isn't much, so we just squash the warning.
#![allow(dead_code)]
use alloc::vec::Vec;
use crate::util::int::{Usize, U16, U32, U64};
/// A type that represents a "small" index.
///
/// The main idea of this type is to provide something that can index memory,
/// but uses less memory than `usize` on 64-bit systems. Specifically, its
/// representation is always a `u32` and has `repr(transparent)` enabled. (So
/// it is safe to transmute between a `u32` and a `SmallIndex`.)
///
/// A small index is typically useful in cases where there is no practical way
/// that the index will overflow a 32-bit integer. A good example of this is
/// an NFA state. If you could somehow build an NFA with `2^30` states, its
/// memory usage would be exorbitant and its runtime execution would be so
/// slow as to be completely worthless. Therefore, this crate generally deems
/// it acceptable to return an error if it would otherwise build an NFA that
/// requires a slice longer than what a 32-bit integer can index. In exchange,
/// we can use 32-bit indices instead of 64-bit indices in various places.
///
/// This type ensures this by providing a constructor that will return an error
/// if its argument cannot fit into the type. This makes it much easier to
/// handle these sorts of boundary cases that are otherwise extremely subtle.
///
/// On all targets, this type guarantees that its value will fit in a `u32`,
/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for
/// example, this type's maximum value will never overflow an `isize`,
/// which means it will never overflow an `i16` even though its internal
/// representation is still a `u32`.
///
/// The purpose for making the type fit into even signed integer types like
/// `isize` is to guarantee that the difference between any two small indices
/// is itself also a small index. This is useful in certain contexts, e.g.,
/// for delta encoding.
///
/// # Other types
///
/// The following types wrap `SmallIndex` to provide a more focused use case:
///
/// * [`PatternID`] is for representing the identifiers of patterns.
/// * [`StateID`] is for representing the identifiers of states in finite
/// automata. It is used for both NFAs and DFAs.
///
/// # Representation
///
/// This type is always represented internally by a `u32` and is marked as
/// `repr(transparent)`. Thus, this type always has the same representation as
/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`.
///
/// # Indexing
///
/// For convenience, callers may use a `SmallIndex` to index slices.
///
/// # Safety
///
/// While a `SmallIndex` is meant to guarantee that its value fits into `usize`
/// without using as much space as a `usize` on all targets, callers must
/// not rely on this property for safety. Callers may choose to rely on this
/// property for correctness however. For example, creating a `SmallIndex` with
/// an invalid value can be done in entirely safe code. This may in turn result
/// in panics or silent logical errors.
#[derive(
Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
#[repr(transparent)]
pub(crate) struct SmallIndex(u32);
impl SmallIndex {
/// The maximum index value.
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
pub const MAX: SmallIndex =
// FIXME: Use as_usize() once const functions in traits are stable.
SmallIndex::new_unchecked(core::i32::MAX as usize - 1);
/// The maximum index value.
#[cfg(target_pointer_width = "16")]
pub const MAX: SmallIndex =
SmallIndex::new_unchecked(core::isize::MAX as usize - 1);
/// The total number of values that can be represented as a small index.
pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1;
/// The zero index value.
pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0);
/// The number of bytes that a single small index uses in memory.
pub const SIZE: usize = core::mem::size_of::<SmallIndex>();
/// Create a new small index.
///
/// If the given index exceeds [`SmallIndex::MAX`], then this returns
/// an error.
#[inline]
pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> {
SmallIndex::try_from(index)
}
/// Create a new small index without checking whether the given value
/// exceeds [`SmallIndex::MAX`].
///
/// Using this routine with an invalid index value will result in
/// unspecified behavior, but *not* undefined behavior. In particular, an
/// invalid index value is likely to cause panics or possibly even silent
/// logical errors.
///
/// Callers must never rely on a `SmallIndex` to be within a certain range
/// for memory safety.
#[inline]
pub const fn new_unchecked(index: usize) -> SmallIndex {
// FIXME: Use as_u32() once const functions in traits are stable.
SmallIndex::from_u32_unchecked(index as u32)
}
/// Create a new small index from a `u32` without checking whether the
/// given value exceeds [`SmallIndex::MAX`].
///
/// Using this routine with an invalid index value will result in
/// unspecified behavior, but *not* undefined behavior. In particular, an
/// invalid index value is likely to cause panics or possibly even silent
/// logical errors.
///
/// Callers must never rely on a `SmallIndex` to be within a certain range
/// for memory safety.
#[inline]
pub const fn from_u32_unchecked(index: u32) -> SmallIndex {
SmallIndex(index)
}
/// Like [`SmallIndex::new`], but panics if the given index is not valid.
#[inline]
pub fn must(index: usize) -> SmallIndex {
SmallIndex::new(index).expect("invalid small index")
}
/// Return this small index as a `usize`. This is guaranteed to never
/// overflow `usize`.
#[inline]
pub const fn as_usize(&self) -> usize {
// FIXME: Use as_usize() once const functions in traits are stable.
self.0 as usize
}
/// Return this small index as a `u64`. This is guaranteed to never
/// overflow.
#[inline]
pub const fn as_u64(&self) -> u64 {
// FIXME: Use u64::from() once const functions in traits are stable.
self.0 as u64
}
/// Return the internal `u32` of this small index. This is guaranteed to
/// never overflow `u32`.
#[inline]
pub const fn as_u32(&self) -> u32 {
self.0
}
/// Return the internal `u32` of this small index represented as an `i32`.
/// This is guaranteed to never overflow an `i32`.
#[inline]
pub const fn as_i32(&self) -> i32 {
// This is OK because we guarantee that our max value is <= i32::MAX.
self.0 as i32
}
/// Returns one more than this small index as a usize.
///
/// Since a small index has constraints on its maximum value, adding `1` to
/// it will always fit in a `usize`, `isize`, `u32` and an `i32`.
#[inline]
pub fn one_more(&self) -> usize {
self.as_usize() + 1
}
/// Decode this small index from the bytes given using the native endian
/// byte order for the current target.
///
/// If the decoded integer is not representable as a small index for the
/// current target, then this returns an error.
#[inline]
pub fn from_ne_bytes(
bytes: [u8; 4],
) -> Result<SmallIndex, SmallIndexError> {
let id = u32::from_ne_bytes(bytes);
if id > SmallIndex::MAX.as_u32() {
return Err(SmallIndexError { attempted: u64::from(id) });
}
Ok(SmallIndex::new_unchecked(id.as_usize()))
}
/// Decode this small index from the bytes given using the native endian
/// byte order for the current target.
///
/// This is analogous to [`SmallIndex::new_unchecked`] in that it does not
/// check whether the decoded integer is representable as a small index.
#[inline]
pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex {
SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize())
}
/// Return the underlying small index integer as raw bytes in native endian
/// format.
#[inline]
pub fn to_ne_bytes(&self) -> [u8; 4] {
self.0.to_ne_bytes()
}
}
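// A sketch of typical `SmallIndex` usage (illustrative only):
//
//     let idx = SmallIndex::new(5).unwrap(); // errors above SmallIndex::MAX
//     assert!(SmallIndex::new(usize::MAX).is_err());
//     // Round-trips through native-endian bytes:
//     assert_eq!(SmallIndex::from_ne_bytes(idx.to_ne_bytes()), Ok(idx));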
impl<T> core::ops::Index<SmallIndex> for [T] {
type Output = T;
#[inline]
fn index(&self, index: SmallIndex) -> &T {
&self[index.as_usize()]
}
}
impl<T> core::ops::IndexMut<SmallIndex> for [T] {
#[inline]
fn index_mut(&mut self, index: SmallIndex) -> &mut T {
&mut self[index.as_usize()]
}
}
impl<T> core::ops::Index<SmallIndex> for Vec<T> {
type Output = T;
#[inline]
fn index(&self, index: SmallIndex) -> &T {
&self[index.as_usize()]
}
}
impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> {
#[inline]
fn index_mut(&mut self, index: SmallIndex) -> &mut T {
&mut self[index.as_usize()]
}
}
impl From<StateID> for SmallIndex {
fn from(sid: StateID) -> SmallIndex {
sid.0
}
}
impl From<PatternID> for SmallIndex {
fn from(pid: PatternID) -> SmallIndex {
pid.0
}
}
impl From<u8> for SmallIndex {
fn from(index: u8) -> SmallIndex {
SmallIndex::new_unchecked(usize::from(index))
}
}
impl TryFrom<u16> for SmallIndex {
type Error = SmallIndexError;
fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> {
if u32::from(index) > SmallIndex::MAX.as_u32() {
return Err(SmallIndexError { attempted: u64::from(index) });
}
Ok(SmallIndex::new_unchecked(index.as_usize()))
}
}
impl TryFrom<u32> for SmallIndex {
type Error = SmallIndexError;
fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> {
if index > SmallIndex::MAX.as_u32() {
return Err(SmallIndexError { attempted: u64::from(index) });
}
Ok(SmallIndex::new_unchecked(index.as_usize()))
}
}
impl TryFrom<u64> for SmallIndex {
type Error = SmallIndexError;
fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> {
if index > SmallIndex::MAX.as_u64() {
return Err(SmallIndexError { attempted: index });
}
Ok(SmallIndex::new_unchecked(index.as_usize()))
}
}
impl TryFrom<usize> for SmallIndex {
type Error = SmallIndexError;
fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> {
if index > SmallIndex::MAX.as_usize() {
return Err(SmallIndexError { attempted: index.as_u64() });
}
Ok(SmallIndex::new_unchecked(index))
}
}
/// This error occurs when a small index could not be constructed.
///
/// This occurs when given an integer exceeding the maximum small index value.
///
/// When the `std` feature is enabled, this implements the `Error` trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SmallIndexError {
attempted: u64,
}
impl SmallIndexError {
/// Returns the value that could not be converted to a small index.
pub fn attempted(&self) -> u64 {
self.attempted
}
}
#[cfg(feature = "std")]
impl std::error::Error for SmallIndexError {}
impl core::fmt::Display for SmallIndexError {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(
f,
"failed to create small index from {:?}, which exceeds {:?}",
self.attempted(),
SmallIndex::MAX,
)
}
}
#[derive(Clone, Debug)]
pub(crate) struct SmallIndexIter {
rng: core::ops::Range<usize>,
}
impl Iterator for SmallIndexIter {
type Item = SmallIndex;
fn next(&mut self) -> Option<SmallIndex> {
if self.rng.start >= self.rng.end {
return None;
}
let next_id = self.rng.start + 1;
let id = core::mem::replace(&mut self.rng.start, next_id);
// new_unchecked is OK since we asserted that the number of
// elements in this iterator will fit in an ID at construction.
Some(SmallIndex::new_unchecked(id))
}
}
macro_rules! index_type_impls {
($name:ident, $err:ident, $iter:ident, $withiter:ident) => {
impl $name {
/// The maximum value.
pub const MAX: $name = $name(SmallIndex::MAX);
/// The total number of values that can be represented.
pub const LIMIT: usize = SmallIndex::LIMIT;
/// The zero value.
pub const ZERO: $name = $name(SmallIndex::ZERO);
/// The number of bytes that a single value uses in memory.
pub const SIZE: usize = SmallIndex::SIZE;
/// Create a new value that is represented by a "small index."
///
/// If the given index exceeds the maximum allowed value, then this
/// returns an error.
#[inline]
pub fn new(value: usize) -> Result<$name, $err> {
SmallIndex::new(value).map($name).map_err($err)
}
/// Create a new value without checking whether the given argument
/// exceeds the maximum.
///
/// Using this routine with an invalid value will result in
/// unspecified behavior, but *not* undefined behavior. In
/// particular, an invalid ID value is likely to cause panics or
/// possibly even silent logical errors.
///
/// Callers must never rely on this type to be within a certain
/// range for memory safety.
#[inline]
pub const fn new_unchecked(value: usize) -> $name {
$name(SmallIndex::new_unchecked(value))
}
/// Create a new value from a `u32` without checking whether the
/// given value exceeds the maximum.
///
/// Using this routine with an invalid value will result in
/// unspecified behavior, but *not* undefined behavior. In
/// particular, an invalid ID value is likely to cause panics or
/// possibly even silent logical errors.
///
/// Callers must never rely on this type to be within a certain
/// range for memory safety.
#[inline]
pub const fn from_u32_unchecked(index: u32) -> $name {
$name(SmallIndex::from_u32_unchecked(index))
}
/// Like `new`, but panics if the given value is not valid.
#[inline]
pub fn must(value: usize) -> $name {
$name::new(value).expect(concat!(
"invalid ",
stringify!($name),
" value"
))
}
/// Return the internal value as a `usize`. This is guaranteed to
/// never overflow `usize`.
#[inline]
pub const fn as_usize(&self) -> usize {
self.0.as_usize()
}
/// Return the internal value as a `u64`. This is guaranteed to
/// never overflow.
#[inline]
pub const fn as_u64(&self) -> u64 {
self.0.as_u64()
}
/// Return the internal value as a `u32`. This is guaranteed to
/// never overflow `u32`.
#[inline]
pub const fn as_u32(&self) -> u32 {
self.0.as_u32()
}
/// Return the internal value as an `i32`. This is guaranteed to
/// never overflow an `i32`.
#[inline]
pub const fn as_i32(&self) -> i32 {
self.0.as_i32()
}
/// Returns one more than this value as a usize.
///
/// Since values represented by a "small index" have constraints
/// on their maximum value, adding `1` to it will always fit in a
/// `usize`, a `u32` and an `i32`.
#[inline]
pub fn one_more(&self) -> usize {
self.0.one_more()
}
/// Decode this value from the bytes given using the native endian
/// byte order for the current target.
///
/// If the decoded integer is not representable as a small index
/// for the current target, then this returns an error.
#[inline]
pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> {
SmallIndex::from_ne_bytes(bytes).map($name).map_err($err)
}
/// Decode this value from the bytes given using the native endian
/// byte order for the current target.
///
/// This is analogous to `new_unchecked` in that it does not check
/// whether the decoded integer is representable as a small index.
#[inline]
pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name {
$name(SmallIndex::from_ne_bytes_unchecked(bytes))
}
/// Return the underlying integer as raw bytes in native endian
/// format.
#[inline]
pub fn to_ne_bytes(&self) -> [u8; 4] {
self.0.to_ne_bytes()
}
/// Returns an iterator over all values from 0 up to and not
/// including the given length.
///
/// If the given length exceeds this type's limit, then this
/// panics.
pub(crate) fn iter(len: usize) -> $iter {
$iter::new(len)
}
}
// We write our own Debug impl so that we get things like PatternID(5)
// instead of PatternID(SmallIndex(5)).
impl core::fmt::Debug for $name {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish()
}
}
impl<T> core::ops::Index<$name> for [T] {
type Output = T;
#[inline]
fn index(&self, index: $name) -> &T {
&self[index.as_usize()]
}
}
impl<T> core::ops::IndexMut<$name> for [T] {
#[inline]
fn index_mut(&mut self, index: $name) -> &mut T {
&mut self[index.as_usize()]
}
}
impl<T> core::ops::Index<$name> for Vec<T> {
type Output = T;
#[inline]
fn index(&self, index: $name) -> &T {
&self[index.as_usize()]
}
}
impl<T> core::ops::IndexMut<$name> for Vec<T> {
#[inline]
fn index_mut(&mut self, index: $name) -> &mut T {
&mut self[index.as_usize()]
}
}
impl From<SmallIndex> for $name {
fn from(index: SmallIndex) -> $name {
$name(index)
}
}
impl From<u8> for $name {
fn from(value: u8) -> $name {
$name(SmallIndex::from(value))
}
}
impl TryFrom<u16> for $name {
type Error = $err;
fn try_from(value: u16) -> Result<$name, $err> {
SmallIndex::try_from(value).map($name).map_err($err)
}
}
impl TryFrom<u32> for $name {
type Error = $err;
fn try_from(value: u32) -> Result<$name, $err> {
SmallIndex::try_from(value).map($name).map_err($err)
}
}
impl TryFrom<u64> for $name {
type Error = $err;
fn try_from(value: u64) -> Result<$name, $err> {
SmallIndex::try_from(value).map($name).map_err($err)
}
}
impl TryFrom<usize> for $name {
type Error = $err;
fn try_from(value: usize) -> Result<$name, $err> {
SmallIndex::try_from(value).map($name).map_err($err)
}
}
/// This error occurs when an ID could not be constructed.
///
/// This occurs when given an integer exceeding the maximum allowed
/// value.
///
/// When the `std` feature is enabled, this implements the `Error`
/// trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct $err(SmallIndexError);
impl $err {
/// Returns the value that could not be converted to an ID.
pub fn attempted(&self) -> u64 {
self.0.attempted()
}
}
#[cfg(feature = "std")]
impl std::error::Error for $err {}
impl core::fmt::Display for $err {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(
f,
"failed to create {} from {:?}, which exceeds {:?}",
stringify!($name),
self.attempted(),
$name::MAX,
)
}
}
#[derive(Clone, Debug)]
pub(crate) struct $iter(SmallIndexIter);
impl $iter {
fn new(len: usize) -> $iter {
assert!(
len <= $name::LIMIT,
"cannot create iterator for {} when number of \
elements exceed {:?}",
stringify!($name),
$name::LIMIT,
);
$iter(SmallIndexIter { rng: 0..len })
}
}
impl Iterator for $iter {
type Item = $name;
fn next(&mut self) -> Option<$name> {
self.0.next().map($name)
}
}
/// An iterator adapter that is like std::iter::Enumerate, but attaches
/// small index values instead. It requires `ExactSizeIterator`. At
/// construction, it ensures that the index of each element in the
/// iterator is representable in the corresponding small index type.
#[derive(Clone, Debug)]
pub(crate) struct $withiter<I> {
it: I,
ids: $iter,
}
impl<I: Iterator + ExactSizeIterator> $withiter<I> {
fn new(it: I) -> $withiter<I> {
let ids = $name::iter(it.len());
$withiter { it, ids }
}
}
impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> {
type Item = ($name, I::Item);
fn next(&mut self) -> Option<($name, I::Item)> {
let item = self.it.next()?;
// Number of elements in this iterator must match, according
// to contract of ExactSizeIterator.
let id = self.ids.next().unwrap();
Some((id, item))
}
}
};
}
/// The identifier of a pattern in an Aho-Corasick automaton.
///
/// It is represented by a `u32` even on 64-bit systems in order to conserve
/// space. Namely, on all targets, this type guarantees that its value will
/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit
/// targets, for example, this type's maximum value will never overflow an
/// `isize`, which means it will never overflow an `i16` even though its
/// internal representation is still a `u32`.
///
/// # Safety
///
/// While a `PatternID` is meant to guarantee that its value fits into `usize`
/// without using as much space as a `usize` on all targets, callers must
/// not rely on this property for safety. Callers may choose to rely on this
/// property for correctness however. For example, creating a `PatternID` with an
/// invalid value can be done in entirely safe code. This may in turn result in
/// panics or silent logical errors.
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct PatternID(SmallIndex);
/// The identifier of a finite automaton state.
///
/// It is represented by a `u32` even on 64-bit systems in order to conserve
/// space. Namely, on all targets, this type guarantees that its value will
/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit
/// targets, for example, this type's maximum value will never overflow an
/// `isize`, which means it will never overflow an `i16` even though its
/// internal representation is still a `u32`.
///
/// # Safety
///
/// While a `StateID` is meant to guarantee that its value fits into `usize`
/// without using as much space as a `usize` on all targets, callers must
/// not rely on this property for safety. Callers may choose to rely on this
/// property for correctness however. For example, creating a `StateID` with an
/// invalid value can be done in entirely safe code. This may in turn result in
/// panics or silent logical errors.
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct StateID(SmallIndex);
index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter);
index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter);
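// A minimal sketch (test-only, not part of the vendored source) of the
// macro-generated API: fallible construction, the panicking `must`
// convenience, and indexing a `Vec` directly with an ID.
#[cfg(test)]
mod example_index_types {
    use super::*;

    #[test]
    fn pattern_id_basics() {
        use alloc::vec;

        let pid = PatternID::new(2).expect("2 is always in range");
        assert_eq!(pid, PatternID::must(2));
        // Conversions from wider integers are fallible.
        assert!(PatternID::try_from(u64::MAX).is_err());
        // IDs index slices and Vecs directly, without `as_usize` noise.
        let names = vec!["b", "abc", "abcd"];
        assert_eq!("abcd", names[pid]);
    }
}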
/// A utility trait that defines a couple of adapters for making it convenient
/// to access indices as "small index" types. We require ExactSizeIterator so
/// that iterator construction can do a single check to make sure the index of
/// each element is representable by its small index type.
pub(crate) trait IteratorIndexExt: Iterator {
fn with_pattern_ids(self) -> WithPatternIDIter<Self>
where
Self: Sized + ExactSizeIterator,
{
WithPatternIDIter::new(self)
}
fn with_state_ids(self) -> WithStateIDIter<Self>
where
Self: Sized + ExactSizeIterator,
{
WithStateIDIter::new(self)
}
}
impl<I: Iterator> IteratorIndexExt for I {}
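// A small sketch (test-only, not part of the vendored source) of the
// adapters above: `with_pattern_ids` behaves like `enumerate`, except that
// it yields `PatternID`s and validates the length once at construction.
#[cfg(test)]
mod example_iterator_index_ext {
    use super::*;

    #[test]
    fn with_pattern_ids_enumerates() {
        let patterns = ["b", "abc", "abcd"];
        let last = patterns.iter().copied().with_pattern_ids().last();
        assert_eq!(Some((PatternID::must(2), "abcd")), last);
    }
}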

214
vendor/aho-corasick/src/util/remapper.rs vendored Normal file
View File

@@ -0,0 +1,214 @@
use alloc::vec::Vec;
use crate::{nfa::noncontiguous, util::primitives::StateID};
/// Remappable is a tightly coupled abstraction that facilitates remapping
/// state identifiers in DFAs.
///
/// The main idea behind remapping state IDs is that DFAs often need to check
/// if a certain state is a "special" state of some kind (like a match state)
/// during a search. Since this is extremely perf critical code, we want this
/// check to be as fast as possible. Partitioning state IDs, for example, into
/// "non-match" and "match" states means one can tell if a state is a
/// match state via a simple comparison of the state ID.
///
/// The issue is that during the DFA construction process, it's not
/// particularly easy to partition the states. Instead, the simplest thing is
/// to often just do a pass over all of the states and shuffle them into their
/// desired partitionings. To do that, we need a mechanism for swapping states.
/// Hence, this abstraction.
///
/// Normally, for such little code, I would just duplicate it. But this is a
/// key optimization and the implementation is a bit subtle. So the abstraction
/// is basically a ham-fisted attempt at DRY. The only place we use this is in
/// the dense and one-pass DFAs.
///
/// See also src/dfa/special.rs for a more detailed explanation of how dense
/// DFAs are partitioned.
pub(crate) trait Remappable: core::fmt::Debug {
/// Return the total number of states.
fn state_len(&self) -> usize;
/// Swap the states pointed to by the given IDs. The underlying finite
/// state machine should be mutated such that all of the transitions in
/// `id1` are now in the memory region where the transitions for `id2`
/// were, and all of the transitions in `id2` are now in the memory region
/// where the transitions for `id1` were.
///
/// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
///
/// It is expected that, after calling this, the underlying state machine
/// will be left in an inconsistent state, since any other transitions
/// pointing to, e.g., `id1` need to be updated to point to `id2`, since
/// that's where `id1` moved to.
///
/// In order to "fix" the underlying inconsistent state, a `Remapper`
/// should be used to guarantee that `remap` is called at the appropriate
/// time.
fn swap_states(&mut self, id1: StateID, id2: StateID);
/// This must remap every single state ID in the underlying value according
/// to the function given. For example, in a DFA, this should remap every
/// transition and every starting state ID.
fn remap(&mut self, map: impl Fn(StateID) -> StateID);
}
/// Remapper is an abstraction that manages the remapping of state IDs in a
/// finite state machine. This is useful when one wants to shuffle states into
/// different positions in the machine.
///
/// One of the key complexities this manages is the ability to correctly move
/// one state multiple times.
///
/// Once shuffling is complete, `remap` must be called, which will rewrite
/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
/// will almost certainly result in a corrupt machine.
#[derive(Debug)]
pub(crate) struct Remapper {
/// A map from the index of a state to its pre-multiplied identifier.
///
/// When a state is swapped with another, then their corresponding
/// locations in this map are also swapped. Thus, its new position will
/// still point to its old pre-multiplied StateID.
///
/// While there is a bit more to it, this then allows us to rewrite the
/// state IDs in a DFA's transition table in a single pass. This is done
/// by iterating over every ID in this map, then iterating over each
/// transition for the state at that ID and re-mapping the transition from
/// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
/// in this map where `old_id` *started*, and set it to where it ended up
/// after all swaps have been completed.
map: Vec<StateID>,
/// A way to map indices to state IDs (and back).
idx: IndexMapper,
}
impl Remapper {
/// Create a new remapper from the given remappable implementation. The
/// remapper can then be used to swap states. The remappable value given
/// here must be the same one given to `swap` and `remap`.
///
/// The given stride should be the stride of the transition table expressed
/// as a power of 2. This stride is used to map between state IDs and state
/// indices. If state IDs and state indices are equivalent, then provide
/// a `stride2` of `0`, which acts as an identity.
pub(crate) fn new(r: &impl Remappable, stride2: usize) -> Remapper {
let idx = IndexMapper { stride2 };
let map = (0..r.state_len()).map(|i| idx.to_state_id(i)).collect();
Remapper { map, idx }
}
/// Swap two states. Once this is called, callers must follow through to
/// call `remap`, or else it's possible for the underlying remappable
/// value to be in a corrupt state.
pub(crate) fn swap(
&mut self,
r: &mut impl Remappable,
id1: StateID,
id2: StateID,
) {
if id1 == id2 {
return;
}
r.swap_states(id1, id2);
self.map.swap(self.idx.to_index(id1), self.idx.to_index(id2));
}
/// Complete the remapping process by rewriting all state IDs in the
/// remappable value according to the swaps performed.
pub(crate) fn remap(mut self, r: &mut impl Remappable) {
// Update the map to account for states that have been swapped
// multiple times. For example, if (A, C) and (C, G) are swapped, then
// transitions previously pointing to A should now point to G. But if
// we don't update our map, they will erroneously be set to C. All we
// do is follow the swaps in our map until we see our original state
// ID.
//
// The intuition here is to think about how changes are made to the
// map: only through pairwise swaps. That means that starting at any
// given state, it is always possible to find the loop back to that
// state by following the swaps represented in the map (which might be
// 0 swaps).
//
// We are also careful to clone the map before starting in order to
// freeze it. We use the frozen map to find our loops, since we need to
// update our map as well. Without freezing it, our updates could break
// the loops referenced above and produce incorrect results.
let oldmap = self.map.clone();
for i in 0..r.state_len() {
let cur_id = self.idx.to_state_id(i);
let mut new_id = oldmap[i];
if cur_id == new_id {
continue;
}
loop {
let id = oldmap[self.idx.to_index(new_id)];
if cur_id == id {
self.map[i] = new_id;
break;
}
new_id = id;
}
}
r.remap(|sid| self.map[self.idx.to_index(sid)]);
}
}
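// A minimal sketch (test-only, not part of the vendored source) of the
// swap-then-remap protocol on a toy state machine with one transition per
// state, using a stride2 of 0 so that IDs and indices coincide.
#[cfg(test)]
mod example_remapper {
    use super::*;
    use alloc::vec;

    #[derive(Debug)]
    struct Toy {
        // next[i] is the sole transition out of state i.
        next: Vec<StateID>,
    }

    impl Remappable for Toy {
        fn state_len(&self) -> usize {
            self.next.len()
        }

        fn swap_states(&mut self, id1: StateID, id2: StateID) {
            self.next.swap(id1.as_usize(), id2.as_usize());
        }

        fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
            for sid in self.next.iter_mut() {
                *sid = map(*sid);
            }
        }
    }

    #[test]
    fn chained_swaps_follow_through() {
        let sid = StateID::must;
        // Three states, each transitioning to state 0.
        let mut toy = Toy { next: vec![sid(0); 3] };
        let mut remapper = Remapper::new(&toy, 0);
        // Swapping (0, 1) and then (1, 2) moves state 0's body to slot 2.
        remapper.swap(&mut toy, sid(0), sid(1));
        remapper.swap(&mut toy, sid(1), sid(2));
        remapper.remap(&mut toy);
        // Every transition that pointed at the old state 0 now points to 2,
        // even though state 0 was moved in two separate swaps.
        assert!(toy.next.iter().all(|&s| s == sid(2)));
    }
}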
/// A simple type for mapping between state indices and state IDs.
///
/// The reason why this exists is because state IDs are "premultiplied" in a
/// DFA. That is, in order to get to the transitions for a particular state,
/// one need only use the state ID as-is, instead of having to multiply it by
/// the transition table's stride.
///
/// The downside of this is that it's inconvenient to map between state IDs
/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
/// `0`, `stride`, `2*stride`, `3*stride`, etc., instead of `0`, `1`, `2`, `3`,
/// etc.
///
/// Since our state IDs are premultiplied, we can convert back-and-forth
/// between IDs and indices by simply unmultiplying the IDs and multiplying the
/// indices.
///
/// Note that for a sparse NFA, state IDs and indices are equivalent. In this
/// case, we set the `stride2` of the index mapper to `0`, which acts as an
/// identity.
#[derive(Debug)]
struct IndexMapper {
/// The power of 2 corresponding to the stride of the corresponding
/// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
/// stride2' pre-multiplies an index to an ID.
stride2: usize,
}
impl IndexMapper {
/// Convert a state ID to a state index.
fn to_index(&self, id: StateID) -> usize {
id.as_usize() >> self.stride2
}
/// Convert a state index to a state ID.
fn to_state_id(&self, index: usize) -> StateID {
// CORRECTNESS: If the given index is not valid, then it is not
// required for this to panic or return a valid state ID. We'll "just"
// wind up with panics or silent logic errors at some other point. But
// this is OK because if Remappable::state_len is correct and so is
// 'to_index', then all inputs to 'to_state_id' should be valid indices
// and thus transform into valid state IDs.
StateID::new_unchecked(index << self.stride2)
}
}
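// A small sanity sketch (test-only, not part of the vendored source): with
// a transition table stride of 4 (stride2 = 2), premultiplied state IDs run
// 0, 4, 8, 12, ... and round-trip through the mapper.
#[cfg(test)]
mod example_index_mapper {
    use super::*;

    #[test]
    fn premultiplied_round_trip() {
        let idx = IndexMapper { stride2: 2 };
        assert_eq!(12, idx.to_state_id(3).as_usize());
        assert_eq!(3, idx.to_index(StateID::new_unchecked(12)));
    }
}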
impl Remappable for noncontiguous::NFA {
fn state_len(&self) -> usize {
noncontiguous::NFA::states(self).len()
}
fn swap_states(&mut self, id1: StateID, id2: StateID) {
noncontiguous::NFA::swap_states(self, id1, id2)
}
fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
noncontiguous::NFA::remap(self, map)
}
}

1148
vendor/aho-corasick/src/util/search.rs vendored Normal file

File diff suppressed because it is too large Load Diff

42
vendor/aho-corasick/src/util/special.rs vendored Normal file
View File

@@ -0,0 +1,42 @@
use crate::util::primitives::StateID;
/// A collection of sentinel state IDs for Aho-Corasick automata.
///
/// This specifically enables the technique by which we determine which states
/// are dead, matches or start states. Namely, by arranging states in a
/// particular order, we can determine the type of a state simply by looking at
/// its ID.
#[derive(Clone, Debug)]
pub(crate) struct Special {
/// The maximum ID of all the "special" states. This corresponds either to
/// start_anchored_id when a prefilter is active, or to max_match_id when a
/// prefilter is not active. The idea here is that if there is no prefilter,
/// then there is no point in treating start states as special.
pub(crate) max_special_id: StateID,
/// The maximum ID of all the match states. Any state ID bigger than this
/// is guaranteed to be a non-match ID.
///
/// It is possible and legal for max_match_id to be equal to
/// start_anchored_id, which occurs precisely in the case where the empty
/// string is a pattern that was added to the underlying automaton.
pub(crate) max_match_id: StateID,
/// The state ID of the start state used for unanchored searches.
pub(crate) start_unanchored_id: StateID,
/// The state ID of the start state used for anchored searches. This is
/// always start_unanchored_id+1.
pub(crate) start_anchored_id: StateID,
}
impl Special {
/// Create a new set of "special" state IDs with all IDs initialized to
/// zero. The general idea here is that they will be updated and set to
/// correct values later.
pub(crate) fn zero() -> Special {
Special {
max_special_id: StateID::ZERO,
max_match_id: StateID::ZERO,
start_unanchored_id: StateID::ZERO,
start_anchored_id: StateID::ZERO,
}
}
}
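// A hedged sketch (test-only, not part of the vendored source) of the
// comparison trick this layout enables. The exact predicates used by the
// crate's automata may differ; this only illustrates that classifying a
// state needs nothing more than integer comparisons on its ID.
#[cfg(test)]
mod example_special {
    use super::*;

    #[test]
    fn classify_by_comparison() {
        let special = Special {
            max_special_id: StateID::must(4),
            max_match_id: StateID::must(2),
            start_unanchored_id: StateID::must(3),
            start_anchored_id: StateID::must(4),
        };
        // Match states occupy (dead, max_match_id], where the dead state
        // is assumed to be StateID::ZERO.
        let is_match =
            |sid: StateID| sid > StateID::ZERO && sid <= special.max_match_id;
        // Anything above max_special_id is an ordinary, non-special state.
        let is_special = |sid: StateID| sid <= special.max_special_id;
        assert!(is_match(StateID::must(1)));
        assert!(!is_match(StateID::must(3)));
        assert!(is_special(StateID::must(4)));
        assert!(!is_special(StateID::must(5)));
    }
}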