Vendor dependencies for 0.3.0 release

2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

517
vendor/regex-automata/src/dfa/accel.rs vendored Normal file

@@ -0,0 +1,517 @@
// This module defines some core types for dealing with accelerated DFA states.
// Briefly, a DFA state can be "accelerated" if all of its transitions except
// for a few loop back to itself. This directly implies that the only way out
// of such a state is if a byte corresponding to one of those non-loopback
// transitions is found. Such states are often found in simple repetitions in
// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
// DFA with regex-cli:
//
// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table
// D 000000:
// Q 000001:
// *000002:
// A 000003: \x00-` => 3, a => 8, b-\xFF => 3
// A 000004: \x00-` => 4, a => 7, b-\xFF => 4
// 000005: \x00-` => 4, b-\xFF => 4
// 000006: \x00-` => 3, a => 6, b-\xFF => 3
// 000007: \x00-\xFF => 2, EOI => 2
// 000008: \x00-\xFF => 2, EOI => 2
//
// In particular, state 3 is accelerated (shown via the 'A' indicator) since
// the only way to leave that state once entered is to see an 'a' byte. If
// there is a long run of non-'a' bytes, then using something like 'memchr'
// to find the next 'a' byte can be significantly faster than just using the
// standard byte-at-a-time state machine.
//
// Unfortunately, this optimization rarely applies when Unicode is enabled.
// For example, patterns like '[^a]' don't actually match any byte that isn't
// 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't
// 'a'. This makes the state machine much more complex---far beyond a single
// state---and removes the ability to easily accelerate it. (Because if the
// machine sees a non-UTF-8 sequence, then the machine won't match through it.)
//
// In practice, we only consider accelerating states that have 3 or fewer
// non-loop transitions, partly because you get diminishing returns beyond
// that and partly because that's what the memchr crate supports. The
// structures below
// hard-code this assumption and provide (de)serialization APIs for use inside
// a DFA.
//
// And finally, note that there is some trickery involved in making it very
// fast to not only check whether a state is accelerated at search time, but
// also to access the bytes to search for to implement the acceleration itself.
// dfa/special.rs provides more detail, but the short story is that all
// accelerated states appear contiguously in a DFA. This means we can represent
// the ID space of all accelerated DFA states with a single range. So given
// a state ID, we can determine whether it's accelerated via
//
// min_accel_id <= id <= max_accel_id
//
// And find its corresponding accelerator with:
//
// accels.get((id - min_accel_id) / dfa_stride)
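//
// As a hedged illustration of what acceleration buys (a sketch only, not code
// used below): for the '(?-u)[^a]+a' example above, skipping ahead in the
// accelerated state amounts to a single memchr call instead of one transition
// lookup per byte:
//
//     fn skip_to_exit_byte(haystack: &[u8], at: usize) -> Option<usize> {
//         memchr::memchr(b'a', &haystack[at..]).map(|i| at + i)
//     }
//
// This is exactly the shape of 'find_fwd' below, generalized to 1-3 needle
// bytes via memchr2/memchr3.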
#[cfg(feature = "dfa-build")]
use alloc::{vec, vec::Vec};
use crate::util::{
int::Pointer,
memchr,
wire::{self, DeserializeError, Endian, SerializeError},
};
/// The base type used to represent a collection of accelerators.
///
/// While an `Accel` is represented as a fixed size array of bytes, a
/// *collection* of `Accel`s (called `Accels`) is represented internally as a
/// slice of u32. While it's a bit unnatural to do this and costs us a bit of
/// fairly low-risk unsafe code, it lets us remove the need for a second type
/// parameter in the definition of dense::DFA. (Which really wants everything
/// to be a slice of u32.)
type AccelTy = u32;
/// The size of the unit of representation for accelerators.
///
/// ACCEL_CAP *must* be a multiple of this size.
const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();
/// The maximum length in bytes that a single Accel can be. This is distinct
/// from the capacity of an accelerator in that the length represents only the
/// bytes that should be read.
const ACCEL_LEN: usize = 4;
/// The capacity of each accelerator, in bytes. We set this to 8 since it's a
/// multiple of 4 (our ID size) and because it gives us a little wiggle room
/// if we want to support more accel bytes in the future without a breaking
/// change.
///
/// This MUST be a multiple of ACCEL_TY_SIZE.
const ACCEL_CAP: usize = 8;
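// A small illustrative guard for the invariant stated above (assuming a
// toolchain where `assert!` is usable in const context, Rust 1.57+): if
// ACCEL_CAP ever stops being a multiple of ACCEL_TY_SIZE, this fails to
// compile.
const _: () = assert!(ACCEL_CAP % ACCEL_TY_SIZE == 0);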
/// Search for between 1 and 3 needle bytes in the given haystack, starting the
/// search at the given position. If `needles` has a length other than 1-3,
/// then this panics.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn find_fwd(
needles: &[u8],
haystack: &[u8],
at: usize,
) -> Option<usize> {
let bs = needles;
let i = match needles.len() {
1 => memchr::memchr(bs[0], &haystack[at..])?,
2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?,
3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?,
0 => panic!("cannot find with empty needles"),
n => panic!("invalid needles length: {n}"),
};
Some(at + i)
}
/// Search for between 1 and 3 needle bytes in the given haystack in reverse,
/// starting the search at the given position. If `needles` has a length other
/// than 1-3, then this panics.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn find_rev(
needles: &[u8],
haystack: &[u8],
at: usize,
) -> Option<usize> {
let bs = needles;
match needles.len() {
1 => memchr::memrchr(bs[0], &haystack[..at]),
2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]),
3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]),
0 => panic!("cannot find with empty needles"),
n => panic!("invalid needles length: {n}"),
}
}
/// Represents the accelerators for all accelerated states in a dense DFA.
///
/// The `A` type parameter represents the type of the underlying bytes.
/// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
#[derive(Clone)]
pub(crate) struct Accels<A> {
/// A length prefixed slice of contiguous accelerators. See the top comment
/// in this module for more details on how we can jump from a DFA's state
/// ID to an accelerator in this list.
///
/// The first 4 bytes always correspond to the number of accelerators
/// that follow.
accels: A,
}
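// For orientation, the raw layout that the accessors below decode looks like
// this (shown for a little-endian target, with two accelerators: one for
// {'a', 'b'} and one for {'z'}):
//
//     [2, 0, 0, 0,                       // u32 count prefix: 2 accelerators
//      2, b'a', b'b', 0, 0, 0, 0, 0,     // accel 0: len=2, needles 'a', 'b'
//      1, b'z', 0, 0, 0, 0, 0, 0]        // accel 1: len=1, needle 'z'
//
// Every accelerator occupies ACCEL_CAP (8) bytes regardless of how many
// needles it has, which is what makes the `ACCEL_TY_SIZE + i * ACCEL_CAP`
// indexing in `needles` and `get` below work.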
#[cfg(feature = "dfa-build")]
impl Accels<Vec<AccelTy>> {
/// Create an empty sequence of accelerators for a DFA.
pub fn empty() -> Accels<Vec<AccelTy>> {
Accels { accels: vec![0] }
}
/// Add an accelerator to this sequence.
///
/// This adds the accelerator to the end of the sequence and therefore
/// should be done in correspondence with its state in the DFA.
///
/// This panics if this results in more accelerators than AccelTy::MAX.
pub fn add(&mut self, accel: Accel) {
self.accels.extend_from_slice(&accel.as_accel_tys());
let len = self.len();
self.set_len(len + 1);
}
/// Set the number of accelerators in this sequence, which is encoded in
/// the first 4 bytes of the underlying bytes.
fn set_len(&mut self, new_len: usize) {
// The only way an accelerator gets added is if a state exists for
// it, and if a state exists, then its index is guaranteed to be
// representable by an AccelTy by virtue of the guarantees provided by
// StateID.
let new_len = AccelTy::try_from(new_len).unwrap();
self.accels[0] = new_len;
}
}
impl<'a> Accels<&'a [AccelTy]> {
/// Deserialize a sequence of accelerators from the given bytes. If there
/// was a problem deserializing, then an error is returned.
///
/// This is guaranteed to run in constant time. This does not guarantee
/// that every accelerator in the returned collection is valid. Thus,
/// accessing one may panic, or unsafe code that relies on accelerators
/// being correct may result in UB.
///
/// Callers may check the validity of every accelerator with the `validate`
/// method.
pub fn from_bytes_unchecked(
mut slice: &'a [u8],
) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
let slice_start = slice.as_ptr().as_usize();
let (accel_len, _) =
wire::try_read_u32_as_usize(slice, "accelerators length")?;
// The accelerator length is part of the accel_tys slice that
// we deserialize. This is perhaps a bit idiosyncratic. It would
// probably be better to split out the length into a real field.
let accel_tys_len = wire::add(
wire::mul(accel_len, 2, "total number of accelerator accel_tys")?,
1,
"total number of accel_tys",
)?;
let accel_tys_bytes_len = wire::mul(
ACCEL_TY_SIZE,
accel_tys_len,
"total number of bytes in accelerators",
)?;
wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators")?;
wire::check_alignment::<AccelTy>(slice)?;
let accel_tys = &slice[..accel_tys_bytes_len];
slice = &slice[accel_tys_bytes_len..];
// SAFETY: We've checked the length and alignment above, and since
// slice is just bytes and AccelTy is just a u32, we can safely cast to
// a slice of &[AccelTy].
let accels = unsafe {
core::slice::from_raw_parts(
accel_tys.as_ptr().cast::<AccelTy>(),
accel_tys_len,
)
};
Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start))
}
}
impl<A: AsRef<[AccelTy]>> Accels<A> {
/// Return an owned version of the accelerators.
#[cfg(feature = "alloc")]
pub fn to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>> {
Accels { accels: self.accels.as_ref().to_vec() }
}
/// Return a borrowed version of the accelerators.
pub fn as_ref(&self) -> Accels<&[AccelTy]> {
Accels { accels: self.accels.as_ref() }
}
/// Return the bytes representing the serialization of the accelerators.
pub fn as_bytes(&self) -> &[u8] {
let accels = self.accels.as_ref();
// SAFETY: This is safe because accels is just a slice of AccelTy,
// and u8 always has a smaller alignment.
unsafe {
core::slice::from_raw_parts(
accels.as_ptr().cast::<u8>(),
accels.len() * ACCEL_TY_SIZE,
)
}
}
/// Returns the memory usage, in bytes, of these accelerators.
///
/// The memory usage is computed based on the number of bytes used to
/// represent all of the accelerators.
///
/// This does **not** include the stack size used by this value.
pub fn memory_usage(&self) -> usize {
self.as_bytes().len()
}
/// Return the bytes to search for corresponding to the accelerator in this
/// sequence at index `i`. If no such accelerator exists, then this panics.
///
/// The significance of the index is that it should be in correspondence
/// with the index of the corresponding DFA. That is, accelerated DFA
/// states are stored contiguously in the DFA and have an ordering implied
/// by their respective state IDs. The state's index in that sequence
/// corresponds to the index of its corresponding accelerator.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn needles(&self, i: usize) -> &[u8] {
if i >= self.len() {
panic!("invalid accelerator index {i}");
}
let bytes = self.as_bytes();
let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
let len = usize::from(bytes[offset]);
&bytes[offset + 1..offset + 1 + len]
}
/// Return the total number of accelerators in this sequence.
pub fn len(&self) -> usize {
// This should never panic since deserialization checks that the
// length can fit into a usize.
usize::try_from(self.accels.as_ref()[0]).unwrap()
}
/// Return the accelerator in this sequence at index `i`. If no such
/// accelerator exists, then this returns None.
///
/// See the docs for `needles` on the significance of the index.
fn get(&self, i: usize) -> Option<Accel> {
if i >= self.len() {
return None;
}
let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
let accel = Accel::from_slice(&self.as_bytes()[offset..])
.expect("Accels must contain valid accelerators");
Some(accel)
}
/// Returns an iterator of accelerators in this sequence.
fn iter(&self) -> IterAccels<'_, A> {
IterAccels { accels: self, i: 0 }
}
/// Writes these accelerators to the given byte buffer using the indicated
/// endianness. If the given buffer is too small, then an error is
/// returned. Upon success, the total number of bytes written is returned.
/// The number of bytes written is guaranteed to be a multiple of
/// ACCEL_TY_SIZE.
pub fn write_to<E: Endian>(
&self,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
let nwrite = self.write_to_len();
assert_eq!(
nwrite % ACCEL_TY_SIZE,
0,
"expected accelerator bytes written to be a multiple \
of {ACCEL_TY_SIZE}",
);
if dst.len() < nwrite {
return Err(SerializeError::buffer_too_small("accelerators"));
}
// The number of accelerators can never exceed AccelTy::MAX.
E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst);
// The actual accelerators are just raw bytes and thus their endianness
// is irrelevant. So we can copy them as bytes.
dst[ACCEL_TY_SIZE..nwrite]
.copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]);
Ok(nwrite)
}
/// Validates that every accelerator in this collection can be successfully
/// deserialized as a valid accelerator.
pub fn validate(&self) -> Result<(), DeserializeError> {
for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) {
let _ = Accel::from_slice(chunk)?;
}
Ok(())
}
/// Returns the total number of bytes written by `write_to`.
pub fn write_to_len(&self) -> usize {
self.as_bytes().len()
}
}
impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(f, "Accels(")?;
let mut list = f.debug_list();
for a in self.iter() {
list.entry(&a);
}
list.finish()?;
write!(f, ")")
}
}
#[derive(Debug)]
struct IterAccels<'a, A: AsRef<[AccelTy]>> {
accels: &'a Accels<A>,
i: usize,
}
impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> {
type Item = Accel;
fn next(&mut self) -> Option<Accel> {
let accel = self.accels.get(self.i)?;
self.i += 1;
Some(accel)
}
}
/// Accel represents a structure for determining how to "accelerate" a DFA
/// state.
///
/// Namely, it contains zero or more bytes that must be seen in order for the
/// DFA to leave the state it is associated with. In practice, the actual range
/// is 1 to 3 bytes.
///
/// The purpose of acceleration is to identify states whose vast majority
/// of transitions are just loops back to the same state. For example,
/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
/// (corresponding to `[^a]+`) where all transitions *except* for `a` and
/// `b` loop back to itself. Thus, this state can be "accelerated" by simply
/// looking for the next occurrence of either `a` or `b` instead of explicitly
/// following transitions. (In this case, `b` transitions to the next state
/// where as `a` would transition to the dead state.)
#[derive(Clone)]
pub(crate) struct Accel {
/// The first byte is the length. Subsequent bytes are the accelerated
/// bytes.
///
/// Note that we make every accelerator 8 bytes as a slightly wasteful
/// way of making sure alignment is always correct for state ID sizes of
/// 1, 2, 4 and 8. This should be okay since accelerated states aren't
/// particularly common, especially when Unicode is enabled.
bytes: [u8; ACCEL_CAP],
}
impl Accel {
/// Returns an empty accel, where no bytes are accelerated.
#[cfg(feature = "dfa-build")]
pub fn new() -> Accel {
Accel { bytes: [0; ACCEL_CAP] }
}
/// Returns a verified accelerator derived from the beginning of the given
/// slice.
///
/// If the slice is not long enough or contains invalid bytes for an
/// accelerator, then this returns an error.
pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> {
slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())];
let bytes = slice
.try_into()
.map_err(|_| DeserializeError::buffer_too_small("accelerator"))?;
Accel::from_bytes(bytes)
}
/// Returns a verified accelerator derived from raw bytes.
///
/// If the given bytes are invalid, then this returns an error.
fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> {
if usize::from(bytes[0]) >= ACCEL_LEN {
return Err(DeserializeError::generic(
"accelerator bytes cannot have length more than 3",
));
}
Ok(Accel::from_bytes_unchecked(bytes))
}
/// Returns an accelerator derived from raw bytes.
///
/// This does not check whether the given bytes are valid. Invalid bytes
/// cannot sacrifice memory safety, but may result in panics or silent
/// logic bugs.
fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel {
Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] }
}
/// Attempts to add the given byte to this accelerator. If the accelerator
/// is already full or thinks the byte is a poor accelerator, then this
/// returns false. Otherwise, returns true.
///
/// If the given byte is already in this accelerator, then it panics.
#[cfg(feature = "dfa-build")]
pub fn add(&mut self, byte: u8) -> bool {
if self.len() >= 3 {
return false;
}
// As a special case, we totally reject trying to accelerate a state
// with an ASCII space. In most cases, it occurs very frequently, and
// tends to result in worse overall performance.
if byte == b' ' {
return false;
}
assert!(
!self.contains(byte),
"accelerator already contains {:?}",
crate::util::escape::DebugByte(byte)
);
self.bytes[self.len() + 1] = byte;
self.bytes[0] += 1;
true
}
/// Return the number of bytes in this accelerator.
pub fn len(&self) -> usize {
usize::from(self.bytes[0])
}
/// Returns true if and only if there are no bytes in this accelerator.
#[cfg(feature = "dfa-build")]
pub fn is_empty(&self) -> bool {
self.len() == 0
}
/// Returns the slice of bytes to accelerate.
///
/// If this accelerator is empty, then this returns an empty slice.
fn needles(&self) -> &[u8] {
&self.bytes[1..1 + self.len()]
}
/// Returns true if and only if this accelerator will accelerate the given
/// byte.
#[cfg(feature = "dfa-build")]
fn contains(&self, byte: u8) -> bool {
self.needles().iter().any(|&b| b == byte)
}
/// Returns the accelerator bytes as an array of AccelTys.
#[cfg(feature = "dfa-build")]
fn as_accel_tys(&self) -> [AccelTy; 2] {
assert_eq!(ACCEL_CAP, 8);
// These unwraps are OK since ACCEL_CAP is set to 8.
let first =
AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap());
let second =
AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap());
[first, second]
}
}
impl core::fmt::Debug for Accel {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(f, "Accel(")?;
let mut set = f.debug_set();
for &b in self.needles() {
set.entry(&crate::util::escape::DebugByte(b));
}
set.finish()?;
write!(f, ")")
}
}
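// An illustrative test of the types above, gated on the same feature the
// builder APIs require; a sketch of typical internal usage: build an
// accelerator for 'a' and 'b', wrap it in an `Accels` collection, and use its
// needles with `find_fwd` and `find_rev`.
#[cfg(all(test, feature = "dfa-build"))]
mod accel_sketch_tests {
    use super::*;

    #[test]
    fn accel_roundtrip_and_search() {
        let mut accel = Accel::new();
        assert!(accel.is_empty());
        assert!(accel.add(b'a'));
        assert!(accel.add(b'b'));
        assert_eq!(accel.len(), 2);

        let mut accels = Accels::empty();
        accels.add(accel);
        assert_eq!(accels.len(), 1);
        assert_eq!(accels.needles(0), &[b'a', b'b']);

        // Skip over the run of 'x' bytes in one shot, just like an
        // accelerated DFA state would.
        let haystack = b"xxxxxxxxxxb";
        assert_eq!(find_fwd(accels.needles(0), haystack, 0), Some(10));
        assert_eq!(
            find_rev(accels.needles(0), haystack, haystack.len()),
            Some(10),
        );
    }
}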

2260
vendor/regex-automata/src/dfa/automaton.rs vendored Normal file

File diff suppressed because it is too large

5237
vendor/regex-automata/src/dfa/dense.rs vendored Normal file

File diff suppressed because it is too large

599
vendor/regex-automata/src/dfa/determinize.rs vendored Normal file

@@ -0,0 +1,599 @@
use alloc::{collections::BTreeMap, vec::Vec};
use crate::{
dfa::{
dense::{self, BuildError},
DEAD,
},
nfa::thompson,
util::{
self,
alphabet::{self, ByteSet},
determinize::{State, StateBuilderEmpty, StateBuilderNFA},
primitives::{PatternID, StateID},
search::{Anchored, MatchKind},
sparse_set::SparseSets,
start::Start,
},
};
/// A builder for configuring and running a DFA determinizer.
#[derive(Clone, Debug)]
pub(crate) struct Config {
match_kind: MatchKind,
quit: ByteSet,
dfa_size_limit: Option<usize>,
determinize_size_limit: Option<usize>,
}
impl Config {
/// Create a new default config for a determinizer. The determinizer may be
/// configured before calling `run`.
pub fn new() -> Config {
Config {
match_kind: MatchKind::LeftmostFirst,
quit: ByteSet::empty(),
dfa_size_limit: None,
determinize_size_limit: None,
}
}
/// Run determinization on the given NFA and write the resulting DFA into
/// the one given. The DFA given should be initialized but otherwise empty.
/// "Initialized" means that it is setup to handle the NFA's byte classes,
/// number of patterns and whether to build start states for each pattern.
pub fn run(
&self,
nfa: &thompson::NFA,
dfa: &mut dense::OwnedDFA,
) -> Result<(), BuildError> {
let dead = State::dead();
let quit = State::dead();
let mut cache = StateMap::default();
// We only insert the dead state here since its representation is
// identical to the quit state. And we never want anything pointing
// to the quit state other than specific transitions derived from the
// determinizer's configured "quit" bytes.
//
// We do put the quit state into 'builder_states' below. This ensures
// that a proper DFA state ID is allocated for it, and that no other
// DFA state uses the "location after the DEAD state." That is, it
// is assumed that the quit state is always the state immediately
// following the DEAD state.
cache.insert(dead.clone(), DEAD);
let runner = Runner {
config: self.clone(),
nfa,
dfa,
builder_states: alloc::vec![dead, quit],
cache,
memory_usage_state: 0,
sparses: SparseSets::new(nfa.states().len()),
stack: alloc::vec![],
scratch_state_builder: StateBuilderEmpty::new(),
};
runner.run()
}
/// The match semantics to use for determinization.
///
/// MatchKind::All corresponds to the standard textbook construction.
/// All possible match states are represented in the DFA.
/// MatchKind::LeftmostFirst permits greediness and otherwise tries to
/// simulate the match semantics of backtracking regex engines. Namely,
/// only a subset of match states are built, and dead states are used to
/// stop searches with an unanchored prefix.
///
/// The default is MatchKind::LeftmostFirst.
pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
self.match_kind = kind;
self
}
/// The set of bytes to use that will cause the DFA to enter a quit state,
/// stop searching and return an error. By default, this is empty.
pub fn quit(&mut self, set: ByteSet) -> &mut Config {
self.quit = set;
self
}
/// The limit, in bytes of the heap, that the DFA is permitted to use. This
/// does not include the auxiliary heap storage used by determinization.
pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
self.dfa_size_limit = bytes;
self
}
/// The limit, in bytes of the heap, that determinization itself is allowed
/// to use. This does not include the size of the DFA being built.
pub fn determinize_size_limit(
&mut self,
bytes: Option<usize>,
) -> &mut Config {
self.determinize_size_limit = bytes;
self
}
}
/// The actual implementation of determinization that converts an NFA to a DFA
/// through powerset construction.
///
/// This determinizer roughly follows the typical powerset construction, where
/// each DFA state is comprised of one or more NFA states. In the worst case,
/// there is one DFA state for every possible combination of NFA states. In
/// practice, this only happens in certain conditions, typically when there are
/// bounded repetitions.
///
/// The main differences between this implementation and typical determinization
/// are that this implementation delays matches by one state and hackily makes
/// look-around work. Comments below attempt to explain this.
///
/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
/// whichever is shorter.
#[derive(Debug)]
struct Runner<'a> {
/// The configuration used to initialize determinization.
config: Config,
/// The NFA we're converting into a DFA.
nfa: &'a thompson::NFA,
/// The DFA we're building.
dfa: &'a mut dense::OwnedDFA,
/// Each DFA state being built is defined as an *ordered* set of NFA
/// states, along with some meta facts about the ordered set of NFA states.
///
/// This is never empty. The first state is always a dummy state such that
/// a state id == 0 corresponds to a dead state. The second state is always
/// the quit state.
///
/// Why do we have states in both a `Vec` and in a cache map below?
/// Well, they serve two different roles based on access patterns.
/// `builder_states` is the canonical home of each state, and provides
/// constant random access by a DFA state's ID. The cache map below, on
/// the other hand, provides a quick way of searching for identical DFA
/// states by using the DFA state as a key in the map. Of course, we use
/// reference counting to avoid actually duplicating the state's data
/// itself. (Although this has never been benchmarked.) Note that the cache
/// map does not give us full minimization; it just lets us avoid some very
/// obvious redundant states.
///
/// Note that the index into this Vec isn't quite the DFA's state ID.
/// Rather, it's just an index. To get the state ID, you have to multiply
/// it by the DFA's stride. That's done by self.dfa.from_index. And the
/// inverse is self.dfa.to_index.
///
/// Moreover, DFA states don't usually retain the IDs assigned to them
/// by their position in this Vec. After determinization completes,
/// states are shuffled around to support other optimizations. See the
/// sibling 'special' module for more details on that. (The reason for
/// mentioning this is that if you print out the DFA for debugging during
/// determinization, and then print out the final DFA after it is fully
/// built, then the state IDs likely won't match up.)
builder_states: Vec<State>,
/// A cache of DFA states that already exist and can be easily looked up
/// via ordered sets of NFA states.
///
/// See `builder_states` docs for why we store states in two different
/// ways.
cache: StateMap,
/// The memory usage, in bytes, used by builder_states and cache. We track
/// this as new states are added since states use a variable amount of
/// heap. Tracking this as we add states makes it possible to compute the
/// total amount of memory used by the determinizer in constant time.
memory_usage_state: usize,
/// A pair of sparse sets for tracking ordered sets of NFA state IDs.
/// These are reused throughout determinization. A bounded sparse set
/// gives us constant time insertion, membership testing and clearing.
sparses: SparseSets,
/// Scratch space for a stack of NFA states to visit, for depth first
/// visiting without recursion.
stack: Vec<StateID>,
/// Scratch space for storing an ordered sequence of NFA states, for
/// amortizing allocation. This is principally useful for when we avoid
/// adding a new DFA state since it already exists. In order to detect this
/// case though, we still need an ordered set of NFA state IDs. So we use
/// this space to stage that ordered set before we know whether we need to
/// create a new DFA state or not.
scratch_state_builder: StateBuilderEmpty,
}
/// A map from states to state identifiers. When using std, we use a standard
/// hashmap, since it's a bit faster for this use case. (Other maps, like
/// ones based on FNV, have not yet been benchmarked.)
///
/// The main purpose of this map is to reuse states where possible. This won't
/// fully minimize the DFA, but it works well in a lot of cases.
#[cfg(feature = "std")]
type StateMap = std::collections::HashMap<State, StateID>;
#[cfg(not(feature = "std"))]
type StateMap = BTreeMap<State, StateID>;
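// A minimal, self-contained sketch of textbook powerset construction over a
// toy NFA, kept deliberately independent of this crate's NFA/DFA types. It is
// only meant to illustrate the worklist shape that `Runner::run` below
// follows: pop an uncompiled DFA state (a set of NFA states), compute its
// successor set for each input unit, and allocate a new DFA state only when
// that set has not been seen before. The real implementation additionally
// handles epsilon closures, look-around, match semantics and state shuffling.
#[cfg(test)]
mod powerset_sketch {
    use alloc::collections::{BTreeMap, BTreeSet};
    use alloc::vec;
    use alloc::vec::Vec;

    /// A toy NFA with byte transitions only (no epsilons): transitions[s]
    /// maps an input byte to the set of possible next states.
    type ToyNfa = Vec<BTreeMap<u8, BTreeSet<usize>>>;

    /// Textbook powerset construction from NFA state 0: each DFA state is an
    /// ordered set of NFA states, and a DFA state is only allocated the first
    /// time its NFA-state set is seen.
    fn determinize(
        nfa: &ToyNfa,
        alphabet: &[u8],
    ) -> BTreeMap<BTreeSet<usize>, BTreeMap<u8, BTreeSet<usize>>> {
        let mut dfa = BTreeMap::new();
        let mut uncompiled = vec![BTreeSet::from([0usize])];
        while let Some(set) = uncompiled.pop() {
            if dfa.contains_key(&set) {
                continue;
            }
            let mut row = BTreeMap::new();
            for &b in alphabet {
                let mut next = BTreeSet::new();
                for &s in set.iter() {
                    if let Some(targets) = nfa[s].get(&b) {
                        next.extend(targets.iter().copied());
                    }
                }
                uncompiled.push(next.clone());
                row.insert(b, next);
            }
            dfa.insert(set, row);
        }
        dfa
    }

    #[test]
    fn toy_nfa_has_three_reachable_dfa_states() {
        // NFA for "ends with 'ab'" over the alphabet {a, b}: state 0 loops
        // on everything, 'a' also moves to 1, and 1 moves to the match
        // state 2 on 'b'.
        let nfa: ToyNfa = vec![
            BTreeMap::from([
                (b'a', BTreeSet::from([0usize, 1])),
                (b'b', BTreeSet::from([0usize])),
            ]),
            BTreeMap::from([(b'b', BTreeSet::from([2usize]))]),
            BTreeMap::new(),
        ];
        let dfa = determinize(&nfa, b"ab");
        // The reachable subsets are {0}, {0,1} and {0,2}.
        assert_eq!(dfa.len(), 3);
    }
}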
impl<'a> Runner<'a> {
/// Build the DFA. If there was a problem constructing the DFA (e.g., if
/// the chosen state identifier representation is too small), then an error
/// is returned.
fn run(mut self) -> Result<(), BuildError> {
if self.nfa.look_set_any().contains_word_unicode()
&& !self.config.quit.contains_range(0x80, 0xFF)
{
return Err(BuildError::unsupported_dfa_word_boundary_unicode());
}
// A sequence of "representative" bytes drawn from each equivalence
// class. These representative bytes are fed to the NFA to compute
// state transitions. This allows us to avoid re-computing state
// transitions for bytes that are guaranteed to produce identical
// results. Since computing the representatives needs to do a little
// work, we do it once here because we'll be iterating over them a lot.
let representatives: Vec<alphabet::Unit> =
self.dfa.byte_classes().representatives(..).collect();
// The set of all DFA state IDs that still need to have their
// transitions set. We start by seeding this with all starting states.
let mut uncompiled = alloc::vec![];
self.add_all_starts(&mut uncompiled)?;
while let Some(dfa_id) = uncompiled.pop() {
for &unit in &representatives {
if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
{
continue;
}
// In many cases, the state we transition to has already been
// computed. 'cached_state' will do the minimal amount of work
// to check this, and if it exists, immediately return an
// already existing state ID.
let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
self.dfa.set_transition(dfa_id, unit, next_dfa_id);
// If the state ID we got back is newly created, then we need
// to compile it, so add it to our uncompiled frontier.
if is_new {
uncompiled.push(next_dfa_id);
}
}
}
debug!(
"determinization complete, memory usage: {}, \
dense DFA size: {}, \
is reverse? {}",
self.memory_usage(),
self.dfa.memory_usage(),
self.nfa.is_reverse(),
);
// A map from DFA state ID to one or more NFA match IDs. Each NFA match
// ID corresponds to a distinct regex pattern that matches in the state
// corresponding to the key.
let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
self.cache.clear();
#[cfg(feature = "logging")]
let mut total_pat_len = 0;
for (i, state) in self.builder_states.into_iter().enumerate() {
if let Some(pat_ids) = state.match_pattern_ids() {
let id = self.dfa.to_state_id(i);
log! {
total_pat_len += pat_ids.len();
}
matches.insert(id, pat_ids);
}
}
log! {
use core::mem::size_of;
let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
let pats = total_pat_len * size_of::<PatternID>();
let mem = (matches.len() * per_elem) + pats;
log::debug!("matches map built, memory usage: {mem}");
}
// At this point, we shuffle the "special" states in the final DFA.
// This permits a DFA's match loop to detect a match condition (among
// other things) by merely inspecting the current state's identifier,
// and avoids the need for any additional auxiliary storage.
self.dfa.shuffle(matches)?;
Ok(())
}
/// Return the identifier for the next DFA state given an existing DFA
/// state and an input byte. If the next DFA state already exists, then
/// return its identifier from the cache. Otherwise, build the state, cache
/// it and return its identifier.
///
/// This routine returns a boolean indicating whether a new state was
/// built. If a new state is built, then the caller needs to add it to its
/// frontier of uncompiled DFA states to compute transitions for.
fn cached_state(
&mut self,
dfa_id: StateID,
unit: alphabet::Unit,
) -> Result<(StateID, bool), BuildError> {
// Compute the set of all reachable NFA states, including epsilons.
let empty_builder = self.get_state_builder();
let builder = util::determinize::next(
self.nfa,
self.config.match_kind,
&mut self.sparses,
&mut self.stack,
&self.builder_states[self.dfa.to_index(dfa_id)],
unit,
empty_builder,
);
self.maybe_add_state(builder)
}
/// Compute the set of DFA start states and add their identifiers in
/// 'dfa_state_ids' (no duplicates are added).
fn add_all_starts(
&mut self,
dfa_state_ids: &mut Vec<StateID>,
) -> Result<(), BuildError> {
// These should be the first states added.
assert!(dfa_state_ids.is_empty());
// We only want to add (un)anchored starting states that are consistent
// with our DFA's configuration. Unconditionally adding both (although
// it is the default) can make DFAs quite a bit bigger.
if self.dfa.start_kind().has_unanchored() {
self.add_start_group(Anchored::No, dfa_state_ids)?;
}
if self.dfa.start_kind().has_anchored() {
self.add_start_group(Anchored::Yes, dfa_state_ids)?;
}
// I previously had an 'assert' here checking that either
// 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
// turns out this isn't always true. For example, the NFA might have
// one or more patterns but where all such patterns are just 'fail'
// states. These will ultimately just compile down to DFA dead states,
// and since the dead state was added earlier, no new DFA states are
// added. And thus, it is valid and okay for 'dfa_state_ids' to be
// empty even if there are a non-zero number of patterns in the NFA.
// We only need to compute anchored start states for each pattern if it
// was requested to do so.
if self.dfa.starts_for_each_pattern() {
for pid in self.nfa.patterns() {
self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
}
}
Ok(())
}
/// Add a group of start states for the given match pattern ID. Any new
/// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
/// pushed.)
///
/// When pattern_id is None, then this will compile a group of unanchored
/// start states (if the DFA is unanchored). When the pattern_id is
/// present, then this will compile a group of anchored start states that
/// only match the given pattern.
///
/// This panics if `anchored` corresponds to an invalid pattern ID.
fn add_start_group(
&mut self,
anchored: Anchored,
dfa_state_ids: &mut Vec<StateID>,
) -> Result<(), BuildError> {
let nfa_start = match anchored {
Anchored::No => self.nfa.start_unanchored(),
Anchored::Yes => self.nfa.start_anchored(),
Anchored::Pattern(pid) => {
self.nfa.start_pattern(pid).expect("valid pattern ID")
}
};
// When compiling start states, we're careful not to build additional
// states that aren't necessary. For example, if the NFA has no word
// boundary assertion, then there's no reason to have distinct start
// states for 'NonWordByte' and 'WordByte' starting configurations.
// Instead, the 'WordByte' starting configuration can just point
// directly to the start state for the 'NonWordByte' config.
//
// Note though that we only need to care about assertions in the prefix
// of an NFA since this only concerns the starting states. (Actually,
// the most precise thing we could do is look at the prefix
// assertions of each pattern when 'anchored == Anchored::Pattern',
// and then only compile extra states if the prefix is non-empty.) But
// we settle for simplicity here instead of absolute minimalism. It is
// somewhat rare, after all, for multiple patterns in the same regex to
// have different prefix look-arounds.
let (id, is_new) =
self.add_one_start(nfa_start, Start::NonWordByte)?;
self.dfa.set_start_state(anchored, Start::NonWordByte, id);
if is_new {
dfa_state_ids.push(id);
}
if !self.nfa.look_set_prefix_any().contains_word() {
self.dfa.set_start_state(anchored, Start::WordByte, id);
} else {
let (id, is_new) =
self.add_one_start(nfa_start, Start::WordByte)?;
self.dfa.set_start_state(anchored, Start::WordByte, id);
if is_new {
dfa_state_ids.push(id);
}
}
if !self.nfa.look_set_prefix_any().contains_anchor() {
self.dfa.set_start_state(anchored, Start::Text, id);
self.dfa.set_start_state(anchored, Start::LineLF, id);
self.dfa.set_start_state(anchored, Start::LineCR, id);
self.dfa.set_start_state(
anchored,
Start::CustomLineTerminator,
id,
);
} else {
let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
self.dfa.set_start_state(anchored, Start::Text, id);
if is_new {
dfa_state_ids.push(id);
}
let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
self.dfa.set_start_state(anchored, Start::LineLF, id);
if is_new {
dfa_state_ids.push(id);
}
let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
self.dfa.set_start_state(anchored, Start::LineCR, id);
if is_new {
dfa_state_ids.push(id);
}
let (id, is_new) =
self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
self.dfa.set_start_state(
anchored,
Start::CustomLineTerminator,
id,
);
if is_new {
dfa_state_ids.push(id);
}
}
Ok(())
}
/// Add a new DFA start state corresponding to the given starting NFA
/// state, and the starting search configuration. (The starting search
/// configuration essentially tells us which look-behind assertions are
/// true for this particular state.)
///
/// The boolean returned indicates whether the state ID returned is a newly
/// created state, or a previously cached state.
fn add_one_start(
&mut self,
nfa_start: StateID,
start: Start,
) -> Result<(StateID, bool), BuildError> {
// Compute the look-behind assertions that are true in this starting
// configuration, and then determine the epsilon closure. While
// computing the epsilon closure, we only follow conditional epsilon
// transitions that satisfy the look-behind assertions in 'look_have'.
let mut builder_matches = self.get_state_builder().into_matches();
util::determinize::set_lookbehind_from_start(
self.nfa,
&start,
&mut builder_matches,
);
self.sparses.set1.clear();
util::determinize::epsilon_closure(
self.nfa,
nfa_start,
builder_matches.look_have(),
&mut self.stack,
&mut self.sparses.set1,
);
let mut builder = builder_matches.into_nfa();
util::determinize::add_nfa_states(
&self.nfa,
&self.sparses.set1,
&mut builder,
);
self.maybe_add_state(builder)
}
/// Adds the given state to the DFA being built depending on whether it
/// already exists in this determinizer's cache.
///
/// If it does exist, then the memory used by 'state' is put back into the
/// determinizer and the previously created state's ID is returned. (Along
/// with 'false', indicating that no new state was added.)
///
/// If it does not exist, then the state is added to the DFA being built
/// and a fresh ID is allocated (if ID allocation fails, then an error is
/// returned) and returned. (Along with 'true', indicating that a new state
/// was added.)
fn maybe_add_state(
&mut self,
builder: StateBuilderNFA,
) -> Result<(StateID, bool), BuildError> {
if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
// Since we have a cached state, put the constructed state's
// memory back into our scratch space, so that it can be reused.
self.put_state_builder(builder);
return Ok((cached_id, false));
}
self.add_state(builder).map(|sid| (sid, true))
}
/// Add the given state to the DFA and make it available in the cache.
///
/// The state initially has no transitions. That is, it transitions to the
/// dead state for all possible inputs, and transitions to the quit state
/// for all quit bytes.
///
/// If adding the state would exceed the maximum value for StateID, then an
/// error is returned.
fn add_state(
&mut self,
builder: StateBuilderNFA,
) -> Result<StateID, BuildError> {
let id = self.dfa.add_empty_state()?;
if !self.config.quit.is_empty() {
for b in self.config.quit.iter() {
self.dfa.set_transition(
id,
alphabet::Unit::u8(b),
self.dfa.quit_id(),
);
}
}
let state = builder.to_state();
// States use reference counting internally, so we only need to count
// their memory usage once.
self.memory_usage_state += state.memory_usage();
self.builder_states.push(state.clone());
self.cache.insert(state, id);
self.put_state_builder(builder);
if let Some(limit) = self.config.dfa_size_limit {
if self.dfa.memory_usage() > limit {
return Err(BuildError::dfa_exceeded_size_limit(limit));
}
}
if let Some(limit) = self.config.determinize_size_limit {
if self.memory_usage() > limit {
return Err(BuildError::determinize_exceeded_size_limit(
limit,
));
}
}
Ok(id)
}
/// Returns a state builder from this determinizer that might have existing
/// capacity. This helps avoid allocs in cases where a state is built that
/// turns out to already be cached.
///
/// Callers must put the state builder back with 'put_state_builder',
/// otherwise the allocation reuse won't work.
fn get_state_builder(&mut self) -> StateBuilderEmpty {
core::mem::replace(
&mut self.scratch_state_builder,
StateBuilderEmpty::new(),
)
}
/// Puts the given state builder back into this determinizer for reuse.
///
/// Note that building a 'State' from a builder always creates a new
/// alloc, so callers should always put the builder back.
fn put_state_builder(&mut self, builder: StateBuilderNFA) {
let _ = core::mem::replace(
&mut self.scratch_state_builder,
builder.clear(),
);
}
/// Return the memory usage, in bytes, of this determinizer at the current
/// point in time. This does not include memory used by the NFA or the
/// dense DFA itself.
fn memory_usage(&self) -> usize {
use core::mem::size_of;
self.builder_states.len() * size_of::<State>()
// Maps likely use more memory than this, but it's probably close.
+ self.cache.len() * (size_of::<State>() + size_of::<StateID>())
+ self.memory_usage_state
+ self.stack.capacity() * size_of::<StateID>()
+ self.scratch_state_builder.capacity()
}
}

463
vendor/regex-automata/src/dfa/minimize.rs vendored Normal file

@@ -0,0 +1,463 @@
use core::{cell::RefCell, fmt, mem};
use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec};
use crate::{
dfa::{automaton::Automaton, dense, DEAD},
util::{
alphabet,
primitives::{PatternID, StateID},
},
};
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
/// The algorithm implemented here is mostly taken from Wikipedia:
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
///
/// This code has had some light optimization attention paid to it,
/// particularly in the form of reducing allocation as much as possible.
/// However, it is still generally slow. Future optimization work should
/// probably focus on the bigger picture rather than micro-optimizations. For
/// example:
///
/// 1. Figure out how to more intelligently create initial partitions. That is,
/// Hopcroft's algorithm starts by creating two partitions of DFA states
/// that are known to NOT be equivalent: match states and non-match states.
/// The algorithm proceeds by progressively refining these partitions into
/// smaller partitions. If we could start with more partitions, then we
/// could reduce the amount of work that Hopcroft's algorithm needs to do.
/// 2. For every partition that we visit, we find all incoming transitions to
/// every state in the partition for *every* element in the alphabet. (This
/// is why using byte classes can significantly decrease minimization times,
/// since byte classes shrink the alphabet.) This is quite costly and there
/// is perhaps some redundant work being performed depending on the specific
/// states in the set. For example, we might be able to only visit some
/// elements of the alphabet based on the transitions.
/// 3. Move parts of minimization into determinization. If minimization has
/// fewer states to deal with, then it should run faster. A prime example
/// of this might be large Unicode classes, which are generated in a way that
/// can create a lot of redundant states. (Some work has been done on this
/// point during NFA compilation via the algorithm described in the
/// "Incremental Construction of MinimalAcyclic Finite-State Automata"
/// paper.)
pub(crate) struct Minimizer<'a> {
dfa: &'a mut dense::OwnedDFA,
in_transitions: Vec<Vec<Vec<StateID>>>,
partitions: Vec<StateSet>,
waiting: Vec<StateSet>,
}
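// A worked micro-example of the refinement step described above (a sketch,
// not tied to the code below): take a DFA over {a} with states {1, 2, 3},
// where 3 is the only match state, 1 -a-> 2, 2 -a-> 3 and 3 -a-> 3. The
// initial partitions are {3} (match) and {1, 2} (non-match). Processing the
// waiting set {3} for byte 'a' finds the incoming set {2, 3}; intersecting
// it with the partition {1, 2} splits that partition into {2} and {1}. No
// further splits are possible, so the DFA is already minimal with three
// states.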
impl<'a> fmt::Debug for Minimizer<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Minimizer")
.field("dfa", &self.dfa)
.field("in_transitions", &self.in_transitions)
.field("partitions", &self.partitions)
.field("waiting", &self.waiting)
.finish()
}
}
/// A set of states. A state set makes up a single partition in Hopcroft's
/// algorithm.
///
/// It is represented by an ordered set of state identifiers. We use shared
/// ownership so that a single state set can be in both the set of partitions
/// and in the set of waiting sets simultaneously without an additional
/// allocation. Generally, once a state set is built, it becomes immutable.
///
/// We use this representation because it avoids the overhead of more
/// traditional set data structures (HashSet/BTreeSet), and also because
/// computing intersection/subtraction on this representation is especially
/// fast.
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
struct StateSet {
ids: Rc<RefCell<Vec<StateID>>>,
}
impl<'a> Minimizer<'a> {
pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
let in_transitions = Minimizer::incoming_transitions(dfa);
let partitions = Minimizer::initial_partitions(dfa);
let waiting = partitions.clone();
Minimizer { dfa, in_transitions, partitions, waiting }
}
pub fn run(mut self) {
let stride2 = self.dfa.stride2();
let as_state_id = |index: usize| -> StateID {
StateID::new(index << stride2).unwrap()
};
let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };
let mut incoming = StateSet::empty();
let mut scratch1 = StateSet::empty();
let mut scratch2 = StateSet::empty();
let mut newparts = vec![];
// This loop is basically Hopcroft's algorithm. Everything else is just
// shuffling data around to fit our representation.
while let Some(set) = self.waiting.pop() {
for b in self.dfa.byte_classes().iter() {
self.find_incoming_to(b, &set, &mut incoming);
// If incoming is empty, then the intersection with any other
// set must also be empty. So 'newparts' just ends up being
// 'self.partitions'. So there's no need to go through the loop
// below.
//
// This actually turns out to be a rather large optimization. On
// the order of making minimization 4-5x faster. It's likely
// that the vast majority of all states have very few incoming
// transitions.
if incoming.is_empty() {
continue;
}
for p in 0..self.partitions.len() {
self.partitions[p].intersection(&incoming, &mut scratch1);
if scratch1.is_empty() {
newparts.push(self.partitions[p].clone());
continue;
}
self.partitions[p].subtract(&incoming, &mut scratch2);
if scratch2.is_empty() {
newparts.push(self.partitions[p].clone());
continue;
}
let (x, y) =
(scratch1.deep_clone(), scratch2.deep_clone());
newparts.push(x.clone());
newparts.push(y.clone());
match self.find_waiting(&self.partitions[p]) {
Some(i) => {
self.waiting[i] = x;
self.waiting.push(y);
}
None => {
if x.len() <= y.len() {
self.waiting.push(x);
} else {
self.waiting.push(y);
}
}
}
}
newparts = mem::replace(&mut self.partitions, newparts);
newparts.clear();
}
}
// At this point, we now have a minimal partitioning of states, where
// each partition is an equivalence class of DFA states. Now we need to
// use this partitioning to update the DFA to only contain one state for
// each partition.
// Create a map from DFA state ID to the representative ID of the
// equivalence class to which it belongs. The representative ID of an
// equivalence class of states is the minimum ID in that class.
let mut state_to_part = vec![DEAD; self.dfa.state_len()];
for p in &self.partitions {
p.iter(|id| state_to_part[as_index(id)] = p.min());
}
// Generate a new contiguous sequence of IDs for minimal states, and
// create a map from equivalence IDs to the new IDs. Thus, the new
// minimal ID of *any* state in the unminimized DFA can be obtained
// with minimal_ids[state_to_part[old_id]].
let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
let mut new_index = 0;
for state in self.dfa.states() {
if state_to_part[as_index(state.id())] == state.id() {
minimal_ids[as_index(state.id())] = as_state_id(new_index);
new_index += 1;
}
}
// The total number of states in the minimal DFA.
let minimal_count = new_index;
// Convenience function for remapping state IDs. This takes an old ID,
// looks up its Hopcroft partition and then maps that to the new ID
// range.
let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];
// Re-map this DFA in place such that the only states remaining
// correspond to the representative states of every equivalence class.
for id in (0..self.dfa.state_len()).map(as_state_id) {
// If this state isn't a representative for an equivalence class,
// then we skip it since it won't appear in the minimal DFA.
if state_to_part[as_index(id)] != id {
continue;
}
self.dfa.remap_state(id, remap);
self.dfa.swap_states(id, minimal_ids[as_index(id)]);
}
// Trim off all unused states from the pre-minimized DFA. This
// represents all states that were merged into a non-singleton
// equivalence class of states, and appeared after the first state
// in each such class. (Because the state with the smallest ID in each
// equivalence class is its representative ID.)
self.dfa.truncate_states(minimal_count);
// Update the new start states, which is now just the minimal ID of
// whatever state the old start state was collapsed into. Also, we
// collect everything beforehand to work around the borrow checker.
// We're already allocating so much that this is probably fine. If this
// turns out to be costly, then I guess add a `starts_mut` iterator.
let starts: Vec<_> = self.dfa.starts().collect();
for (old_start_id, anchored, start_type) in starts {
self.dfa.set_start_state(
anchored,
start_type,
remap(old_start_id),
);
}
// Update the match state pattern ID list for multi-regexes. All we
// need to do is remap the match state IDs. The pattern ID lists are
// always the same as they were since match states with distinct
// pattern ID lists are always considered distinct states.
let mut pmap = BTreeMap::new();
for (match_id, pattern_ids) in self.dfa.pattern_map() {
let new_id = remap(match_id);
pmap.insert(new_id, pattern_ids);
}
// This unwrap is OK because minimization never increases the number of
// match states or patterns in those match states. Since minimization
// runs after the pattern map has already been set at least once, we
// know that our match states cannot error.
self.dfa.set_pattern_map(&pmap).unwrap();
// In order to update the ID of the maximum match state, we need to
// find the maximum ID among all of the match states in the minimized
// DFA. This is not necessarily the new ID of the unminimized maximum
// match state, since that could have been collapsed with a much
// earlier match state. Therefore, to find the new max match state,
// we iterate over all previous match states, find their corresponding
// new minimal ID, and take the maximum of those.
let old = self.dfa.special().clone();
let new = self.dfa.special_mut();
// ... but only remap if we had match states.
if old.matches() {
new.min_match = StateID::MAX;
new.max_match = StateID::ZERO;
for i in as_index(old.min_match)..=as_index(old.max_match) {
let new_id = remap(as_state_id(i));
if new_id < new.min_match {
new.min_match = new_id;
}
if new_id > new.max_match {
new.max_match = new_id;
}
}
}
// ... same, but for start states.
if old.starts() {
new.min_start = StateID::MAX;
new.max_start = StateID::ZERO;
for i in as_index(old.min_start)..=as_index(old.max_start) {
let new_id = remap(as_state_id(i));
if new_id == DEAD {
continue;
}
if new_id < new.min_start {
new.min_start = new_id;
}
if new_id > new.max_start {
new.max_start = new_id;
}
}
if new.max_start == DEAD {
new.min_start = DEAD;
}
}
new.quit_id = remap(new.quit_id);
new.set_max();
}
fn find_waiting(&self, set: &StateSet) -> Option<usize> {
self.waiting.iter().position(|s| s == set)
}
fn find_incoming_to(
&self,
b: alphabet::Unit,
set: &StateSet,
incoming: &mut StateSet,
) {
incoming.clear();
set.iter(|id| {
for &inid in
&self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
{
incoming.add(inid);
}
});
incoming.canonicalize();
}
fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
// For match states, we know that two match states with different
// pattern ID lists will *always* be distinct, so we can partition them
// initially based on that.
let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
let mut is_quit = StateSet::empty();
let mut no_match = StateSet::empty();
for state in dfa.states() {
if dfa.is_match_state(state.id()) {
let mut pids = vec![];
for i in 0..dfa.match_len(state.id()) {
pids.push(dfa.match_pattern(state.id(), i));
}
matching
.entry(pids)
.or_insert(StateSet::empty())
.add(state.id());
} else if dfa.is_quit_state(state.id()) {
is_quit.add(state.id());
} else {
no_match.add(state.id());
}
}
let mut sets: Vec<StateSet> =
matching.into_iter().map(|(_, set)| set).collect();
sets.push(no_match);
sets.push(is_quit);
sets
}
fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
let mut incoming = vec![];
for _ in dfa.states() {
incoming.push(vec![vec![]; dfa.alphabet_len()]);
}
for state in dfa.states() {
for (b, next) in state.transitions() {
incoming[dfa.to_index(next)][b.as_usize()].push(state.id());
}
}
incoming
}
}
impl StateSet {
fn empty() -> StateSet {
StateSet { ids: Rc::new(RefCell::new(vec![])) }
}
fn add(&mut self, id: StateID) {
self.ids.borrow_mut().push(id);
}
fn min(&self) -> StateID {
self.ids.borrow()[0]
}
fn canonicalize(&mut self) {
self.ids.borrow_mut().sort();
self.ids.borrow_mut().dedup();
}
fn clear(&mut self) {
self.ids.borrow_mut().clear();
}
fn len(&self) -> usize {
self.ids.borrow().len()
}
fn is_empty(&self) -> bool {
self.len() == 0
}
fn deep_clone(&self) -> StateSet {
let ids = self.ids.borrow().iter().cloned().collect();
StateSet { ids: Rc::new(RefCell::new(ids)) }
}
fn iter<F: FnMut(StateID)>(&self, mut f: F) {
for &id in self.ids.borrow().iter() {
f(id);
}
}
fn intersection(&self, other: &StateSet, dest: &mut StateSet) {
dest.clear();
if self.is_empty() || other.is_empty() {
return;
}
let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
loop {
if a == b {
dest.add(a);
a = match ita.next() {
None => break,
Some(a) => a,
};
b = match itb.next() {
None => break,
Some(b) => b,
};
} else if a < b {
a = match ita.next() {
None => break,
Some(a) => a,
};
} else {
b = match itb.next() {
None => break,
Some(b) => b,
};
}
}
}
fn subtract(&self, other: &StateSet, dest: &mut StateSet) {
dest.clear();
if self.is_empty() || other.is_empty() {
self.iter(|s| dest.add(s));
return;
}
let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
loop {
if a == b {
a = match ita.next() {
None => break,
Some(a) => a,
};
b = match itb.next() {
None => {
dest.add(a);
break;
}
Some(b) => b,
};
} else if a < b {
dest.add(a);
a = match ita.next() {
None => break,
Some(a) => a,
};
} else {
b = match itb.next() {
None => {
dest.add(a);
break;
}
Some(b) => b,
};
}
}
for a in ita {
dest.add(a);
}
}
}
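// An illustrative test of the sorted-set merge routines above; a sketch of
// how `intersection` and `subtract` behave once sets have been canonicalized
// (sorted and deduplicated).
#[cfg(test)]
mod state_set_sketch_tests {
    use alloc::vec::Vec;

    use super::StateSet;
    use crate::util::primitives::StateID;

    fn set(ids: &[usize]) -> StateSet {
        let mut s = StateSet::empty();
        for &id in ids {
            s.add(StateID::new(id).unwrap());
        }
        s.canonicalize();
        s
    }

    fn ids(s: &StateSet) -> Vec<usize> {
        let mut out = Vec::new();
        s.iter(|id| out.push(id.as_usize()));
        out
    }

    #[test]
    fn intersection_and_subtraction() {
        let a = set(&[1, 3, 5, 7]);
        let b = set(&[3, 4, 5, 6]);
        let mut dest = StateSet::empty();

        a.intersection(&b, &mut dest);
        assert_eq!(ids(&dest), [3usize, 5]);

        a.subtract(&b, &mut dest);
        assert_eq!(ids(&dest), [1usize, 7]);
    }
}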

360
vendor/regex-automata/src/dfa/mod.rs vendored Normal file

@@ -0,0 +1,360 @@
/*!
A module for building and searching with deterministic finite automata (DFAs).
Like other modules in this crate, DFAs support a rich regex syntax with Unicode
features. DFAs also have extensive options for configuring the best space vs
time trade off for your use case and provide support for cheap deserialization
of automata for use in `no_std` environments.
If you're looking for lazy DFAs that build themselves incrementally during
search, then please see the top-level [`hybrid` module](crate::hybrid).
# Overview
This section gives a brief overview of the primary types in this module:
* A [`regex::Regex`] provides a way to search for matches of a regular
expression using DFAs. This includes iterating over matches with both the start
and end positions of each match.
* A [`dense::DFA`] provides low level access to a DFA that uses a dense
representation (uses lots of space, but fast searching).
* A [`sparse::DFA`] provides the same API as a `dense::DFA`, but uses a sparse
representation (uses less space, but slower searching).
* An [`Automaton`] trait that defines an interface that both dense and sparse
DFAs implement. (A `regex::Regex` is generic over this trait.)
* Both dense DFAs and sparse DFAs support serialization to raw bytes (e.g.,
[`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g.,
[`dense::DFA::from_bytes`]).
There is also a [`onepass`] module that provides a [one-pass
DFA](onepass::DFA). The unique advantage of this DFA is that, for the class
of regexes it can be built with, it supports reporting the spans of matching
capturing groups. It is the only DFA in this crate capable of such a thing.
# Example: basic regex searching
This example shows how to compile a regex using the default configuration
and then use it to find matches in a byte string:
```
use regex_automata::{Match, dfa::regex::Regex};
let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = re.find_iter(text).collect();
assert_eq!(matches, vec![
Match::must(0, 0..10),
Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
# Example: searching with regex sets
The DFAs in this module all fully support searching with multiple regexes
simultaneously. You can use this support with standard leftmost-first style
searching to find non-overlapping matches:
```
# if cfg!(miri) { return Ok(()); } // miri takes too long
use regex_automata::{Match, dfa::regex::Regex};
let re = Regex::new_many(&[r"\w+", r"\S+"])?;
let text = b"@foo bar";
let matches: Vec<Match> = re.find_iter(text).collect();
assert_eq!(matches, vec![
Match::must(1, 0..4),
Match::must(0, 5..8),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
# Example: use sparse DFAs
By default, compiling a regex will use dense DFAs internally. This uses more
memory, but executes searches more quickly. If you can abide slower searches
(somewhere around 3-5x), then sparse DFAs might make more sense since they can
use significantly less space.
Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
`Regex::new`:
```
use regex_automata::{Match, dfa::regex::Regex};
let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = re.find_iter(text).collect();
assert_eq!(matches, vec![
Match::must(0, 0..10),
Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
If you already have dense DFAs for some reason, they can be converted to sparse
DFAs and used to build a new `Regex`. For example:
```
use regex_automata::{Match, dfa::regex::Regex};
let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let sparse_re = Regex::builder().build_from_dfas(
dense_re.forward().to_sparse()?,
dense_re.reverse().to_sparse()?,
);
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = sparse_re.find_iter(text).collect();
assert_eq!(matches, vec![
Match::must(0, 0..10),
Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
# Example: deserialize a DFA
This shows how to first serialize a DFA into raw bytes, and then deserialize
those raw bytes back into a DFA. While this particular example is a
bit contrived, this same technique can be used in your program to
deserialize a DFA at start up time or by memory mapping a file.
```
use regex_automata::{Match, dfa::{dense, regex::Regex}};
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
// serialize both the forward and reverse DFAs, see note below
let (fwd_bytes, fwd_pad) = re1.forward().to_bytes_native_endian();
let (rev_bytes, rev_pad) = re1.reverse().to_bytes_native_endian();
// now deserialize both---we need to specify the correct type!
let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes[fwd_pad..])?.0;
let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes[rev_pad..])?.0;
// finally, reconstruct our regex
let re2 = Regex::builder().build_from_dfas(fwd, rev);
// we can use it like normal
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = re2.find_iter(text).collect();
assert_eq!(matches, vec![
Match::must(0, 0..10),
Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
There are a few points worth noting here:
* We need to extract the raw DFAs used by the regex and serialize those. You
can build the DFAs manually yourself using [`dense::Builder`], but using
the DFAs from a `Regex` guarantees that the DFAs are built correctly. (In
particular, a `Regex` constructs a reverse DFA for finding the starting
location of matches.)
* To convert the DFA to raw bytes, we use the `to_bytes_native_endian` method.
In practice, you'll want to use either [`dense::DFA::to_bytes_little_endian`]
or [`dense::DFA::to_bytes_big_endian`], depending on which platform you're
deserializing your DFA from. If you intend to deserialize on either platform,
then you'll need to serialize both and deserialize the right one depending on
your target's endianness (see the sketch after these notes).
* Safely deserializing a DFA requires verifying the raw bytes, particularly if
they are untrusted, since an invalid DFA could cause logical errors, panics
or even undefined behavior. This verification step requires visiting all of
the transitions in the DFA, which can be costly. If cheaper verification is
desired, then [`dense::DFA::from_bytes_unchecked`] is available, which only does
verification that can be performed in constant time. However, this routine can
only be used if the caller can guarantee that the provided bytes encode a
valid DFA.
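As a brief sketch of the endianness point above (not part of the original
docs), one way to handle both targets is to serialize both byte orders and
branch on the deserializing target's endianness:
```
use regex_automata::{dfa::{dense, regex::Regex, Automaton}, Input};
let re = Regex::new(r"[0-9]+")?;
// Serialize the forward DFA in both byte orders ahead of time.
let (le_bytes, le_pad) = re.forward().to_bytes_little_endian();
let (be_bytes, be_pad) = re.forward().to_bytes_big_endian();
// At deserialization time, pick whichever matches the target.
let fwd: dense::DFA<&[u32]> = if cfg!(target_endian = "little") {
    dense::DFA::from_bytes(&le_bytes[le_pad..])?.0
} else {
    dense::DFA::from_bytes(&be_bytes[be_pad..])?.0
};
assert!(fwd.try_search_fwd(&Input::new("abc123"))?.is_some());
# Ok::<(), Box<dyn std::error::Error>>(())
```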
The same process can be achieved with sparse DFAs as well:
```
use regex_automata::{Match, dfa::{sparse, regex::Regex}};
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
// serialize both
let fwd_bytes = re1.forward().to_sparse()?.to_bytes_native_endian();
let rev_bytes = re1.reverse().to_sparse()?.to_bytes_native_endian();
// now deserialize both---we need to specify the correct type!
let fwd: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&fwd_bytes)?.0;
let rev: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&rev_bytes)?.0;
// finally, reconstruct our regex
let re2 = Regex::builder().build_from_dfas(fwd, rev);
// we can use it like normal
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = re2.find_iter(text).collect();
assert_eq!(matches, vec![
Match::must(0, 0..10),
Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
Conversely, dense DFAs must be aligned to the same alignment as a
[`StateID`](crate::util::primitives::StateID).
# Support for `no_std` and `alloc`-only
This crate comes with `alloc` and `std` features that are enabled by default.
When the `alloc` or `std` features are enabled, the API of this module will
include the facilities necessary for compiling, serializing, deserializing
and searching with DFAs. When only the `alloc` feature is enabled, then
implementations of the `std::error::Error` trait are dropped, but everything
else generally remains the same. When both the `alloc` and `std` features are
disabled, the API of this module will shrink such that it only includes the
facilities necessary for deserializing and searching with DFAs.
The intended workflow for `no_std` environments is thus as follows:
* Write a program with the `alloc` or `std` features that compiles and
serializes a regular expression. You may need to serialize both little and big
endian versions of each DFA. (So that's 4 DFAs in total for each regex.)
* In your `no_std` environment, follow the examples above for deserializing
your previously serialized DFAs into regexes. You can then search with them as
you would any regex.
Deserialization can happen anywhere. For example, with bytes embedded into a
binary or with a file memory mapped at runtime.
The `regex-cli` command (found in the same repository as this crate) can be
used to serialize DFAs to files and generate Rust code to read them.
# Syntax
This module supports the same syntax as the `regex` crate, since they share the
same parser. You can find an exhaustive list of supported syntax in the
[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
There are two things that are not supported by the DFAs in this module:
* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
of them) can only find the offsets of an entire match, but cannot resolve
the offsets of each capturing group. This is because DFAs do not have the
expressive power necessary.
* Unicode word boundaries. These present particularly difficult challenges for
DFA construction and would result in an explosion in the number of states.
One can enable [`dense::Config::unicode_word_boundary`] though, which provides
heuristic support for Unicode word boundaries that only works on ASCII text.
Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
on any input.
There are no plans to lift either of these limitations.
Note that these restrictions are identical to the restrictions on lazy DFAs.
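To illustrate the ASCII word boundary workaround mentioned above, here is a
small sketch (not part of the original docs):
```
use regex_automata::{Match, dfa::regex::Regex};
// `(?-u:\b)` is an ASCII word boundary, which these DFAs do support.
let re = Regex::new(r"(?-u:\b)\w+(?-u:\b)")?;
assert_eq!(Some(Match::must(0, 0..5)), re.find("hello world"));
# Ok::<(), Box<dyn std::error::Error>>(())
```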
# Differences with general purpose regexes
The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
general purpose regular expression engine. It aims to automatically balance low
compile times, fast search times and low memory usage, while also providing
a convenient API for users. In contrast, this module provides a lower level
regular expression interface based exclusively on DFAs that is a bit less
convenient while providing more explicit control over memory usage and search
times.
Here are some specific negative differences:
* **Compilation can take an exponential amount of time and space** in the size
of the regex pattern. While most patterns do not exhibit worst case exponential
time, such patterns do exist. For example, `[01]*1[01]{N}` will build a DFA
with approximately `2^(N+2)` states. For this reason, untrusted patterns should
not be compiled with this module. (In the future, the API may expose an option
to return an error if the DFA gets too big.)
* This module does not support sub-match extraction via capturing groups, which
can be achieved with the regex crate's "captures" API.
* While the regex crate doesn't necessarily sport fast compilation times,
the regexes in this module are almost universally slow to compile, especially
when they contain large Unicode character classes. For example, on my system,
compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling
a sparse regex takes about the same time but only uses about 1.2MB of
memory.) Conversely, compiling the same regex without Unicode support, e.g.,
`(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this
reason, you should only use Unicode character classes if you absolutely need
them! (They are enabled by default though.)
* This module does not support Unicode word boundaries. ASCII word boundaries
may be used though by disabling Unicode or selectively doing so in the syntax,
e.g., `(?-u:\b)`. There is also an option to
[heuristically enable Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary),
where the corresponding DFA will give up if any non-ASCII byte is seen.
* As a lower level API, this module does not do literal optimizations
automatically, although it does provide hooks in its API to make use of the
[`Prefilter`](crate::util::prefilter::Prefilter) trait. Missing literal
optimizations means that searches may run much slower than what you're
accustomed to, although it does provide more predictable and consistent
performance.
* There is no `&str` API like in the regex crate. In this module, all APIs
operate on `&[u8]`. By default, match indices are
guaranteed to fall on UTF-8 boundaries, unless either of
[`syntax::Config::utf8`](crate::util::syntax::Config::utf8) or
[`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) are disabled.
With some of the downsides out of the way, here are some positive differences:
* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
deserialized. Deserialization can be done in constant time with the unchecked
APIs, since searching can be performed directly on the raw serialized bytes of
a DFA.
* This module was specifically designed so that the searching phase of a
DFA has minimal runtime requirements, and can therefore be used in `no_std`
environments. While `no_std` environments cannot compile regexes, they can
deserialize pre-compiled regexes.
* Since this module builds DFAs ahead of time, it will generally out-perform
the `regex` crate on equivalent tasks. The performance difference is likely
not large. However, because of a complex set of optimizations in the regex
crate (like literal optimizations), an accurate performance comparison may be
difficult to do.
* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
performance a small amount, but uses much less storage space. Potentially even
less than what the regex crate uses.
* This module exposes DFAs directly, such as [`dense::DFA`] and
[`sparse::DFA`], which enables one to do less work in some cases. For example,
if you only need the end of a match and not the start of a match, then you can
use a DFA directly without building a `Regex`, which always requires a second
DFA to find the start of a match.
* This module provides more control over memory usage. Aside from choosing
between dense and sparse DFAs, one can also choose a smaller state identifier
representation to use less space. Also, one can enable DFA minimization
via [`dense::Config::minimize`], but it can increase compilation times
dramatically.
*/
#[cfg(feature = "dfa-search")]
pub use crate::dfa::{
automaton::{Automaton, OverlappingState, StartError},
start::StartKind,
};
/// This is an alias for a state ID of zero. It has special significance
/// because it always corresponds to the first state in a DFA, and the first
/// state in a DFA is always "dead." That is, the dead state always has all
/// of its transitions set to itself. Moreover, the dead state is used as a
/// sentinel for various things. e.g., In search, reaching a dead state means
/// that the search must stop.
const DEAD: crate::util::primitives::StateID =
crate::util::primitives::StateID::ZERO;
#[cfg(feature = "dfa-search")]
pub mod dense;
#[cfg(feature = "dfa-onepass")]
pub mod onepass;
#[cfg(feature = "dfa-search")]
pub mod regex;
#[cfg(feature = "dfa-search")]
pub mod sparse;
#[cfg(feature = "dfa-search")]
pub(crate) mod accel;
#[cfg(feature = "dfa-search")]
mod automaton;
#[cfg(feature = "dfa-build")]
mod determinize;
#[cfg(feature = "dfa-build")]
mod minimize;
#[cfg(any(feature = "dfa-build", feature = "dfa-onepass"))]
mod remapper;
#[cfg(feature = "dfa-search")]
mod search;
#[cfg(feature = "dfa-search")]
mod special;
#[cfg(feature = "dfa-search")]
mod start;

3192
vendor/regex-automata/src/dfa/onepass.rs vendored Normal file

File diff suppressed because it is too large

871
vendor/regex-automata/src/dfa/regex.rs vendored Normal file
View File

@@ -0,0 +1,871 @@
/*!
A DFA-backed `Regex`.
This module provides [`Regex`], which is defined generically over the
[`Automaton`] trait. A `Regex` implements convenience routines you might have
come to expect, such as finding the start/end of a match and iterating over
all non-overlapping matches. This `Regex` type is limited in its capabilities
to what a DFA can provide. Therefore, APIs involving capturing groups, for
example, are not provided.
Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
finds the end offset of a match, whereas the other is a "reverse" DFA that
finds the start offset of a match.
See the [parent module](crate::dfa) for examples.
*/
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
#[cfg(feature = "dfa-build")]
use crate::dfa::dense::BuildError;
use crate::{
dfa::{automaton::Automaton, dense},
util::{iter, search::Input},
Anchored, Match, MatchError,
};
#[cfg(feature = "alloc")]
use crate::{
dfa::{sparse, StartKind},
util::search::MatchKind,
};
// When the alloc feature is enabled, the regex type sets its A type parameter
// to default to an owned dense DFA. But without alloc, we set no default. This
// makes things a lot more convenient in the common case, since writing out the
// DFA types is pretty annoying.
//
// Since we have two different definitions but only want to write one doc
// string, we use a macro to capture the doc and other attributes once and then
// repeat them for each definition.
macro_rules! define_regex_type {
($(#[$doc:meta])*) => {
#[cfg(feature = "alloc")]
$(#[$doc])*
pub struct Regex<A = dense::OwnedDFA> {
forward: A,
reverse: A,
}
#[cfg(not(feature = "alloc"))]
$(#[$doc])*
pub struct Regex<A> {
forward: A,
reverse: A,
}
};
}
define_regex_type!(
/// A regular expression that uses deterministic finite automata for fast
/// searching.
///
/// A regular expression is comprised of two DFAs, a "forward" DFA and a
/// "reverse" DFA. The forward DFA is responsible for detecting the end of
/// a match while the reverse DFA is responsible for detecting the start
/// of a match. Thus, in order to find the bounds of any given match, a
/// forward search must first be run followed by a reverse search. A match
/// found by the forward DFA guarantees that the reverse DFA will also find
/// a match.
///
/// The type of the DFA used by a `Regex` corresponds to the `A` type
/// parameter, which must satisfy the [`Automaton`] trait. Typically,
/// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
/// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
/// memory but search faster, while sparse DFAs use less memory but search
/// more slowly.
///
/// # Crate features
///
/// Note that despite what the documentation auto-generates, the _only_
/// crate feature needed to use this type is `dfa-search`. You do _not_
/// need to enable the `alloc` feature.
///
/// By default, a regex's automaton type parameter is set to
/// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
/// in-memory work loads, this is the most convenient type that gives the
/// best search performance. When the `alloc` feature is disabled, no
/// default type is used.
///
/// # When should I use this?
///
/// Generally speaking, if you can afford the overhead of building a full
/// DFA for your regex, and you don't need things like capturing groups,
/// then this is a good choice if you're looking to optimize for matching
/// speed. Note however that its speed may be worse than a general purpose
/// regex engine if you don't provide a [`dense::Config::prefilter`] to the
/// underlying DFA.
///
/// # Sparse DFAs
///
/// Since a `Regex` is generic over the [`Automaton`] trait, it can be
/// used with any kind of DFA. While this crate constructs dense DFAs by
/// default, it is easy enough to build corresponding sparse DFAs, and then
/// build a regex from them:
///
/// ```
/// use regex_automata::dfa::regex::Regex;
///
/// // First, build a regex that uses dense DFAs.
/// let dense_re = Regex::new("foo[0-9]+")?;
///
/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
/// let fwd = dense_re.forward().to_sparse()?;
/// let rev = dense_re.reverse().to_sparse()?;
///
/// // Third, build a new regex from the constituent sparse DFAs.
/// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
///
/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
/// assert_eq!(true, sparse_re.is_match(b"foo123"));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// Alternatively, one can use a [`Builder`] to construct a sparse DFA
/// more succinctly. (Note though that dense DFAs are still constructed
/// first internally, and then converted to sparse DFAs, as in the example
/// above.)
///
/// ```
/// use regex_automata::dfa::regex::Regex;
///
/// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
/// assert!(sparse_re.is_match(b"foo123"));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Fallibility
///
/// Most of the search routines defined on this type will _panic_ when the
/// underlying search fails. This might be because the DFA gave up because
/// it saw a quit byte, whether configured explicitly or via heuristic
/// Unicode word boundary support, although neither are enabled by default.
/// Or it might fail because an invalid `Input` configuration is given,
/// for example, with an unsupported [`Anchored`] mode.
///
/// If you need to handle these error cases instead of allowing them to
/// trigger a panic, then the lower level [`Regex::try_search`] provides
/// a fallible API that never panics.
///
/// # Example
///
/// This example shows how to cause a search to terminate if it sees a
/// `\n` byte, and handle the error returned. This could be useful if, for
/// example, you wanted to prevent a user supplied pattern from matching
/// across a line boundary.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
///
/// let re = Regex::builder()
/// .dense(dfa::dense::Config::new().quit(b'\n', true))
/// .build(r"foo\p{any}+bar")?;
///
/// let input = Input::new("foo\nbar");
/// // Normally this would produce a match, since \p{any} contains '\n'.
/// // But since we instructed the automaton to enter a quit state if a
/// // '\n' is observed, this produces a match error instead.
/// let expected = MatchError::quit(b'\n', 3);
/// let got = re.try_search(&input).unwrap_err();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
);
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
impl Regex {
/// Parse the given regular expression using the default configuration and
/// return the corresponding regex.
///
/// If you want a non-default configuration, then use the [`Builder`] to
/// set your own configuration.
///
/// # Example
///
/// ```
/// use regex_automata::{Match, dfa::regex::Regex};
///
/// let re = Regex::new("foo[0-9]+bar")?;
/// assert_eq!(
/// Some(Match::must(0, 3..14)),
/// re.find(b"zzzfoo12345barzzz"),
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn new(pattern: &str) -> Result<Regex, BuildError> {
Builder::new().build(pattern)
}
/// Like `new`, but parses multiple patterns into a single "regex set."
/// This similarly uses the default regex configuration.
///
/// # Example
///
/// ```
/// use regex_automata::{Match, dfa::regex::Regex};
///
/// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
///
/// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
/// assert_eq!(Some(Match::must(0, 0..3)), it.next());
/// assert_eq!(Some(Match::must(1, 4..5)), it.next());
/// assert_eq!(Some(Match::must(0, 6..9)), it.next());
/// assert_eq!(Some(Match::must(1, 10..14)), it.next());
/// assert_eq!(Some(Match::must(1, 15..16)), it.next());
/// assert_eq!(Some(Match::must(0, 17..21)), it.next());
/// assert_eq!(None, it.next());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn new_many<P: AsRef<str>>(
patterns: &[P],
) -> Result<Regex, BuildError> {
Builder::new().build_many(patterns)
}
}
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
impl Regex<sparse::DFA<Vec<u8>>> {
/// Parse the given regular expression using the default configuration,
/// except using sparse DFAs, and return the corresponding regex.
///
/// If you want a non-default configuration, then use the [`Builder`] to
/// set your own configuration.
///
/// # Example
///
/// ```
/// use regex_automata::{Match, dfa::regex::Regex};
///
/// let re = Regex::new_sparse("foo[0-9]+bar")?;
/// assert_eq!(
/// Some(Match::must(0, 3..14)),
/// re.find(b"zzzfoo12345barzzz"),
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn new_sparse(
pattern: &str,
) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
Builder::new().build_sparse(pattern)
}
/// Like `new`, but parses multiple patterns into a single "regex set"
/// using sparse DFAs. This otherwise similarly uses the default regex
/// configuration.
///
/// # Example
///
/// ```
/// use regex_automata::{Match, dfa::regex::Regex};
///
/// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
///
/// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
/// assert_eq!(Some(Match::must(0, 0..3)), it.next());
/// assert_eq!(Some(Match::must(1, 4..5)), it.next());
/// assert_eq!(Some(Match::must(0, 6..9)), it.next());
/// assert_eq!(Some(Match::must(1, 10..14)), it.next());
/// assert_eq!(Some(Match::must(1, 15..16)), it.next());
/// assert_eq!(Some(Match::must(0, 17..21)), it.next());
/// assert_eq!(None, it.next());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn new_many_sparse<P: AsRef<str>>(
patterns: &[P],
) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
Builder::new().build_many_sparse(patterns)
}
}
/// Convenience routines for regex construction.
impl Regex<dense::DFA<&'static [u32]>> {
/// Return a builder for configuring the construction of a `Regex`.
///
/// This is a convenience routine to avoid needing to import the
/// [`Builder`] type in common cases.
///
/// # Example
///
/// This example shows how to use the builder to disable UTF-8 mode
/// everywhere.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
/// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
/// };
///
/// let re = Regex::builder()
/// .syntax(syntax::Config::new().utf8(false))
/// .thompson(thompson::Config::new().utf8(false))
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
/// let expected = Some(Match::must(0, 1..9));
/// let got = re.find(haystack);
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn builder() -> Builder {
Builder::new()
}
}
/// Standard search routines for finding and iterating over matches.
impl<A: Automaton> Regex<A> {
/// Returns true if and only if this regex matches the given haystack.
///
/// This routine may short circuit if it knows that scanning future input
/// will never lead to a different result. In particular, if the underlying
/// DFA enters a match state or a dead state, then this routine will return
/// `true` or `false`, respectively, without inspecting any future input.
///
/// # Panics
///
/// This routine panics if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the DFA quitting.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search panics, callers cannot know whether a match exists or
/// not.
///
/// Use [`Regex::try_search`] if you want to handle these error conditions.
///
/// # Example
///
/// ```
/// use regex_automata::dfa::regex::Regex;
///
/// let re = Regex::new("foo[0-9]+bar")?;
/// assert_eq!(true, re.is_match("foo12345bar"));
/// assert_eq!(false, re.is_match("foobar"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
// Not only can we do an "earliest" search, but we can avoid doing a
// reverse scan too.
let input = input.into().earliest(true);
self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap()
}
/// Returns the start and end offset of the leftmost match. If no match
/// exists, then `None` is returned.
///
/// # Panics
///
/// This routine panics if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the DFA quitting.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search panics, callers cannot know whether a match exists or
/// not.
///
/// Use [`Regex::try_search`] if you want to handle these error conditions.
///
/// # Example
///
/// ```
/// use regex_automata::{Match, dfa::regex::Regex};
///
/// // Greediness is applied appropriately.
/// let re = Regex::new("foo[0-9]+")?;
/// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));
///
/// // Even though a match is found after reading the first byte (`a`),
/// // the default leftmost-first match semantics demand that we find the
/// // earliest match that prefers earlier parts of the pattern over later
/// // parts.
/// let re = Regex::new("abc|a")?;
/// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
self.try_search(&input.into()).unwrap()
}
/// Returns an iterator over all non-overlapping leftmost matches in the
/// given bytes. If no match exists, then the iterator yields no elements.
///
/// This corresponds to the "standard" regex search iterator.
///
/// # Panics
///
/// If the search returns an error during iteration, then iteration
/// panics. See [`Regex::find`] for the panic conditions.
///
/// Use [`Regex::try_search`] with
/// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
/// handle these error conditions.
///
/// # Example
///
/// ```
/// use regex_automata::{Match, dfa::regex::Regex};
///
/// let re = Regex::new("foo[0-9]+")?;
/// let text = "foo1 foo12 foo123";
/// let matches: Vec<Match> = re.find_iter(text).collect();
/// assert_eq!(matches, vec![
/// Match::must(0, 0..4),
/// Match::must(0, 5..10),
/// Match::must(0, 11..17),
/// ]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
&'r self,
input: I,
) -> FindMatches<'r, 'h, A> {
let it = iter::Searcher::new(input.into());
FindMatches { re: self, it }
}
}
/// Lower level fallible search routines that permit controlling where the
/// search starts and ends in a particular sequence.
impl<A: Automaton> Regex<A> {
/// Returns the start and end offset of the leftmost match. If no match
/// exists, then `None` is returned.
///
/// This is like [`Regex::find`] but with two differences:
///
/// 1. It is not generic over `Into<Input>` and instead accepts a
/// `&Input`. This permits reusing the same `Input` for multiple searches
/// without needing to create a new one. This _may_ help with latency.
/// 2. It returns an error if the search could not complete, whereas
/// [`Regex::find`] will panic.
///
/// # Errors
///
/// This routine errors if the search could not complete. This can occur
/// in the following circumstances:
///
/// * The configuration of the DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the DFA quitting.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search returns an error, callers cannot know whether a match
/// exists or not.
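///
/// # Example
///
/// A small sketch (not in the original docs) showing a fallible search with a
/// reusable `Input`:
///
/// ```
/// use regex_automata::{dfa::regex::Regex, Input, Match};
///
/// let re = Regex::new(r"[0-9]+")?;
/// let input = Input::new("abc 123 xyz");
/// // Unlike `Regex::find`, an error is returned instead of panicking.
/// let got = re.try_search(&input)?;
/// assert_eq!(Some(Match::must(0, 4..7)), got);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```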
#[inline]
pub fn try_search(
&self,
input: &Input<'_>,
) -> Result<Option<Match>, MatchError> {
let (fwd, rev) = (self.forward(), self.reverse());
let end = match fwd.try_search_fwd(input)? {
None => return Ok(None),
Some(end) => end,
};
// This special cases an empty match at the beginning of the search. If
// our end matches our start, then since a reverse DFA can't match past
// the start, it must follow that our starting position is also our end
// position. So short circuit and skip the reverse search.
if input.start() == end.offset() {
return Ok(Some(Match::new(
end.pattern(),
end.offset()..end.offset(),
)));
}
// We can also skip the reverse search if we know our search was
// anchored. This occurs either when the input config is anchored or
// when we know the regex itself is anchored. In this case, we know the
// start of the match, if one is found, must be the start of the
// search.
if self.is_anchored(input) {
return Ok(Some(Match::new(
end.pattern(),
input.start()..end.offset(),
)));
}
// N.B. I have tentatively convinced myself that it isn't necessary
// to specify the specific pattern for the reverse search since the
// reverse search will always find the same pattern to match as the
// forward search. But I lack a rigorous proof. Why not just provide
// the pattern anyway? Well, if it is needed, then leaving it out
// gives us a chance to find a witness. (Also, if we don't need to
// specify the pattern, then we don't need to build the reverse DFA
// with 'starts_for_each_pattern' enabled.)
//
// We also need to be careful to disable 'earliest' for the reverse
// search, since it could be enabled for the forward search. In the
// reverse case, to satisfy "leftmost" criteria, we need to match
// as much as we can. We also need to be careful to make the search
// anchored. We don't want the reverse search to report any matches
// other than the one beginning at the end of our forward search.
let revsearch = input
.clone()
.span(input.start()..end.offset())
.anchored(Anchored::Yes)
.earliest(false);
let start = rev
.try_search_rev(&revsearch)?
.expect("reverse search must match if forward search does");
assert_eq!(
start.pattern(),
end.pattern(),
"forward and reverse search must match same pattern",
);
assert!(start.offset() <= end.offset());
Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
}
/// Returns true if either the given input specifies an anchored search
/// or if the underlying DFA is always anchored.
fn is_anchored(&self, input: &Input<'_>) -> bool {
match input.get_anchored() {
Anchored::No => self.forward().is_always_start_anchored(),
Anchored::Yes | Anchored::Pattern(_) => true,
}
}
}
/// Non-search APIs for querying information about the regex and setting a
/// prefilter.
impl<A: Automaton> Regex<A> {
/// Return the underlying DFA responsible for forward matching.
///
/// This is useful for accessing the underlying DFA and converting it to
/// some other format or size. See the [`Builder::build_from_dfas`] docs
/// for an example of where this might be useful.
pub fn forward(&self) -> &A {
&self.forward
}
/// Return the underlying DFA responsible for reverse matching.
///
/// This is useful for accessing the underlying DFA and converting it to
/// some other format or size. See the [`Builder::build_from_dfas`] docs
/// for an example of where this might be useful.
pub fn reverse(&self) -> &A {
&self.reverse
}
/// Returns the total number of patterns matched by this regex.
///
/// # Example
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::dfa::regex::Regex;
///
/// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
/// assert_eq!(3, re.pattern_len());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn pattern_len(&self) -> usize {
assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
self.forward().pattern_len()
}
}
/// An iterator over all non-overlapping matches for an infallible search.
///
/// The iterator yields a [`Match`] value until no more matches could be found.
/// If the underlying regex engine returns an error, then a panic occurs.
///
/// The type parameters are as follows:
///
/// * `A` represents the type of the underlying DFA that implements the
/// [`Automaton`] trait.
///
/// The lifetime parameters are as follows:
///
/// * `'h` represents the lifetime of the haystack being searched.
/// * `'r` represents the lifetime of the regex object itself.
///
/// This iterator can be created with the [`Regex::find_iter`] method.
#[derive(Debug)]
pub struct FindMatches<'r, 'h, A> {
re: &'r Regex<A>,
it: iter::Searcher<'h>,
}
impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
type Item = Match;
#[inline]
fn next(&mut self) -> Option<Match> {
let FindMatches { re, ref mut it } = *self;
it.advance(|input| re.try_search(input))
}
}
/// A builder for a regex based on deterministic finite automatons.
///
/// This builder permits configuring options for the syntax of a pattern, the
/// NFA construction, the DFA construction and finally the regex searching
/// itself. This builder is different from a general purpose regex builder in
/// that it permits fine grained configuration of the construction process. The
/// trade off for this is complexity, and the possibility of setting a
/// configuration that might not make sense. For example, there are two
/// different UTF-8 modes:
///
/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
/// whether the pattern itself can contain sub-expressions that match invalid
/// UTF-8.
/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
/// how the regex iterators themselves advance the starting position of the
/// next search when a match with zero length is found.
///
/// Generally speaking, callers will want to either enable all of these or
/// disable all of these.
///
/// Internally, building a regex requires building two DFAs, where one is
/// responsible for finding the end of a match and the other is responsible
/// for finding the start of a match. If you only need to detect whether
/// something matched, or only the end of a match, then you should use a
/// [`dense::Builder`] to construct a single DFA, which is cheaper than
/// building two DFAs.
///
/// # Build methods
///
/// This builder has a few "build" methods. In general, it's the result of
/// combining the following parameters:
///
/// * Building one or many regexes.
/// * Building a regex with dense or sparse DFAs.
///
/// The simplest "build" method is [`Builder::build`]. It accepts a single
/// pattern and builds a dense DFA from it.
///
/// The most general "build" method is [`Builder::build_many`], which permits
/// building a regex that searches for multiple patterns simultaneously.
///
/// The most flexible "build" method, but hardest to use, is
/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
/// just a pair of DFAs, and this method allows you to specify those DFAs
/// exactly.
///
/// # Example
///
/// This example shows how to disable UTF-8 mode in the syntax and the regex
/// itself. This is generally what you want for matching on arbitrary bytes.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
/// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
/// };
///
/// let re = Regex::builder()
/// .syntax(syntax::Config::new().utf8(false))
/// .thompson(thompson::Config::new().utf8(false))
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
/// let expected = Some(Match::must(0, 1..9));
/// let got = re.find(haystack);
/// assert_eq!(expected, got);
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
/// // but the subsequent `.*` does not! Disabling UTF-8
/// // on the syntax permits this.
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct Builder {
#[cfg(feature = "dfa-build")]
dfa: dense::Builder,
}
impl Builder {
/// Create a new regex builder with the default configuration.
pub fn new() -> Builder {
Builder {
#[cfg(feature = "dfa-build")]
dfa: dense::Builder::new(),
}
}
/// Build a regex from the given pattern.
///
/// If there was a problem parsing or compiling the pattern, then an error
/// is returned.
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
self.build_many(&[pattern])
}
/// Build a regex from the given pattern using sparse DFAs.
///
/// If there was a problem parsing or compiling the pattern, then an error
/// is returned.
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn build_sparse(
&self,
pattern: &str,
) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
self.build_many_sparse(&[pattern])
}
/// Build a regex from the given patterns.
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn build_many<P: AsRef<str>>(
&self,
patterns: &[P],
) -> Result<Regex, BuildError> {
let forward = self.dfa.build_many(patterns)?;
let reverse = self
.dfa
.clone()
.configure(
dense::Config::new()
.prefilter(None)
.specialize_start_states(false)
.start_kind(StartKind::Anchored)
.match_kind(MatchKind::All),
)
.thompson(crate::nfa::thompson::Config::new().reverse(true))
.build_many(patterns)?;
Ok(self.build_from_dfas(forward, reverse))
}
/// Build a sparse regex from the given patterns.
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn build_many_sparse<P: AsRef<str>>(
&self,
patterns: &[P],
) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
let re = self.build_many(patterns)?;
let forward = re.forward().to_sparse()?;
let reverse = re.reverse().to_sparse()?;
Ok(self.build_from_dfas(forward, reverse))
}
/// Build a regex from its component forward and reverse DFAs.
///
/// This is useful when deserializing a regex from some arbitrary
/// memory region. This is also useful for building regexes from other
/// types of DFAs.
///
/// If you're building the DFAs from scratch instead of building new DFAs
/// from other DFAs, then you'll need to make sure that the reverse DFA is
/// configured correctly to match the intended semantics. Namely:
///
/// * It should be anchored.
/// * It should use [`MatchKind::All`] semantics.
/// * It should match in reverse.
/// * Otherwise, its configuration should match the forward DFA.
///
/// If these conditions aren't satisfied, then the behavior of searches is
/// unspecified.
///
/// Note that when using this constructor, no configuration is applied.
/// Since this routine provides the DFAs to the builder, there is no
/// opportunity to apply other configuration options.
///
/// # Example
///
/// This example is a bit contrived. The usual use of these methods
/// would involve serializing `initial_re` somewhere and then deserializing
/// it later to build a regex. But in this case, we do everything in
/// memory.
///
/// ```
/// use regex_automata::dfa::regex::Regex;
///
/// let initial_re = Regex::new("foo[0-9]+")?;
/// assert_eq!(true, initial_re.is_match(b"foo123"));
///
/// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
/// let re = Regex::builder().build_from_dfas(fwd, rev);
/// assert_eq!(true, re.is_match(b"foo123"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// This example shows how to build a `Regex` that uses sparse DFAs instead
/// of dense DFAs without using one of the convenience `build_sparse`
/// routines:
///
/// ```
/// use regex_automata::dfa::regex::Regex;
///
/// let initial_re = Regex::new("foo[0-9]+")?;
/// assert_eq!(true, initial_re.is_match(b"foo123"));
///
/// let fwd = initial_re.forward().to_sparse()?;
/// let rev = initial_re.reverse().to_sparse()?;
/// let re = Regex::builder().build_from_dfas(fwd, rev);
/// assert_eq!(true, re.is_match(b"foo123"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
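///
/// Finally, a sketch (not in the original docs) of building the forward and
/// reverse dense DFAs from scratch with [`dense::Builder`], configured as
/// described in the bullet points above:
///
/// ```
/// use regex_automata::{
///     dfa::{dense, regex::Regex, StartKind},
///     nfa::thompson,
///     Match, MatchKind,
/// };
///
/// let pattern = r"[a-z]+[0-9]+";
/// // The forward DFA uses the default configuration.
/// let fwd = dense::Builder::new().build(pattern)?;
/// // The reverse DFA is anchored, uses "all" match semantics and is built
/// // from a reversed NFA.
/// let rev = dense::Builder::new()
///     .configure(
///         dense::Config::new()
///             .start_kind(StartKind::Anchored)
///             .match_kind(MatchKind::All),
///     )
///     .thompson(thompson::Config::new().reverse(true))
///     .build(pattern)?;
/// let re = Regex::builder().build_from_dfas(fwd, rev);
/// assert_eq!(Some(Match::must(0, 3..9)), re.find("!!!abc123!!!"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```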
pub fn build_from_dfas<A: Automaton>(
&self,
forward: A,
reverse: A,
) -> Regex<A> {
Regex { forward, reverse }
}
/// Set the syntax configuration for this builder using
/// [`syntax::Config`](crate::util::syntax::Config).
///
/// This permits setting things like case insensitivity, Unicode and multi
/// line mode.
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn syntax(
&mut self,
config: crate::util::syntax::Config,
) -> &mut Builder {
self.dfa.syntax(config);
self
}
/// Set the Thompson NFA configuration for this builder using
/// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
///
/// This permits setting things like whether additional time should be
/// spent shrinking the size of the NFA.
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn thompson(
&mut self,
config: crate::nfa::thompson::Config,
) -> &mut Builder {
self.dfa.thompson(config);
self
}
/// Set the dense DFA compilation configuration for this builder using
/// [`dense::Config`].
///
/// This permits setting things like whether the underlying DFAs should
/// be minimized.
#[cfg(feature = "dfa-build")]
pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
self.dfa.configure(config);
self
}
}
impl Default for Builder {
fn default() -> Builder {
Builder::new()
}
}

242
vendor/regex-automata/src/dfa/remapper.rs vendored Normal file
View File

@@ -0,0 +1,242 @@
use alloc::vec::Vec;
use crate::util::primitives::StateID;
/// Remappable is a tightly coupled abstraction that facilitates remapping
/// state identifiers in DFAs.
///
/// The main idea behind remapping state IDs is that DFAs often need to check
/// if a certain state is a "special" state of some kind (like a match state)
/// during a search. Since this is extremely perf critical code, we want this
/// check to be as fast as possible. Partitioning state IDs, for example, into
/// "non-match" and "match" states means one can tell if a state is a match
/// state via a simple comparison of the state ID.
///
/// The issue is that during the DFA construction process, it's not
/// particularly easy to partition the states. Instead, the simplest thing is
/// to often just do a pass over all of the states and shuffle them into their
/// desired partitionings. To do that, we need a mechanism for swapping states.
/// Hence, this abstraction.
///
/// Normally, for such little code, I would just duplicate it. But this is a
/// key optimization and the implementation is a bit subtle. So the abstraction
/// is basically a ham-fisted attempt at DRY. The only place we use this is in
/// the dense and one-pass DFAs.
///
/// See also src/dfa/special.rs for a more detailed explanation of how dense
/// DFAs are partitioned.
pub(super) trait Remappable: core::fmt::Debug {
/// Return the total number of states.
fn state_len(&self) -> usize;
/// Return the power-of-2 exponent that yields the stride. The pertinent
/// laws here are, where N=stride2: 2^N=stride and len(alphabet) <= stride.
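///
/// For example (an illustration not in the original comment): an alphabet
/// of 200 equivalence classes can use a stride of 256, i.e., stride2 = 8,
/// since 2^8 = 256 and 200 <= 256.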
fn stride2(&self) -> usize;
/// Swap the states pointed to by the given IDs. The underlying finite
/// state machine should be mutated such that all of the transitions in
/// `id1` are now in the memory region where the transitions for `id2`
/// were, and all of the transitions in `id2` are now in the memory region
/// where the transitions for `id1` were.
///
/// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
///
/// It is expected that, after calling this, the underlying value will be
/// left in an inconsistent state, since any other transitions pointing to,
/// e.g., `id1` need to be updated to point to `id2`, since that's where
/// `id1` moved to.
///
/// In order to "fix" the underlying inconsistent state, a `Remapper`
/// should be used to guarantee that `remap` is called at the appropriate
/// time.
fn swap_states(&mut self, id1: StateID, id2: StateID);
/// This must remap every single state ID in the underlying value according
/// to the function given. For example, in a DFA, this should remap every
/// transition and every starting state ID.
fn remap(&mut self, map: impl Fn(StateID) -> StateID);
}
/// Remapper is an abstraction that manages the remapping of state IDs in a
/// finite state machine. This is useful when one wants to shuffle states into
/// different positions in the machine.
///
/// One of the key complexities this manages is the ability to correctly move
/// one state multiple times.
///
/// Once shuffling is complete, `remap` must be called, which will rewrite
/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
/// will almost certainly result in a corrupt machine.
#[derive(Debug)]
pub(super) struct Remapper {
/// A map from the index of a state to its pre-multiplied identifier.
///
/// When a state is swapped with another, then their corresponding
/// locations in this map are also swapped. Thus, its new position will
/// still point to its old pre-multiplied StateID.
///
/// While there is a bit more to it, this then allows us to rewrite the
/// state IDs in a DFA's transition table in a single pass. This is done
/// by iterating over every ID in this map, then iterating over each
/// transition for the state at that ID and re-mapping the transition from
/// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
/// in this map where `old_id` *started*, and set it to where it ended up
/// after all swaps have been completed.
map: Vec<StateID>,
/// A mapper from state index to state ID (and back).
idxmap: IndexMapper,
}
impl Remapper {
/// Create a new remapper from the given remappable implementation. The
/// remapper can then be used to swap states. The remappable value given
/// here must be the same one given to `swap` and `remap`.
pub(super) fn new(r: &impl Remappable) -> Remapper {
let idxmap = IndexMapper { stride2: r.stride2() };
let map = (0..r.state_len()).map(|i| idxmap.to_state_id(i)).collect();
Remapper { map, idxmap }
}
/// Swap two states. Once this is called, callers must follow through to
/// call `remap`, or else it's possible for the underlying remappable
/// value to be in a corrupt state.
pub(super) fn swap(
&mut self,
r: &mut impl Remappable,
id1: StateID,
id2: StateID,
) {
if id1 == id2 {
return;
}
r.swap_states(id1, id2);
self.map.swap(self.idxmap.to_index(id1), self.idxmap.to_index(id2));
}
/// Complete the remapping process by rewriting all state IDs in the
/// remappable value according to the swaps performed.
pub(super) fn remap(mut self, r: &mut impl Remappable) {
// Update the map to account for states that have been swapped
// multiple times. For example, if (A, C) and (C, G) are swapped, then
// transitions previously pointing to A should now point to G. But if
// we don't update our map, they will erroneously be set to C. All we
// do is follow the swaps in our map until we see our original state
// ID.
//
// The intuition here is to think about how changes are made to the
// map: only through pairwise swaps. That means that starting at any
// given state, it is always possible to find the loop back to that
// state by following the swaps represented in the map (which might be
// 0 swaps).
//
// We are also careful to clone the map before starting in order to
// freeze it. We use the frozen map to find our loops, since we need to
// update our map as well. Without freezing it, our updates could break
// the loops referenced above and produce incorrect results.
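//
// Illustrative trace (not from the original comments): with states at
// indices 0..4 named A, B, C, D, swapping (A, C) and then (C, D) leaves
// the map as [C, B, D, A]. Starting at index 0 (A's original slot), we
// follow C -> D until oldmap[to_index(D)] == A, so map[0] becomes D:
// transitions that used to point at A now point at D, which is the slot
// where the original A ended up after both swaps.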
let oldmap = self.map.clone();
for i in 0..r.state_len() {
let cur_id = self.idxmap.to_state_id(i);
let mut new_id = oldmap[i];
if cur_id == new_id {
continue;
}
loop {
let id = oldmap[self.idxmap.to_index(new_id)];
if cur_id == id {
self.map[i] = new_id;
break;
}
new_id = id;
}
}
r.remap(|next| self.map[self.idxmap.to_index(next)]);
}
}
/// A simple type for mapping between state indices and state IDs.
///
/// The reason why this exists is because state IDs are "premultiplied." That
/// is, in order to get to the transitions for a particular state, one need
/// only use the state ID as-is, instead of having to multiply it by the
/// transition table's stride.
///
/// The downside of this is that it's inconvenient to map between state IDs
/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
/// `0`, `0+stride`, `0+2*stride`, `0+3*stride`, etc., instead of `0`, `1`,
/// `2`, `3`, etc.
///
/// Since our state IDs are premultiplied, we can convert back-and-forth
/// between IDs and indices by simply unmultiplying the IDs and multiplying the
/// indices.
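///
/// For example (an illustration not in the original comment): with stride2 = 9
/// the stride is 512, so the state at index 3 has the premultiplied ID
/// 3 << 9 = 1536, and 1536 >> 9 = 3 recovers the index.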
#[derive(Debug)]
struct IndexMapper {
/// The power of 2 corresponding to the stride of the corresponding
/// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
/// stride2' pre-multiplies an index to an ID.
stride2: usize,
}
impl IndexMapper {
/// Convert a state ID to a state index.
fn to_index(&self, id: StateID) -> usize {
id.as_usize() >> self.stride2
}
/// Convert a state index to a state ID.
fn to_state_id(&self, index: usize) -> StateID {
// CORRECTNESS: If the given index is not valid, then it is not
// required for this to panic or return a valid state ID. We'll "just"
// wind up with panics or silent logic errors at some other point.
StateID::new_unchecked(index << self.stride2)
}
}
#[cfg(feature = "dfa-build")]
mod dense {
use crate::{dfa::dense::OwnedDFA, util::primitives::StateID};
use super::Remappable;
impl Remappable for OwnedDFA {
fn state_len(&self) -> usize {
OwnedDFA::state_len(self)
}
fn stride2(&self) -> usize {
OwnedDFA::stride2(self)
}
fn swap_states(&mut self, id1: StateID, id2: StateID) {
OwnedDFA::swap_states(self, id1, id2)
}
fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
OwnedDFA::remap(self, map)
}
}
}
#[cfg(feature = "dfa-onepass")]
mod onepass {
use crate::{dfa::onepass::DFA, util::primitives::StateID};
use super::Remappable;
impl Remappable for DFA {
fn state_len(&self) -> usize {
DFA::state_len(self)
}
fn stride2(&self) -> usize {
// We don't do pre-multiplication for the one-pass DFA, so
// returning 0 has the effect of making state IDs and state indices
// equivalent.
0
}
fn swap_states(&mut self, id1: StateID, id2: StateID) {
DFA::swap_states(self, id1, id2)
}
fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
DFA::remap(self, map)
}
}
}

644
vendor/regex-automata/src/dfa/search.rs vendored Normal file
View File

@@ -0,0 +1,644 @@
use crate::{
dfa::{
accel,
automaton::{Automaton, OverlappingState},
},
util::{
prefilter::Prefilter,
primitives::StateID,
search::{Anchored, HalfMatch, Input, Span},
},
MatchError,
};
#[inline(never)]
pub fn find_fwd<A: Automaton + ?Sized>(
dfa: &A,
input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
if input.is_done() {
return Ok(None);
}
let pre = if input.get_anchored().is_anchored() {
None
} else {
dfa.get_prefilter()
};
// Searching with a pattern ID is always anchored, so we should never use
// a prefilter.
if pre.is_some() {
if input.get_earliest() {
find_fwd_imp(dfa, input, pre, true)
} else {
find_fwd_imp(dfa, input, pre, false)
}
} else {
if input.get_earliest() {
find_fwd_imp(dfa, input, None, true)
} else {
find_fwd_imp(dfa, input, None, false)
}
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_fwd_imp<A: Automaton + ?Sized>(
dfa: &A,
input: &Input<'_>,
pre: Option<&'_ Prefilter>,
earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
// See 'prefilter_restart' docs for explanation.
let universal_start = dfa.universal_start_state(Anchored::No).is_some();
let mut mat = None;
let mut sid = init_fwd(dfa, input)?;
let mut at = input.start();
// This could just be a closure, but then I think it would be unsound
// because it would need to be safe to invoke. This way, the lack of safety
// is clearer in the code below.
macro_rules! next_unchecked {
($sid:expr, $at:expr) => {{
let byte = *input.haystack().get_unchecked($at);
dfa.next_state_unchecked($sid, byte)
}};
}
if let Some(ref pre) = pre {
let span = Span::from(at..input.end());
// If a prefilter doesn't report false positives, then we don't need to
// touch the DFA at all. However, since all matches include the pattern
// ID, and the prefilter infrastructure doesn't report pattern IDs, we
// limit this optimization to cases where there is exactly one pattern.
// In that case, any match must be the 0th pattern.
match pre.find(input.haystack(), span) {
None => return Ok(mat),
Some(ref span) => {
at = span.start;
if !universal_start {
sid = prefilter_restart(dfa, &input, at)?;
}
}
}
}
while at < input.end() {
// SAFETY: There are two safety invariants we need to uphold here in
// the loops below: that 'sid' and 'prev_sid' are valid state IDs
// for this DFA, and that 'at' is a valid index into 'haystack'.
// For the former, we rely on the invariant that next_state* and
// start_state_forward always returns a valid state ID (given a valid
// state ID in the former case). For the latter safety invariant, we
// always guard unchecked access with a check that 'at' is less than
// 'end', where 'end <= haystack.len()'. In the unrolled loop below, we
// ensure that 'at' is always in bounds.
//
// PERF: See a similar comment in src/hybrid/search.rs that justifies
// this extra work to make the search loop fast. The same reasoning and
// benchmarks apply here.
let mut prev_sid;
while at < input.end() {
prev_sid = unsafe { next_unchecked!(sid, at) };
if dfa.is_special_state(prev_sid) || at + 3 >= input.end() {
core::mem::swap(&mut prev_sid, &mut sid);
break;
}
at += 1;
sid = unsafe { next_unchecked!(prev_sid, at) };
if dfa.is_special_state(sid) {
break;
}
at += 1;
prev_sid = unsafe { next_unchecked!(sid, at) };
if dfa.is_special_state(prev_sid) {
core::mem::swap(&mut prev_sid, &mut sid);
break;
}
at += 1;
sid = unsafe { next_unchecked!(prev_sid, at) };
if dfa.is_special_state(sid) {
break;
}
at += 1;
}
if dfa.is_special_state(sid) {
if dfa.is_start_state(sid) {
if let Some(ref pre) = pre {
let span = Span::from(at..input.end());
match pre.find(input.haystack(), span) {
None => return Ok(mat),
Some(ref span) => {
// We want to skip any update to 'at' below
// at the end of this iteration and just
// jump immediately back to the next state
// transition at the leading position of the
// candidate match.
//
// ... but only if we actually made progress
// with our prefilter, otherwise if the start
// state has a self-loop, we can get stuck.
if span.start > at {
at = span.start;
if !universal_start {
sid = prefilter_restart(dfa, &input, at)?;
}
continue;
}
}
}
} else if dfa.is_accel_state(sid) {
let needles = dfa.accelerator(sid);
at = accel::find_fwd(needles, input.haystack(), at + 1)
.unwrap_or(input.end());
continue;
}
} else if dfa.is_match_state(sid) {
let pattern = dfa.match_pattern(sid, 0);
mat = Some(HalfMatch::new(pattern, at));
if earliest {
return Ok(mat);
}
if dfa.is_accel_state(sid) {
let needles = dfa.accelerator(sid);
at = accel::find_fwd(needles, input.haystack(), at + 1)
.unwrap_or(input.end());
continue;
}
} else if dfa.is_accel_state(sid) {
let needles = dfa.accelerator(sid);
at = accel::find_fwd(needles, input.haystack(), at + 1)
.unwrap_or(input.end());
continue;
} else if dfa.is_dead_state(sid) {
return Ok(mat);
} else {
// It's important that this is a debug_assert, since this can
// actually be tripped even if DFA::from_bytes succeeds and
// returns a supposedly valid DFA.
debug_assert!(dfa.is_quit_state(sid));
return Err(MatchError::quit(input.haystack()[at], at));
}
}
at += 1;
}
eoi_fwd(dfa, input, &mut sid, &mut mat)?;
Ok(mat)
}
#[inline(never)]
pub fn find_rev<A: Automaton + ?Sized>(
dfa: &A,
input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
if input.is_done() {
return Ok(None);
}
if input.get_earliest() {
find_rev_imp(dfa, input, true)
} else {
find_rev_imp(dfa, input, false)
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_rev_imp<A: Automaton + ?Sized>(
dfa: &A,
input: &Input<'_>,
earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
let mut mat = None;
let mut sid = init_rev(dfa, input)?;
// In reverse search, the loop below can't handle the case of searching an
// empty slice. Ideally we could write something congruent to the forward
// search, i.e., 'while at >= start', but 'start' might be 0. Since we use
// an unsigned offset, 'at >= 0' is trivially always true. We could avoid
// this extra case handling by using a signed offset, but Rust makes it
// annoying to do. So... We just handle the empty case separately.
if input.start() == input.end() {
eoi_rev(dfa, input, &mut sid, &mut mat)?;
return Ok(mat);
}
let mut at = input.end() - 1;
macro_rules! next_unchecked {
($sid:expr, $at:expr) => {{
let byte = *input.haystack().get_unchecked($at);
dfa.next_state_unchecked($sid, byte)
}};
}
loop {
// SAFETY: See comments in 'find_fwd' for a safety argument.
let mut prev_sid;
while at >= input.start() {
prev_sid = unsafe { next_unchecked!(sid, at) };
if dfa.is_special_state(prev_sid)
|| at <= input.start().saturating_add(3)
{
core::mem::swap(&mut prev_sid, &mut sid);
break;
}
at -= 1;
sid = unsafe { next_unchecked!(prev_sid, at) };
if dfa.is_special_state(sid) {
break;
}
at -= 1;
prev_sid = unsafe { next_unchecked!(sid, at) };
if dfa.is_special_state(prev_sid) {
core::mem::swap(&mut prev_sid, &mut sid);
break;
}
at -= 1;
sid = unsafe { next_unchecked!(prev_sid, at) };
if dfa.is_special_state(sid) {
break;
}
at -= 1;
}
if dfa.is_special_state(sid) {
if dfa.is_start_state(sid) {
if dfa.is_accel_state(sid) {
let needles = dfa.accelerator(sid);
at = accel::find_rev(needles, input.haystack(), at)
.map(|i| i + 1)
.unwrap_or(input.start());
}
} else if dfa.is_match_state(sid) {
let pattern = dfa.match_pattern(sid, 0);
// Since reverse searches report the beginning of a match
// and the beginning is inclusive (not exclusive like the
// end of a match), we add 1 to make it inclusive.
mat = Some(HalfMatch::new(pattern, at + 1));
if earliest {
return Ok(mat);
}
if dfa.is_accel_state(sid) {
let needles = dfa.accelerator(sid);
at = accel::find_rev(needles, input.haystack(), at)
.map(|i| i + 1)
.unwrap_or(input.start());
}
} else if dfa.is_accel_state(sid) {
let needles = dfa.accelerator(sid);
// If the accelerator returns nothing, why don't we quit the
// search? Well, if the accelerator doesn't find anything, that
// doesn't mean we don't have a match. It just means that we
// can't leave the current state given one of the 255 possible
// byte values. However, there might be an EOI transition. So
// we set 'at' to the start of the haystack, which will cause
// this loop to stop and fall down into the EOI transition.
at = accel::find_rev(needles, input.haystack(), at)
.map(|i| i + 1)
.unwrap_or(input.start());
} else if dfa.is_dead_state(sid) {
return Ok(mat);
} else {
return Err(MatchError::quit(input.haystack()[at], at));
}
}
if at == input.start() {
break;
}
at -= 1;
}
eoi_rev(dfa, input, &mut sid, &mut mat)?;
Ok(mat)
}
#[inline(never)]
pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
dfa: &A,
input: &Input<'_>,
state: &mut OverlappingState,
) -> Result<(), MatchError> {
state.mat = None;
if input.is_done() {
return Ok(());
}
let pre = if input.get_anchored().is_anchored() {
None
} else {
dfa.get_prefilter()
};
if pre.is_some() {
find_overlapping_fwd_imp(dfa, input, pre, state)
} else {
find_overlapping_fwd_imp(dfa, input, None, state)
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
dfa: &A,
input: &Input<'_>,
pre: Option<&'_ Prefilter>,
state: &mut OverlappingState,
) -> Result<(), MatchError> {
// See 'prefilter_restart' docs for explanation.
let universal_start = dfa.universal_start_state(Anchored::No).is_some();
let mut sid = match state.id {
None => {
state.at = input.start();
init_fwd(dfa, input)?
}
Some(sid) => {
if let Some(match_index) = state.next_match_index {
let match_len = dfa.match_len(sid);
if match_index < match_len {
state.next_match_index = Some(match_index + 1);
let pattern = dfa.match_pattern(sid, match_index);
state.mat = Some(HalfMatch::new(pattern, state.at));
return Ok(());
}
}
// Once we've reported all matches at a given position, we need to
// advance the search to the next position.
state.at += 1;
if state.at > input.end() {
return Ok(());
}
sid
}
};
// NOTE: We don't optimize the crap out of this routine primarily because
// it seems like most find_overlapping searches will have higher match
// counts, and thus, throughput is perhaps not as important. But if you
// have a use case for something faster, feel free to file an issue.
while state.at < input.end() {
sid = dfa.next_state(sid, input.haystack()[state.at]);
if dfa.is_special_state(sid) {
state.id = Some(sid);
if dfa.is_start_state(sid) {
if let Some(ref pre) = pre {
let span = Span::from(state.at..input.end());
match pre.find(input.haystack(), span) {
None => return Ok(()),
Some(ref span) => {
if span.start > state.at {
state.at = span.start;
if !universal_start {
sid = prefilter_restart(
dfa, &input, state.at,
)?;
}
continue;
}
}
}
} else if dfa.is_accel_state(sid) {
let needles = dfa.accelerator(sid);
state.at = accel::find_fwd(
needles,
input.haystack(),
state.at + 1,
)
.unwrap_or(input.end());
continue;
}
} else if dfa.is_match_state(sid) {
state.next_match_index = Some(1);
let pattern = dfa.match_pattern(sid, 0);
state.mat = Some(HalfMatch::new(pattern, state.at));
return Ok(());
} else if dfa.is_accel_state(sid) {
let needles = dfa.accelerator(sid);
// If the accelerator returns nothing, why don't we quit the
// search? Well, if the accelerator doesn't find anything, that
// doesn't mean we don't have a match. It just means that we
// can't leave the current state given one of the 255 possible
// byte values. However, there might be an EOI transition. So
// we set 'at' to the end of the haystack, which will cause
// this loop to stop and fall down into the EOI transition.
state.at =
accel::find_fwd(needles, input.haystack(), state.at + 1)
.unwrap_or(input.end());
continue;
} else if dfa.is_dead_state(sid) {
return Ok(());
} else {
return Err(MatchError::quit(
input.haystack()[state.at],
state.at,
));
}
}
state.at += 1;
}
let result = eoi_fwd(dfa, input, &mut sid, &mut state.mat);
state.id = Some(sid);
if state.mat.is_some() {
// '1' is always correct here since if we get to this point, this
// always corresponds to the first (index '0') match discovered at
// this position. So the next match to report at this position (if
// it exists) is at index '1'.
state.next_match_index = Some(1);
}
result
}
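// NOTE: For context, here is a rough caller-side sketch of how the forward
// overlapping routine above is typically driven through the crate's public
// API. The patterns and haystack are made up for illustration; the sketch
// assumes 'Automaton::try_search_overlapping_fwd', 'OverlappingState::start'
// and 'OverlappingState::get_match' behave as documented.
//
//     use regex_automata::{
//         dfa::{dense, Automaton, OverlappingState},
//         Input, MatchKind,
//     };
//
//     // Overlapping searches generally want a DFA compiled with 'All'
//     // match semantics so that no match states are dropped.
//     let dfa = dense::Builder::new()
//         .configure(dense::Config::new().match_kind(MatchKind::All))
//         .build_many(&[r"@foo", r"foo"])?;
//     let input = Input::new("@foo");
//     let mut state = OverlappingState::start();
//     let mut ends = vec![];
//     loop {
//         // Each call resumes where the previous one left off and reports
//         // at most one match via 'state.get_match()'.
//         dfa.try_search_overlapping_fwd(&input, &mut state)?;
//         match state.get_match() {
//             None => break,
//             Some(m) => ends.push(m.offset()),
//         }
//     }
//     // Both patterns match and both matches end at offset 4.
//     assert_eq!(ends, vec![4, 4]);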
#[inline(never)]
pub(crate) fn find_overlapping_rev<A: Automaton + ?Sized>(
dfa: &A,
input: &Input<'_>,
state: &mut OverlappingState,
) -> Result<(), MatchError> {
state.mat = None;
if input.is_done() {
return Ok(());
}
let mut sid = match state.id {
None => {
let sid = init_rev(dfa, input)?;
state.id = Some(sid);
if input.start() == input.end() {
state.rev_eoi = true;
} else {
state.at = input.end() - 1;
}
sid
}
Some(sid) => {
if let Some(match_index) = state.next_match_index {
let match_len = dfa.match_len(sid);
if match_index < match_len {
state.next_match_index = Some(match_index + 1);
let pattern = dfa.match_pattern(sid, match_index);
state.mat = Some(HalfMatch::new(pattern, state.at));
return Ok(());
}
}
// Once we've reported all matches at a given position, we need
// to advance the search to the next position. However, if we've
// already followed the EOI transition, then we know we're done
// with the search and there cannot be any more matches to report.
if state.rev_eoi {
return Ok(());
} else if state.at == input.start() {
// At this point, we should follow the EOI transition. This
// will cause us the skip the main loop below and fall through
// to the final 'eoi_rev' transition.
state.rev_eoi = true;
} else {
// We haven't hit the end of the search yet, so move on.
state.at -= 1;
}
sid
}
};
while !state.rev_eoi {
sid = dfa.next_state(sid, input.haystack()[state.at]);
if dfa.is_special_state(sid) {
state.id = Some(sid);
if dfa.is_start_state(sid) {
if dfa.is_accel_state(sid) {
let needles = dfa.accelerator(sid);
state.at =
accel::find_rev(needles, input.haystack(), state.at)
.map(|i| i + 1)
.unwrap_or(input.start());
}
} else if dfa.is_match_state(sid) {
state.next_match_index = Some(1);
let pattern = dfa.match_pattern(sid, 0);
state.mat = Some(HalfMatch::new(pattern, state.at + 1));
return Ok(());
} else if dfa.is_accel_state(sid) {
let needles = dfa.accelerator(sid);
// If the accelerator returns nothing, why don't we quit the
// search? Well, if the accelerator doesn't find anything, that
// doesn't mean we don't have a match. It just means that we
// can't leave the current state given one of the 255 possible
// byte values. However, there might be an EOI transition. So
// we set 'at' to the start of the haystack, which will cause
// this loop to stop and fall down into the EOI transition.
state.at =
accel::find_rev(needles, input.haystack(), state.at)
.map(|i| i + 1)
.unwrap_or(input.start());
} else if dfa.is_dead_state(sid) {
return Ok(());
} else {
return Err(MatchError::quit(
input.haystack()[state.at],
state.at,
));
}
}
if state.at == input.start() {
break;
}
state.at -= 1;
}
let result = eoi_rev(dfa, input, &mut sid, &mut state.mat);
state.rev_eoi = true;
state.id = Some(sid);
if state.mat.is_some() {
// '1' is always correct here since if we get to this point, this
// always corresponds to the first (index '0') match discovered at
// this position. So the next match to report at this position (if
// it exists) is at index '1'.
state.next_match_index = Some(1);
}
result
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_fwd<A: Automaton + ?Sized>(
dfa: &A,
input: &Input<'_>,
) -> Result<StateID, MatchError> {
let sid = dfa.start_state_forward(input)?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
debug_assert!(!dfa.is_match_state(sid));
Ok(sid)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_rev<A: Automaton + ?Sized>(
dfa: &A,
input: &Input<'_>,
) -> Result<StateID, MatchError> {
let sid = dfa.start_state_reverse(input)?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
debug_assert!(!dfa.is_match_state(sid));
Ok(sid)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_fwd<A: Automaton + ?Sized>(
dfa: &A,
input: &Input<'_>,
sid: &mut StateID,
mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
let sp = input.get_span();
match input.haystack().get(sp.end) {
Some(&b) => {
*sid = dfa.next_state(*sid, b);
if dfa.is_match_state(*sid) {
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.end));
} else if dfa.is_quit_state(*sid) {
return Err(MatchError::quit(b, sp.end));
}
}
None => {
*sid = dfa.next_eoi_state(*sid);
if dfa.is_match_state(*sid) {
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, input.haystack().len()));
}
}
}
Ok(())
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_rev<A: Automaton + ?Sized>(
dfa: &A,
input: &Input<'_>,
sid: &mut StateID,
mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
let sp = input.get_span();
if sp.start > 0 {
let byte = input.haystack()[sp.start - 1];
*sid = dfa.next_state(*sid, byte);
if dfa.is_match_state(*sid) {
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.start));
} else if dfa.is_quit_state(*sid) {
return Err(MatchError::quit(byte, sp.start - 1));
}
} else {
*sid = dfa.next_eoi_state(*sid);
if dfa.is_match_state(*sid) {
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, 0));
}
}
Ok(())
}
/// Re-compute the starting state that a DFA should be in after finding a
/// prefilter candidate match at the position `at`.
///
/// The function with the same name has a bit more docs in hybrid/search.rs.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn prefilter_restart<A: Automaton + ?Sized>(
dfa: &A,
input: &Input<'_>,
at: usize,
) -> Result<StateID, MatchError> {
let mut input = input.clone();
input.set_start(at);
init_fwd(dfa, &input)
}

2649
vendor/regex-automata/src/dfa/sparse.rs vendored Normal file

File diff suppressed because it is too large

494
vendor/regex-automata/src/dfa/special.rs vendored Normal file
View File

@@ -0,0 +1,494 @@
use crate::{
dfa::DEAD,
util::{
primitives::StateID,
wire::{self, DeserializeError, Endian, SerializeError},
},
};
macro_rules! err {
($msg:expr) => {
return Err(DeserializeError::generic($msg));
};
}
// Special represents the identifiers in a DFA that correspond to "special"
// states. If a state is one or more of the following, then it is considered
// special:
//
// * dead - A non-matching state where all outgoing transitions lead back to
// itself. There is only one of these, regardless of whether minimization
// has run. The dead state always has an ID of 0. i.e., It is always the
// first state in a DFA.
// * quit - A state that is entered whenever a byte is seen that should cause
// a DFA to give up and stop searching. This results in a MatchError::quit
// error being returned at search time. The default configuration for a DFA
// has no quit bytes, which means this state is unreachable by default,
// although it is always present for reasons of implementation simplicity.
// This state is only reachable when the caller configures the DFA to quit
// on certain bytes. There is always exactly one of these states and it
// is always the second state. (Its actual ID depends on the size of the
// alphabet in dense DFAs, since state IDs are premultiplied in order to
// allow them to be used directly as indices into the transition table.)
// * match - An accepting state, i.e., indicative of a match. There may be
// zero or more of these states.
// * accelerated - A state where all of its outgoing transitions, except a
// few, loop back to itself. These states are candidates for acceleration
// via memchr during search. There may be zero or more of these states.
// * start - A non-matching state that indicates where the automaton should
// start during a search. There is always at least one starting state and
// all are guaranteed to be non-match states. (A start state cannot be a
// match state because the DFAs in this crate delay all matches by one byte.
// So every search that finds a match must move through one transition to
// some other match state, even when searching an empty string.)
//
// These are not mutually exclusive categories. Namely, the following
// overlapping can occur:
//
// * {dead, start} - If a DFA can never lead to a match and it is minimized,
// then it will typically compile to something where all starting IDs point
// to the DFA's dead state.
// * {match, accelerated} - It is possible for a match state to have the
// majority of its transitions loop back to itself, which means it's
// possible for a match state to be accelerated.
// * {start, accelerated} - Similarly, it is possible for a start state to be
// accelerated. Note that it is possible for an accelerated state to be
// neither a match nor a start state. Also note that just because both match
// and start states overlap with accelerated states does not mean that
// match and start states overlap with each other. In fact, they are
// guaranteed not to overlap.
//
// As a special mention, every DFA always has a dead and a quit state, even
// though from the perspective of the DFA, they are equivalent. (Indeed,
// minimization special cases them to ensure they don't get merged.) The
// purpose of keeping them distinct is to use the quit state as a sentinel to
// distinguish between whether a search finished successfully without finding
// anything or whether it gave up before finishing.
//
// So the main problem we want to solve here is the *fast* detection of whether
// a state is special or not. And we also want to do this while storing as
// little extra data as possible. AND we want to be able to quickly determine
// which categories a state falls into above if it is special.
//
// We achieve this by essentially shuffling all special states to the beginning
// of a DFA. That is, all special states appear before every other non-special
// state. By representing special states this way, we can determine whether a
// state is special or not by a single comparison, where special.max is the
// identifier of the last special state in the DFA:
//
// if current_state <= special.max:
// ... do something with special state
//
// The only thing left to do is to determine what kind of special state
// it is. Because what we do next depends on that. Since special states
// are typically rare, we can afford to do a bit more extra work, but we'd
// still like this to be as fast as possible. The trick we employ here is to
// continue shuffling states even within the special state range. Such that
// one contiguous region corresponds to match states, another for start states
// and then an overlapping range for accelerated states. At a high level, our
// special state detection might look like this (for leftmost searching, where
// we continue searching even after seeing a match):
//
// byte = input[offset]
// current_state = next_state(current_state, byte)
// offset += 1
// if current_state <= special.max:
// if current_state == 0:
// # We can never leave a dead state, so this always marks the
// # end of our search.
// return last_match
// if current_state == special.quit_id:
// # A quit state means we give up. If the DFA has no quit state,
// # then special.quit_id == 0 == dead, which is handled by the
// # conditional above.
// return Err(MatchError::quit { byte, offset: offset - 1 })
// if special.min_match <= current_state <= special.max_match:
// last_match = Some(offset)
// if special.min_accel <= current_state <= special.max_accel:
// offset = accelerate(input, offset)
// last_match = Some(offset)
// elif special.min_start <= current_state <= special.max_start:
// offset = prefilter.find(input, offset)
// if special.min_accel <= current_state <= special.max_accel:
// offset = accelerate(input, offset)
// elif special.min_accel <= current_state <= special.max_accel:
// offset = accelerate(input, offset)
//
// There are some small details left out of the logic above. For example,
// in order to accelerate a state, we need to know which bytes to search for.
// This in turn implies some extra data we need to store in the DFA. To keep
// things compact, we would ideally only store
//
// N = special.max_accel - special.min_accel + 1
//
// items. But state IDs are premultiplied, which means they are not contiguous.
// So in order to take a state ID and index an array of accelerated structures,
// we need to do:
//
// i = (state_id - special.min_accel) / stride
//
// (N.B. 'stride' is always a power of 2, so the above can be implemented via
// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in
// 2^x=stride.)
//
// Moreover, some of these specialty categories may be empty. For example,
// DFAs are not required to have any match states or any accelerated states.
// In that case, the lower and upper bounds are both set to 0 (the dead state
// ID) and the first `current_state == 0` check subsumes cases where the
// ranges are empty.
//
// Loop unrolling, if applicable, has also been left out of the logic above.
//
// Graphically, the ranges look like this, where asterisks indicate ranges
// that can be empty. Each 'x' is a state.
//
// quit
// dead|
// ||
// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
// | | | | start | |
// | |-------------| |-------| |
// | match* | | | |
// | | | | |
// | |----------| | |
// | accel* | |
// | | |
// | | |
// |----------------------------|------------------------
// special non-special*
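//
// As a hedged illustration of the indexing described above (the names and
// numbers below are hypothetical and not part of this module's API), the
// premultiplied-ID-to-accelerator-index computation is just a subtraction
// followed by a shift:
//
//     // 'stride2' is log2 of the stride, so dividing by the stride becomes
//     // a right shift.
//     fn accel_index(state_id: usize, min_accel: usize, stride2: usize) -> usize {
//         (state_id - min_accel) >> stride2
//     }
//
//     // With a stride of 512 (stride2 = 9) and min_accel = 2048, the
//     // accelerated states 2048, 2560 and 3072 map to indices 0, 1 and 2.
//     assert_eq!(accel_index(2560, 2048, 9), 1);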
#[derive(Clone, Copy, Debug)]
pub(crate) struct Special {
/// The identifier of the last special state in a DFA. A state is special
/// if and only if its identifier is less than or equal to `max`.
pub(crate) max: StateID,
/// The identifier of the quit state in a DFA. (There is no analogous field
/// for the dead state since the dead state's ID is always zero, regardless
/// of state ID size.)
pub(crate) quit_id: StateID,
/// The identifier of the first match state.
pub(crate) min_match: StateID,
/// The identifier of the last match state.
pub(crate) max_match: StateID,
/// The identifier of the first accelerated state.
pub(crate) min_accel: StateID,
/// The identifier of the last accelerated state.
pub(crate) max_accel: StateID,
/// The identifier of the first start state.
pub(crate) min_start: StateID,
/// The identifier of the last start state.
pub(crate) max_start: StateID,
}
impl Special {
/// Creates a new set of special ranges for a DFA. All ranges are initially
/// set to only contain the dead state. This is interpreted as an empty
/// range.
#[cfg(feature = "dfa-build")]
pub(crate) fn new() -> Special {
Special {
max: DEAD,
quit_id: DEAD,
min_match: DEAD,
max_match: DEAD,
min_accel: DEAD,
max_accel: DEAD,
min_start: DEAD,
max_start: DEAD,
}
}
/// Remaps all of the special state identifiers using the function given.
#[cfg(feature = "dfa-build")]
pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
Special {
max: map(self.max),
quit_id: map(self.quit_id),
min_match: map(self.min_match),
max_match: map(self.max_match),
min_accel: map(self.min_accel),
max_accel: map(self.max_accel),
min_start: map(self.min_start),
max_start: map(self.max_start),
}
}
/// Deserialize the given bytes into special state ranges. If the slice
/// given is not big enough, then this returns an error. Similarly, if
/// any of the expected invariants around special state ranges aren't
/// upheld, an error is returned. Note that this does not guarantee that
/// the information returned is correct.
///
/// Upon success, this returns the number of bytes read in addition to the
/// special state IDs themselves.
pub(crate) fn from_bytes(
mut slice: &[u8],
) -> Result<(Special, usize), DeserializeError> {
wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;
let mut nread = 0;
let mut read_id = |what| -> Result<StateID, DeserializeError> {
let (id, nr) = wire::try_read_state_id(slice, what)?;
nread += nr;
slice = &slice[StateID::SIZE..];
Ok(id)
};
let max = read_id("special max id")?;
let quit_id = read_id("special quit id")?;
let min_match = read_id("special min match id")?;
let max_match = read_id("special max match id")?;
let min_accel = read_id("special min accel id")?;
let max_accel = read_id("special max accel id")?;
let min_start = read_id("special min start id")?;
let max_start = read_id("special max start id")?;
let special = Special {
max,
quit_id,
min_match,
max_match,
min_accel,
max_accel,
min_start,
max_start,
};
special.validate()?;
assert_eq!(nread, special.write_to_len());
Ok((special, nread))
}
/// Validate that the information describing special states satisfies
/// all known invariants.
pub(crate) fn validate(&self) -> Result<(), DeserializeError> {
// Check that both ends of the range are DEAD or neither are.
if self.min_match == DEAD && self.max_match != DEAD {
err!("min_match is DEAD, but max_match is not");
}
if self.min_match != DEAD && self.max_match == DEAD {
err!("max_match is DEAD, but min_match is not");
}
if self.min_accel == DEAD && self.max_accel != DEAD {
err!("min_accel is DEAD, but max_accel is not");
}
if self.min_accel != DEAD && self.max_accel == DEAD {
err!("max_accel is DEAD, but min_accel is not");
}
if self.min_start == DEAD && self.max_start != DEAD {
err!("min_start is DEAD, but max_start is not");
}
if self.min_start != DEAD && self.max_start == DEAD {
err!("max_start is DEAD, but min_start is not");
}
// Check that ranges are well formed.
if self.min_match > self.max_match {
err!("min_match should not be greater than max_match");
}
if self.min_accel > self.max_accel {
err!("min_accel should not be greater than max_accel");
}
if self.min_start > self.max_start {
err!("min_start should not be greater than max_start");
}
// Check that ranges are ordered with respect to one another.
if self.matches() && self.quit_id >= self.min_match {
err!("quit_id should not be greater than min_match");
}
if self.accels() && self.quit_id >= self.min_accel {
err!("quit_id should not be greater than min_accel");
}
if self.starts() && self.quit_id >= self.min_start {
err!("quit_id should not be greater than min_start");
}
if self.matches() && self.accels() && self.min_accel < self.min_match {
err!("min_match should not be greater than min_accel");
}
if self.matches() && self.starts() && self.min_start < self.min_match {
err!("min_match should not be greater than min_start");
}
if self.accels() && self.starts() && self.min_start < self.min_accel {
err!("min_accel should not be greater than min_start");
}
// Check that max is at least as big as everything else.
if self.max < self.quit_id {
err!("quit_id should not be greater than max");
}
if self.max < self.max_match {
err!("max_match should not be greater than max");
}
if self.max < self.max_accel {
err!("max_accel should not be greater than max");
}
if self.max < self.max_start {
err!("max_start should not be greater than max");
}
Ok(())
}
/// Validate that the special state information is compatible with the
/// given state len.
pub(crate) fn validate_state_len(
&self,
len: usize,
stride2: usize,
) -> Result<(), DeserializeError> {
// We assume that 'validate' has already passed, so we know that 'max'
// is truly the max. So all we need to check is that the max state ID
// is less than the state ID len. The max legal value here is len-1,
// which occurs when there are no non-special states.
if (self.max.as_usize() >> stride2) >= len {
err!("max should not be greater than or equal to state length");
}
Ok(())
}
/// Write the IDs and ranges for special states to the given byte buffer.
/// The buffer given must have enough room to store all data, otherwise
/// this will return an error. The number of bytes written is returned
/// on success. The number of bytes written is guaranteed to be a multiple
/// of 8.
pub(crate) fn write_to<E: Endian>(
&self,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
use crate::util::wire::write_state_id as write;
if dst.len() < self.write_to_len() {
return Err(SerializeError::buffer_too_small("special state ids"));
}
let mut nwrite = 0;
nwrite += write::<E>(self.max, &mut dst[nwrite..]);
nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]);
nwrite += write::<E>(self.min_match, &mut dst[nwrite..]);
nwrite += write::<E>(self.max_match, &mut dst[nwrite..]);
nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]);
nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]);
nwrite += write::<E>(self.min_start, &mut dst[nwrite..]);
nwrite += write::<E>(self.max_start, &mut dst[nwrite..]);
assert_eq!(
self.write_to_len(),
nwrite,
"expected to write certain number of bytes",
);
assert_eq!(
nwrite % 8,
0,
"expected to write multiple of 8 bytes for special states",
);
Ok(nwrite)
}
/// Returns the total number of bytes written by `write_to`.
pub(crate) fn write_to_len(&self) -> usize {
8 * StateID::SIZE
}
/// Sets the maximum special state ID based on the current values. This
/// should be used once all possible state IDs are set.
#[cfg(feature = "dfa-build")]
pub(crate) fn set_max(&mut self) {
use core::cmp::max;
self.max = max(
self.quit_id,
max(self.max_match, max(self.max_accel, self.max_start)),
);
}
/// Sets the maximum special state ID such that starting states are not
/// considered "special." This also marks the min/max starting states as
/// DEAD such that 'is_start_state' always returns false, even if the state
/// is actually a starting state.
///
/// This is useful when there is no prefilter set. It will avoid
/// ping-ponging between the hot path in the DFA search code and the start
/// state handling code, which is typically only useful for executing a
/// prefilter.
#[cfg(feature = "dfa-build")]
pub(crate) fn set_no_special_start_states(&mut self) {
use core::cmp::max;
self.max = max(self.quit_id, max(self.max_match, self.max_accel));
self.min_start = DEAD;
self.max_start = DEAD;
}
/// Returns true if and only if the given state ID is a special state.
#[inline]
pub(crate) fn is_special_state(&self, id: StateID) -> bool {
id <= self.max
}
/// Returns true if and only if the given state ID is a dead state.
#[inline]
pub(crate) fn is_dead_state(&self, id: StateID) -> bool {
id == DEAD
}
/// Returns true if and only if the given state ID is a quit state.
#[inline]
pub(crate) fn is_quit_state(&self, id: StateID) -> bool {
!self.is_dead_state(id) && self.quit_id == id
}
/// Returns true if and only if the given state ID is a match state.
#[inline]
pub(crate) fn is_match_state(&self, id: StateID) -> bool {
!self.is_dead_state(id) && self.min_match <= id && id <= self.max_match
}
/// Returns true if and only if the given state ID is an accel state.
#[inline]
pub(crate) fn is_accel_state(&self, id: StateID) -> bool {
!self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel
}
/// Returns true if and only if the given state ID is a start state.
#[inline]
pub(crate) fn is_start_state(&self, id: StateID) -> bool {
!self.is_dead_state(id) && self.min_start <= id && id <= self.max_start
}
/// Returns the total number of match states for a dense table based DFA.
#[inline]
pub(crate) fn match_len(&self, stride: usize) -> usize {
if self.matches() {
(self.max_match.as_usize() - self.min_match.as_usize() + stride)
/ stride
} else {
0
}
}
/// Returns true if and only if there is at least one match state.
#[inline]
pub(crate) fn matches(&self) -> bool {
self.min_match != DEAD
}
/// Returns the total number of accel states.
#[cfg(feature = "dfa-build")]
pub(crate) fn accel_len(&self, stride: usize) -> usize {
if self.accels() {
(self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
/ stride
} else {
0
}
}
/// Returns true if and only if there is at least one accel state.
#[inline]
pub(crate) fn accels(&self) -> bool {
self.min_accel != DEAD
}
/// Returns true if and only if there is at least one start state.
#[inline]
pub(crate) fn starts(&self) -> bool {
self.min_start != DEAD
}
}

74
vendor/regex-automata/src/dfa/start.rs vendored Normal file
View File

@@ -0,0 +1,74 @@
use core::mem::size_of;
use crate::util::wire::{self, DeserializeError, Endian, SerializeError};
/// The kind of anchored starting configurations to support in a DFA.
///
/// Fully compiled DFAs need to be explicitly configured as to which anchored
/// starting configurations to support. The reason for not just supporting
/// everything unconditionally is that it can use more resources (such as
/// memory and build time). The downside of this is that if you try to execute
/// a search using an [`Anchored`](crate::Anchored) mode that is not supported
/// by the DFA, then the search will return an error.
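///
/// # Example
///
/// A minimal sketch (the pattern, haystack and configuration below are
/// illustrative, not prescribed by this type) of a dense DFA that only
/// supports anchored starting configurations:
///
/// ```
/// use regex_automata::{
///     dfa::{dense, Automaton, StartKind},
///     Anchored, Input,
/// };
///
/// let dfa = dense::Builder::new()
///     .configure(dense::Config::new().start_kind(StartKind::Anchored))
///     .build(r"[0-9]+")?;
/// // An anchored search is supported...
/// let input = Input::new("12345").anchored(Anchored::Yes);
/// assert!(dfa.try_search_fwd(&input)?.is_some());
/// // ...but an unanchored search is rejected with an error.
/// let input = Input::new("12345").anchored(Anchored::No);
/// assert!(dfa.try_search_fwd(&input).is_err());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```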
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum StartKind {
/// Support both anchored and unanchored searches.
Both,
/// Support only unanchored searches. Requesting an anchored search will
/// panic.
///
/// Note that even if an unanchored search is requested, the pattern itself
/// may still be anchored. For example, `^abc` will only match `abc` at the
/// start of a haystack. This will remain true, even if the regex engine
/// only supported unanchored searches.
Unanchored,
/// Support only anchored searches. Requesting an unanchored search will
/// panic.
Anchored,
}
impl StartKind {
pub(crate) fn from_bytes(
slice: &[u8],
) -> Result<(StartKind, usize), DeserializeError> {
wire::check_slice_len(slice, size_of::<u32>(), "start kind bytes")?;
let (n, nr) = wire::try_read_u32(slice, "start kind integer")?;
match n {
0 => Ok((StartKind::Both, nr)),
1 => Ok((StartKind::Unanchored, nr)),
2 => Ok((StartKind::Anchored, nr)),
_ => Err(DeserializeError::generic("unrecognized start kind")),
}
}
pub(crate) fn write_to<E: Endian>(
&self,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
let nwrite = self.write_to_len();
if dst.len() < nwrite {
return Err(SerializeError::buffer_too_small("start kind"));
}
let n = match *self {
StartKind::Both => 0,
StartKind::Unanchored => 1,
StartKind::Anchored => 2,
};
E::write_u32(n, dst);
Ok(nwrite)
}
pub(crate) fn write_to_len(&self) -> usize {
size_of::<u32>()
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn has_unanchored(&self) -> bool {
matches!(*self, StartKind::Both | StartKind::Unanchored)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn has_anchored(&self) -> bool {
matches!(*self, StartKind::Both | StartKind::Anchored)
}
}

4417
vendor/regex-automata/src/hybrid/dfa.rs vendored Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,241 @@
use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored};
/// An error that occurs when initial construction of a lazy DFA fails.
///
/// A build error can occur when insufficient cache capacity is configured or
/// if something about the NFA is unsupported. (For example, if one attempts
/// to build a lazy DFA without heuristic Unicode support but with an NFA that
/// contains a Unicode word boundary.)
///
/// This error does not provide many introspection capabilities. There are
/// generally only two things you can do with it:
///
/// * Obtain a human readable message via its `std::fmt::Display` impl.
/// * Access an underlying
/// [`nfa::thompson::BuildError`](crate::nfa::thompson::BuildError)
/// type from its `source` method via the `std::error::Error` trait. This error
/// only occurs when using convenience routines for building a lazy DFA
/// directly from a pattern string.
///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
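///
/// # Example
///
/// A small sketch (the pattern is only an illustration) of a construction
/// that fails because lazy DFAs do not support Unicode word boundaries
/// unless heuristic support is enabled:
///
/// ```
/// use regex_automata::hybrid::dfa::DFA;
///
/// // With Unicode mode enabled (the default), `\b` is a Unicode word
/// // boundary, which a lazy DFA cannot handle by default.
/// let result = DFA::new(r"\bfoo\b");
/// assert!(result.is_err());
/// if let Err(err) = result {
///     // The error's `Display` impl produces a human readable message.
///     println!("failed to build lazy DFA: {err}");
/// }
/// ```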
#[derive(Clone, Debug)]
pub struct BuildError {
kind: BuildErrorKind,
}
#[derive(Clone, Debug)]
enum BuildErrorKind {
NFA(nfa::thompson::BuildError),
InsufficientCacheCapacity { minimum: usize, given: usize },
InsufficientStateIDCapacity { err: LazyStateIDError },
Unsupported(&'static str),
}
impl BuildError {
pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError {
BuildError { kind: BuildErrorKind::NFA(err) }
}
pub(crate) fn insufficient_cache_capacity(
minimum: usize,
given: usize,
) -> BuildError {
BuildError {
kind: BuildErrorKind::InsufficientCacheCapacity { minimum, given },
}
}
pub(crate) fn insufficient_state_id_capacity(
err: LazyStateIDError,
) -> BuildError {
BuildError {
kind: BuildErrorKind::InsufficientStateIDCapacity { err },
}
}
pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError {
let msg = "cannot build lazy DFAs for regexes with Unicode word \
boundaries; switch to ASCII word boundaries, or \
heuristically enable Unicode word boundaries or use a \
different regex engine";
BuildError { kind: BuildErrorKind::Unsupported(msg) }
}
}
#[cfg(feature = "std")]
impl std::error::Error for BuildError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self.kind {
BuildErrorKind::NFA(ref err) => Some(err),
_ => None,
}
}
}
impl core::fmt::Display for BuildError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self.kind {
BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
BuildErrorKind::InsufficientCacheCapacity { minimum, given } => {
write!(
f,
"given cache capacity ({given}) is smaller than \
minimum required ({minimum})",
)
}
BuildErrorKind::InsufficientStateIDCapacity { ref err } => {
err.fmt(f)
}
BuildErrorKind::Unsupported(ref msg) => {
write!(f, "unsupported regex feature for DFAs: {msg}")
}
}
}
}
/// An error that can occur when computing the start state for a search.
///
/// Computing a start state can fail for a few reasons, either
/// based on incorrect configuration or even based on whether
/// the look-behind byte triggers a quit state. Typically
/// you do not need to handle this error if you're using
/// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward)
/// (or its reverse counterpart), as that routine automatically converts
/// `StartError` to a [`MatchError`](crate::MatchError) for you.
///
/// This error may be returned by the
/// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine.
///
/// This error implements the `std::error::Error` trait when the `std` feature
/// is enabled.
///
/// This error is marked as non-exhaustive. New variants may be added in a
/// semver compatible release.
#[non_exhaustive]
#[derive(Clone, Debug)]
pub enum StartError {
/// An error that occurs when cache inefficiency has dropped below the
/// configured heuristic thresholds.
Cache {
/// The underlying cache error that occurred.
err: CacheError,
},
/// An error that occurs when a starting configuration's look-behind byte
/// is in this DFA's quit set.
Quit {
/// The quit byte that was found.
byte: u8,
},
/// An error that occurs when the caller requests an anchored mode that
/// isn't supported by the DFA.
UnsupportedAnchored {
/// The anchored mode given that is unsupported.
mode: Anchored,
},
}
impl StartError {
pub(crate) fn cache(err: CacheError) -> StartError {
StartError::Cache { err }
}
pub(crate) fn quit(byte: u8) -> StartError {
StartError::Quit { byte }
}
pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError {
StartError::UnsupportedAnchored { mode }
}
}
#[cfg(feature = "std")]
impl std::error::Error for StartError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match *self {
StartError::Cache { ref err } => Some(err),
_ => None,
}
}
}
impl core::fmt::Display for StartError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match *self {
StartError::Cache { .. } => write!(
f,
"error computing start state because of cache inefficiency"
),
StartError::Quit { byte } => write!(
f,
"error computing start state because the look-behind byte \
{:?} triggered a quit state",
crate::util::escape::DebugByte(byte),
),
StartError::UnsupportedAnchored { mode: Anchored::Yes } => {
write!(
f,
"error computing start state because \
anchored searches are not supported or enabled"
)
}
StartError::UnsupportedAnchored { mode: Anchored::No } => {
write!(
f,
"error computing start state because \
unanchored searches are not supported or enabled"
)
}
StartError::UnsupportedAnchored {
mode: Anchored::Pattern(pid),
} => {
write!(
f,
"error computing start state because \
anchored searches for a specific pattern ({}) \
are not supported or enabled",
pid.as_usize(),
)
}
}
}
}
/// An error that occurs when cache usage has become inefficient.
///
/// One of the weaknesses of a lazy DFA is that it may need to clear its
/// cache repeatedly if it's not big enough. If this happens too much, then it
/// can slow searching down significantly. A mitigation to this is to use
/// heuristics to detect whether the cache is being used efficiently or not.
/// If not, then a lazy DFA can return a `CacheError`.
///
/// The default configuration of a lazy DFA in this crate is
/// set such that a `CacheError` will never occur. Instead,
/// callers must opt into this behavior with settings like
/// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count)
/// and
/// [`dfa::Config::minimum_bytes_per_state`](crate::hybrid::dfa::Config::minimum_bytes_per_state).
///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
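///
/// # Example
///
/// A minimal configuration sketch using the two knobs named above; the
/// thresholds are arbitrary values chosen for illustration:
///
/// ```
/// use regex_automata::hybrid::dfa::DFA;
///
/// // Opt into reporting cache inefficiency instead of silently retrying.
/// let dfa = DFA::builder()
///     .configure(
///         DFA::config()
///             .minimum_cache_clear_count(Some(5))
///             .minimum_bytes_per_state(Some(10)),
///     )
///     .build(r"[a-z]+")?;
/// let cache = dfa.create_cache();
/// // Searches with 'dfa' and 'cache' may now return an error (surfaced as
/// // a `MatchError`) if the cache keeps getting cleared too quickly.
/// # let _ = (dfa, cache);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```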
#[derive(Clone, Debug)]
pub struct CacheError(());
impl CacheError {
pub(crate) fn too_many_cache_clears() -> CacheError {
CacheError(())
}
pub(crate) fn bad_efficiency() -> CacheError {
CacheError(())
}
}
#[cfg(feature = "std")]
impl std::error::Error for CacheError {}
impl core::fmt::Display for CacheError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(f, "lazy DFA cache has been cleared too many times")
}
}

354
vendor/regex-automata/src/hybrid/id.rs vendored Normal file
View File

@@ -0,0 +1,354 @@
/// A state identifier specifically tailored for lazy DFAs.
///
/// A lazy state ID logically represents a pointer to a DFA state. In practice,
/// by limiting the number of DFA states it can address, it reserves some
/// bits of its representation to encode some additional information. That
/// additional information is called a "tag." That tag is used to record
/// whether the state it points to is an unknown, dead, quit, start or match
/// state.
///
/// When implementing a low level search routine with a lazy DFA, it is
/// necessary to query the type of the current state to know what to do:
///
/// * **Unknown** - The state has not yet been computed. The
/// parameters used to get this state ID must be re-passed to
/// [`DFA::next_state`](crate::hybrid::dfa::DFA::next_state), which will never
/// return an unknown state ID.
/// * **Dead** - A dead state only has transitions to itself. It indicates that
/// the search cannot do anything else and should stop with whatever result it
/// has.
/// * **Quit** - A quit state indicates that the automaton could not answer
/// whether a match exists or not. Correct search implementations must return a
/// [`MatchError::quit`](crate::MatchError::quit) when a DFA enters a quit
/// state.
/// * **Start** - A start state is a state in which a search can begin.
/// Lazy DFAs usually have more than one start state. Branching on
/// this isn't required for correctness, but a common optimization is
/// to run a prefilter when a search enters a start state. Note that
/// start states are *not* tagged automatically, and one must enable the
/// [`Config::specialize_start_states`](crate::hybrid::dfa::Config::specialize_start_states)
/// setting for start states to be tagged. The reason for this is
/// that a DFA search loop is usually written to execute a prefilter once it
/// enters a start state. But if there is no prefilter, this handling can be
/// quite disastrous as the DFA may ping-pong between the special handling code
/// and a possible optimized hot path for handling untagged states. When start
/// states aren't specialized, then they are untagged and remain in the hot
/// path.
/// * **Match** - A match state indicates that a match has been found.
/// Depending on the semantics of your search implementation, it may either
/// continue until the end of the haystack or a dead state, or it might quit
/// and return the match immediately.
///
/// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate
/// can be used to determine if a tag exists at all. This is useful to avoid
/// branching on all of the above types for every byte searched.
///
/// # Example
///
/// This example shows how `LazyStateID` can be used to implement a correct
/// search routine with minimal branching. In particular, this search routine
/// implements "leftmost" matching, which means that it doesn't immediately
/// stop once a match is found. Instead, it continues until it reaches a dead
/// state.
///
/// Notice also how a correct search implementation deals with
/// [`CacheError`](crate::hybrid::CacheError)s returned by some of
/// the lazy DFA routines. When a `CacheError` occurs, it returns
/// [`MatchError::gave_up`](crate::MatchError::gave_up).
///
/// ```
/// use regex_automata::{
/// hybrid::dfa::{Cache, DFA},
/// HalfMatch, MatchError, Input,
/// };
///
/// fn find_leftmost_first(
/// dfa: &DFA,
/// cache: &mut Cache,
/// haystack: &[u8],
/// ) -> Result<Option<HalfMatch>, MatchError> {
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack. Note that start states can never
/// // be match states (since DFAs in this crate delay matches by 1
/// // byte), so we don't need to check if the start state is a match.
/// let mut sid = dfa.start_state_forward(
/// cache,
/// &Input::new(haystack),
/// )?;
/// let mut last_match = None;
/// // Walk all the bytes in the haystack. We can quit early if we see
/// // a dead or a quit state. The former means the automaton will
/// // never transition to any other state. The latter means that the
/// // automaton entered a condition in which its search failed.
/// for (i, &b) in haystack.iter().enumerate() {
/// sid = dfa
/// .next_state(cache, sid, b)
/// .map_err(|_| MatchError::gave_up(i))?;
/// if sid.is_tagged() {
/// if sid.is_match() {
/// last_match = Some(HalfMatch::new(
/// dfa.match_pattern(cache, sid, 0),
/// i,
/// ));
/// } else if sid.is_dead() {
/// return Ok(last_match);
/// } else if sid.is_quit() {
/// // It is possible to enter into a quit state after
/// // observing a match has occurred. In that case, we
/// // should return the match instead of an error.
/// if last_match.is_some() {
/// return Ok(last_match);
/// }
/// return Err(MatchError::quit(b, i));
/// }
/// // Implementors may also want to check for start states and
/// // handle them differently for performance reasons. But it is
/// // not necessary for correctness. Note that in order to check
/// // for start states, you'll need to enable the
/// // 'specialize_start_states' config knob, otherwise start
/// // states will not be tagged.
/// }
/// }
/// // Matches are always delayed by 1 byte, so we must explicitly walk
/// // the special "EOI" transition at the end of the search.
/// sid = dfa
/// .next_eoi_state(cache, sid)
/// .map_err(|_| MatchError::gave_up(haystack.len()))?;
/// if sid.is_match() {
/// last_match = Some(HalfMatch::new(
/// dfa.match_pattern(cache, sid, 0),
/// haystack.len(),
/// ));
/// }
/// Ok(last_match)
/// }
///
/// // We use a greedy '+' operator to show how the search doesn't just stop
/// // once a match is detected. It continues extending the match. Using
/// // '[a-z]+?' would also work as expected and stop the search early.
/// // Greediness is built into the automaton.
/// let dfa = DFA::new(r"[a-z]+")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "123 foobar 4567".as_bytes();
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 10);
///
/// // Here's another example that tests our handling of the special
/// // EOI transition. This will fail to find a match if we don't call
/// // 'next_eoi_state' at the end of the search since the match isn't found
/// // until the final byte in the haystack.
/// let dfa = DFA::new(r"[0-9]{4}")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "123 foobar 4567".as_bytes();
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 15);
///
/// // And note that our search implementation above automatically works
/// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects
/// // the appropriate pattern ID for us.
/// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
/// let mut cache = dfa.create_cache();
/// let haystack = "123 foobar 4567".as_bytes();
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 1);
/// assert_eq!(mat.offset(), 3);
/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 7);
/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 1);
/// assert_eq!(mat.offset(), 5);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(
Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct LazyStateID(u32);
impl LazyStateID {
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
const MAX_BIT: usize = 31;
#[cfg(target_pointer_width = "16")]
const MAX_BIT: usize = 15;
const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT);
const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1);
const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2);
const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3);
const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4);
const MAX: usize = LazyStateID::MASK_MATCH - 1;
/// Create a new lazy state ID.
///
/// If the given identifier exceeds [`LazyStateID::MAX`], then this returns
/// an error.
#[inline]
pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> {
if id > LazyStateID::MAX {
let attempted = u64::try_from(id).unwrap();
return Err(LazyStateIDError { attempted });
}
Ok(LazyStateID::new_unchecked(id))
}
/// Create a new lazy state ID without checking whether the given value
/// exceeds [`LazyStateID::MAX`].
///
/// While this is unchecked, providing an incorrect value must never
/// sacrifice memory safety.
#[inline]
const fn new_unchecked(id: usize) -> LazyStateID {
// FIXME: Use as_u32() once const functions in traits are stable.
LazyStateID(id as u32)
}
/// Return this lazy state ID as an untagged `usize`.
///
/// If this lazy state ID is tagged, then the usize returned is the state
/// ID without the tag. If the ID was not tagged, then the usize returned
/// is equivalent to the state ID.
#[inline]
pub(crate) fn as_usize_untagged(&self) -> usize {
self.as_usize_unchecked() & LazyStateID::MAX
}
/// Return this lazy state ID as its raw internal `usize` value, which may
/// be tagged (and thus greater than LazyStateID::MAX).
#[inline]
pub(crate) const fn as_usize_unchecked(&self) -> usize {
// FIXME: Use as_usize() once const functions in traits are stable.
self.0 as usize
}
#[inline]
pub(crate) const fn to_unknown(&self) -> LazyStateID {
LazyStateID::new_unchecked(
self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN,
)
}
#[inline]
pub(crate) const fn to_dead(&self) -> LazyStateID {
LazyStateID::new_unchecked(
self.as_usize_unchecked() | LazyStateID::MASK_DEAD,
)
}
#[inline]
pub(crate) const fn to_quit(&self) -> LazyStateID {
LazyStateID::new_unchecked(
self.as_usize_unchecked() | LazyStateID::MASK_QUIT,
)
}
/// Return this lazy state ID as a state ID that is tagged as a start
/// state.
#[inline]
pub(crate) const fn to_start(&self) -> LazyStateID {
LazyStateID::new_unchecked(
self.as_usize_unchecked() | LazyStateID::MASK_START,
)
}
/// Return this lazy state ID as a lazy state ID that is tagged as a match
/// state.
#[inline]
pub(crate) const fn to_match(&self) -> LazyStateID {
LazyStateID::new_unchecked(
self.as_usize_unchecked() | LazyStateID::MASK_MATCH,
)
}
/// Return true if and only if this lazy state ID is tagged.
///
/// When a lazy state ID is tagged, then one can conclude that it is one
/// of a match, start, dead, quit or unknown state.
#[inline]
pub const fn is_tagged(&self) -> bool {
self.as_usize_unchecked() > LazyStateID::MAX
}
/// Return true if and only if this represents a lazy state ID that is
/// "unknown." That is, the state has not yet been created. When a caller
/// sees this state ID, it generally means that a state has to be computed
/// in order to proceed.
#[inline]
pub const fn is_unknown(&self) -> bool {
self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0
}
/// Return true if and only if this represents a dead state. A dead state
/// is a state that can never transition to any other state except the
/// dead state. When a dead state is seen, it generally indicates that a
/// search should stop.
#[inline]
pub const fn is_dead(&self) -> bool {
self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0
}
/// Return true if and only if this represents a quit state. A quit state
/// is a state that is representationally equivalent to a dead state,
/// except it indicates the automaton has reached a point at which it can
/// no longer determine whether a match exists or not. In general, this
/// indicates an error during search and the caller must either pass this
/// error up or use a different search technique.
#[inline]
pub const fn is_quit(&self) -> bool {
self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0
}
/// Return true if and only if this lazy state ID has been tagged as a
/// start state.
///
/// Note that if
/// [`Config::specialize_start_states`](crate::hybrid::dfa::Config) is
/// disabled (which is the default), then this will always return false
/// since start states won't be tagged.
#[inline]
pub const fn is_start(&self) -> bool {
self.as_usize_unchecked() & LazyStateID::MASK_START > 0
}
/// Return true if and only if this lazy state ID has been tagged as a
/// match state.
#[inline]
pub const fn is_match(&self) -> bool {
self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0
}
}
/// This error occurs when a lazy state ID could not be constructed.
///
/// This occurs when given an integer exceeding the maximum lazy state ID
/// value.
///
/// When the `std` feature is enabled, this implements the `Error` trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct LazyStateIDError {
attempted: u64,
}
impl LazyStateIDError {
/// Returns the value that failed to construct a lazy state ID.
pub(crate) fn attempted(&self) -> u64 {
self.attempted
}
}
#[cfg(feature = "std")]
impl std::error::Error for LazyStateIDError {}
impl core::fmt::Display for LazyStateIDError {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(
f,
"failed to create LazyStateID from {:?}, which exceeds {:?}",
self.attempted(),
LazyStateID::MAX,
)
}
}

144
vendor/regex-automata/src/hybrid/mod.rs vendored Normal file
View File

@@ -0,0 +1,144 @@
/*!
A module for building and searching with lazy deterministic finite automata
(DFAs).
Like other modules in this crate, lazy DFAs support a rich regex syntax with
Unicode features. The key feature of a lazy DFA is that it builds itself
incrementally during search, and never uses more than a configured capacity of
memory. Thus, when searching with a lazy DFA, one must supply a mutable "cache"
in which the actual DFA's transition table is stored.
If you're looking for fully compiled DFAs, then please see the top-level
[`dfa` module](crate::dfa).
# Overview
This section gives a brief overview of the primary types in this module:
* A [`regex::Regex`] provides a way to search for matches of a regular
expression using lazy DFAs. This includes iterating over matches with both the
start and end positions of each match.
* A [`dfa::DFA`] provides direct low level access to a lazy DFA.
# Example: basic regex searching
This example shows how to compile a regex using the default configuration
and then use it to find matches in a byte string:
```
use regex_automata::{hybrid::regex::Regex, Match};
let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
let mut cache = re.create_cache();
let haystack = "2018-12-24 2016-10-08";
let matches: Vec<Match> = re.find_iter(&mut cache, haystack).collect();
assert_eq!(matches, vec![
Match::must(0, 0..10),
Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
# Example: searching with multiple regexes
The lazy DFAs in this module all fully support searching with multiple regexes
simultaneously. You can use this support with standard leftmost-first style
searching to find non-overlapping matches:
```
# if cfg!(miri) { return Ok(()); } // miri takes too long
use regex_automata::{hybrid::regex::Regex, Match};
let re = Regex::new_many(&[r"\w+", r"\S+"])?;
let mut cache = re.create_cache();
let haystack = "@foo bar";
let matches: Vec<Match> = re.find_iter(&mut cache, haystack).collect();
assert_eq!(matches, vec![
Match::must(1, 0..4),
Match::must(0, 5..8),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
# When should I use this?
Generally speaking, if you can abide the use of mutable state during search,
and you don't need things like capturing groups or Unicode word boundary
support in non-ASCII text, then a lazy DFA is likely a robust choice with
respect to both search speed and memory usage. Note however that its speed
may be worse than a general purpose regex engine if you don't select a good
[prefilter](crate::util::prefilter).
If you know ahead of time that your pattern would result in a very large DFA
if it was fully compiled, it may be better to use an NFA simulation instead
of a lazy DFA. Either that, or increase the cache capacity of your lazy DFA
to something that is big enough to hold the state machine (likely through
experimentation). The issue here is that if the cache is too small, then it
could wind up being reset too frequently and this might decrease searching
speed significantly.
# Differences with fully compiled DFAs
A [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) and a
[`dfa::regex::Regex`](crate::dfa::regex::Regex) both have the same capabilities
(and similarly for their underlying DFAs), but they achieve them through
different means. The main difference is that a hybrid or "lazy" regex builds
its DFA lazily during search, whereas a fully compiled regex will build its
DFA at construction time. While building a DFA at search time might sound like
it's slow, it tends to work out because most bytes seen during a search
reuse pre-built parts of the DFA, and thus it can be almost as fast as a fully
compiled DFA. The main downside is that searching requires mutable space to
store the DFA, and, in the worst case, a search can result in a new state being
created for each byte seen, which would make searching quite a bit slower.
A fully compiled DFA never has to worry about searches being slower once
it's built. (Aside from, say, the transition table being so large that it
is subject to harsh CPU cache effects.) However, of course, building a full
DFA can be quite time consuming and memory hungry. Particularly when large
Unicode character classes are used, which tend to translate into very large
DFAs.
A lazy DFA strikes a nice balance _in practice_, particularly in the
presence of Unicode mode, by only building what is needed. It avoids the
worst case exponential time complexity of DFA compilation by guaranteeing that
it will only build at most one state per byte searched. While the worst
case here can lead to a very high constant, it will never be exponential.
# Syntax
This module supports the same syntax as the `regex` crate, since they share the
same parser. You can find an exhaustive list of supported syntax in the
[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
There are two things that are not supported by the lazy DFAs in this module:
* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
of them) can only find the offsets of an entire match, but cannot resolve
the offsets of each capturing group. This is because DFAs do not have the
expressive power necessary. Note that it is okay to build a lazy DFA from an
NFA that contains capture groups. The capture groups will simply be ignored.
* Unicode word boundaries. These present particularly difficult challenges for
DFA construction and would result in an explosion in the number of states.
One can enable [`dfa::Config::unicode_word_boundary`] though, which provides
heuristic support for Unicode word boundaries that only works on ASCII text.
Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
on any input. (A short sketch of this appears at the end of this section.)
There are no plans to lift either of these limitations.
Note that these restrictions are identical to the restrictions on fully
compiled DFAs.
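As a small sketch (not from the upstream docs) of the ASCII word boundary
workaround mentioned above:

```
use regex_automata::{hybrid::regex::Regex, Match};

let re = Regex::new(r"(?-u:\b)\w+(?-u:\b)")?;
let mut cache = re.create_cache();
// "hello" is surrounded by ASCII word boundaries, so this works even
// though Unicode word boundaries are unsupported by lazy DFAs.
assert_eq!(Some(Match::must(0, 0..5)), re.find(&mut cache, "hello world"));
# Ok::<(), Box<dyn std::error::Error>>(())
```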
*/
pub use self::{
error::{BuildError, CacheError, StartError},
id::LazyStateID,
};
pub mod dfa;
mod error;
mod id;
pub mod regex;
mod search;

895
vendor/regex-automata/src/hybrid/regex.rs vendored Normal file
View File

@@ -0,0 +1,895 @@
/*!
A lazy DFA backed `Regex`.
This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements
convenience routines you might have come to expect, such as finding a match
and iterating over all non-overlapping matches. This `Regex` type is limited
in its capabilities to what a lazy DFA can provide. Therefore, APIs involving
capturing groups, for example, are not provided.
Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
finds the end offset of a match, whereas the other is a "reverse" DFA that
finds the start offset of a match.
See the [parent module](crate::hybrid) for examples.
*/
use crate::{
hybrid::{
dfa::{self, DFA},
error::BuildError,
},
nfa::thompson,
util::{
iter,
search::{Anchored, Input, Match, MatchError, MatchKind},
},
};
/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs")
/// for searching.
///
/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a
/// "reverse" DFA. The forward DFA is responsible for detecting the end of
/// a match while the reverse DFA is responsible for detecting the start
/// of a match. Thus, in order to find the bounds of any given match, a
/// forward search must first be run followed by a reverse search. A match
/// found by the forward DFA guarantees that the reverse DFA will also find
/// a match.
///
/// # Fallibility
///
/// Most of the search routines defined on this type will _panic_ when the
/// underlying search fails. This might be because the DFA gave up because it
/// saw a quit byte, whether configured explicitly or via heuristic Unicode
word boundary support, although neither is enabled by default. It might
/// also fail if the underlying DFA determines it isn't making effective use of
/// the cache (which also never happens by default). Or it might fail because
/// an invalid `Input` configuration is given, for example, with an unsupported
/// [`Anchored`] mode.
///
/// If you need to handle these error cases instead of allowing them to trigger
/// a panic, then the lower level [`Regex::try_search`] provides a fallible API
/// that never panics.
///
/// # Example
///
/// This example shows how to cause a search to terminate if it sees a
/// `\n` byte, and handle the error returned. This could be useful if, for
/// example, you wanted to prevent a user supplied pattern from matching
/// across a line boundary.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError};
///
/// let re = Regex::builder()
/// .dfa(dfa::Config::new().quit(b'\n', true))
/// .build(r"foo\p{any}+bar")?;
/// let mut cache = re.create_cache();
///
/// let input = Input::new("foo\nbar");
/// // Normally this would produce a match, since \p{any} contains '\n'.
/// // But since we instructed the automaton to enter a quit state if a
/// // '\n' is observed, this produces a match error instead.
/// let expected = MatchError::quit(b'\n', 3);
/// let got = re.try_search(&mut cache, &input).unwrap_err();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Debug)]
pub struct Regex {
/// The forward lazy DFA. This can only find the end of a match.
forward: DFA,
/// The reverse lazy DFA. This can only find the start of a match.
///
/// This is built with 'all' match semantics (instead of leftmost-first)
/// so that it always finds the longest possible match (which corresponds
/// to the leftmost starting position). It is also compiled as an anchored
/// matcher and has 'starts_for_each_pattern' enabled. Including starting
/// states for each pattern is necessary to ensure that we only look for
/// matches of a pattern that matched in the forward direction. Otherwise,
/// we might wind up finding the "leftmost" starting position of a totally
/// different pattern!
reverse: DFA,
}
/// Convenience routines for regex and cache construction.
impl Regex {
/// Parse the given regular expression using the default configuration and
/// return the corresponding regex.
///
/// If you want a non-default configuration, then use the [`Builder`] to
/// set your own configuration.
///
/// # Example
///
/// ```
/// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re = Regex::new("foo[0-9]+bar")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
/// Some(Match::must(0, 3..14)),
/// re.find(&mut cache, "zzzfoo12345barzzz"),
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "syntax")]
pub fn new(pattern: &str) -> Result<Regex, BuildError> {
Regex::builder().build(pattern)
}
/// Like `new`, but parses multiple patterns into a single "multi regex."
/// This similarly uses the default regex configuration.
///
/// # Example
///
/// ```
/// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
/// let mut cache = re.create_cache();
///
/// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
/// assert_eq!(Some(Match::must(0, 0..3)), it.next());
/// assert_eq!(Some(Match::must(1, 4..5)), it.next());
/// assert_eq!(Some(Match::must(0, 6..9)), it.next());
/// assert_eq!(Some(Match::must(1, 10..14)), it.next());
/// assert_eq!(Some(Match::must(1, 15..16)), it.next());
/// assert_eq!(Some(Match::must(0, 17..21)), it.next());
/// assert_eq!(None, it.next());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "syntax")]
pub fn new_many<P: AsRef<str>>(
patterns: &[P],
) -> Result<Regex, BuildError> {
Regex::builder().build_many(patterns)
}
/// Return a builder for configuring the construction of a `Regex`.
///
/// This is a convenience routine to avoid needing to import the
/// [`Builder`] type in common cases.
///
/// # Example
///
/// This example shows how to use the builder to disable UTF-8 mode
/// everywhere.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
/// hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
/// };
///
/// let re = Regex::builder()
/// .syntax(syntax::Config::new().utf8(false))
/// .thompson(thompson::Config::new().utf8(false))
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let mut cache = re.create_cache();
///
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
/// let expected = Some(Match::must(0, 1..9));
/// let got = re.find(&mut cache, haystack);
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn builder() -> Builder {
Builder::new()
}
/// Create a new cache for this `Regex`.
///
/// The cache returned should only be used for searches for this
/// `Regex`. If you want to reuse the cache for another `Regex`, then
/// you must call [`Cache::reset`] with that `Regex` (or, equivalently,
/// [`Regex::reset_cache`]).
pub fn create_cache(&self) -> Cache {
Cache::new(self)
}
/// Reset the given cache such that it can be used for searching with
/// this `Regex` (and only this `Regex`).
///
/// A cache reset permits reusing memory already allocated in this cache
/// with a different `Regex`.
///
/// Resetting a cache sets its "clear count" to 0. This is relevant if the
/// `Regex` has been configured to "give up" after it has cleared the cache
/// a certain number of times.
///
/// # Example
///
/// This shows how to re-purpose a cache for use with a different `Regex`.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re1 = Regex::new(r"\w")?;
/// let re2 = Regex::new(r"\W")?;
///
/// let mut cache = re1.create_cache();
/// assert_eq!(
/// Some(Match::must(0, 0..2)),
/// re1.find(&mut cache, "Δ"),
/// );
///
/// // Using 'cache' with re2 is not allowed. It may result in panics or
/// // incorrect results. In order to re-purpose the cache, we must reset
/// // it with the Regex we'd like to use it with.
/// //
/// // Similarly, after this reset, using the cache with 're1' is also not
/// // allowed.
/// re2.reset_cache(&mut cache);
/// assert_eq!(
/// Some(Match::must(0, 0..3)),
/// re2.find(&mut cache, "☃"),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn reset_cache(&self, cache: &mut Cache) {
self.forward().reset_cache(&mut cache.forward);
self.reverse().reset_cache(&mut cache.reverse);
}
}
/// Standard infallible search routines for finding and iterating over matches.
impl Regex {
/// Returns true if and only if this regex matches the given haystack.
///
/// This routine may short circuit if it knows that scanning future input
/// will never lead to a different result. In particular, if the underlying
/// DFA enters a match state or a dead state, then this routine will return
/// `true` or `false`, respectively, without inspecting any future input.
///
/// # Panics
///
/// This routine panics if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. This is not enabled by default, although it is typically a good
/// idea to enable it.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search panics, callers cannot know whether a match exists or
/// not.
///
/// Use [`Regex::try_search`] if you want to handle these error conditions.
///
/// # Example
///
/// ```
/// use regex_automata::hybrid::regex::Regex;
///
/// let re = Regex::new("foo[0-9]+bar")?;
/// let mut cache = re.create_cache();
///
/// assert!(re.is_match(&mut cache, "foo12345bar"));
/// assert!(!re.is_match(&mut cache, "foobar"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn is_match<'h, I: Into<Input<'h>>>(
&self,
cache: &mut Cache,
input: I,
) -> bool {
// Not only can we do an "earliest" search, but we can avoid doing a
// reverse scan too.
self.forward()
.try_search_fwd(&mut cache.forward, &input.into().earliest(true))
.unwrap()
.is_some()
}
/// Returns the start and end offset of the leftmost match. If no match
/// exists, then `None` is returned.
///
/// # Panics
///
/// This routine panics if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. This is not enabled by default, although it is typically a good
/// idea to enable it.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search panics, callers cannot know whether a match exists or
/// not.
///
/// Use [`Regex::try_search`] if you want to handle these error conditions.
///
/// # Example
///
/// ```
/// use regex_automata::{Match, hybrid::regex::Regex};
///
/// let re = Regex::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
/// Some(Match::must(0, 3..11)),
/// re.find(&mut cache, "zzzfoo12345zzz"),
/// );
///
/// // Even though a match is found after reading the first byte (`a`),
/// // the default leftmost-first match semantics demand that we find the
/// // earliest match that prefers earlier parts of the pattern over later
/// // parts.
/// let re = Regex::new("abc|a")?;
/// let mut cache = re.create_cache();
/// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn find<'h, I: Into<Input<'h>>>(
&self,
cache: &mut Cache,
input: I,
) -> Option<Match> {
self.try_search(cache, &input.into()).unwrap()
}
/// Returns an iterator over all non-overlapping leftmost matches in the
/// given bytes. If no match exists, then the iterator yields no elements.
///
/// # Panics
///
/// This routine panics if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. This is not enabled by default, although it is typically a good
/// idea to enable it.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search panics, callers cannot know whether a match exists or
/// not.
///
/// The above conditions also apply to the returned iterator. For
/// example, if the lazy DFA gives up or quits during a search using this
/// method, then a panic will occur during iteration.
///
/// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher)
/// if you want to handle these error conditions.
///
/// # Example
///
/// ```
/// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re = Regex::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
///
/// let text = "foo1 foo12 foo123";
/// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
/// assert_eq!(matches, vec![
/// Match::must(0, 0..4),
/// Match::must(0, 5..10),
/// Match::must(0, 11..17),
/// ]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
&'r self,
cache: &'c mut Cache,
input: I,
) -> FindMatches<'r, 'c, 'h> {
let it = iter::Searcher::new(input.into());
FindMatches { re: self, cache, it }
}
}
/// Lower level "search" primitives that accept a `&Input` for cheap reuse
/// and return an error if one occurs instead of panicking.
impl Regex {
/// Returns the start and end offset of the leftmost match. If no match
/// exists, then `None` is returned.
///
/// This is like [`Regex::find`] but with two differences:
///
/// 1. It is not generic over `Into<Input>` and instead accepts a
/// `&Input`. This permits reusing the same `Input` for multiple searches
/// without needing to create a new one. This _may_ help with latency.
/// 2. It returns an error if the search could not complete, whereas
/// [`Regex::find`] will panic.
///
/// # Errors
///
/// This routine errors if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. This is not enabled by default, although it is typically a good
/// idea to enable it.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search returns an error, callers cannot know whether a match
/// exists or not.
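///
/// # Example
///
/// This is a brief sketch (not part of the upstream docs) of a fallible
/// search. With the default configuration used here, the search cannot
/// actually fail, so the error is simply propagated with `?`:
///
/// ```
/// use regex_automata::{hybrid::regex::Regex, Input, Match};
///
/// let re = Regex::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
/// let input = Input::new("zzzfoo12345zzz");
/// assert_eq!(Some(Match::must(0, 3..11)), re.try_search(&mut cache, &input)?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```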
#[inline]
pub fn try_search(
&self,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<Option<Match>, MatchError> {
let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
let end = match self.forward().try_search_fwd(fcache, input)? {
None => return Ok(None),
Some(end) => end,
};
// This special cases an empty match at the beginning of the search. If
// our end matches our start, then since a reverse DFA can't match past
// the start, it must follow that our starting position is also our end
// position. So short circuit and skip the reverse search.
if input.start() == end.offset() {
return Ok(Some(Match::new(
end.pattern(),
end.offset()..end.offset(),
)));
}
// We can also skip the reverse search if we know our search was
// anchored. This occurs either when the input config is anchored or
// when we know the regex itself is anchored. In this case, we know the
// start of the match, if one is found, must be the start of the
// search.
if self.is_anchored(input) {
return Ok(Some(Match::new(
end.pattern(),
input.start()..end.offset(),
)));
}
// N.B. I have tentatively convinced myself that it isn't necessary
// to specify the specific pattern for the reverse search since the
// reverse search will always find the same pattern to match as the
// forward search. But I lack a rigorous proof. Why not just provide
// the pattern anyway? Well, if it is needed, then leaving it out
// gives us a chance to find a witness. (Also, if we don't need to
// specify the pattern, then we don't need to build the reverse DFA
// with 'starts_for_each_pattern' enabled. It doesn't matter too much
// for the lazy DFA, but does make the overall DFA bigger.)
//
// We also need to be careful to disable 'earliest' for the reverse
// search, since it could be enabled for the forward search. In the
// reverse case, to satisfy "leftmost" criteria, we need to match as
// much as we can. We also need to be careful to make the search
// anchored. We don't want the reverse search to report any matches
// other than the one beginning at the end of our forward search.
let revsearch = input
.clone()
.span(input.start()..end.offset())
.anchored(Anchored::Yes)
.earliest(false);
let start = self
.reverse()
.try_search_rev(rcache, &revsearch)?
.expect("reverse search must match if forward search does");
debug_assert_eq!(
start.pattern(),
end.pattern(),
"forward and reverse search must match same pattern",
);
debug_assert!(start.offset() <= end.offset());
Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
}
/// Returns true if either the given input specifies an anchored search
/// or if the underlying NFA is always anchored.
fn is_anchored(&self, input: &Input<'_>) -> bool {
match input.get_anchored() {
Anchored::No => {
self.forward().get_nfa().is_always_start_anchored()
}
Anchored::Yes | Anchored::Pattern(_) => true,
}
}
}
/// Non-search APIs for querying information about the regex and setting a
/// prefilter.
impl Regex {
/// Return the underlying lazy DFA responsible for forward matching.
///
/// This is useful for accessing the underlying lazy DFA and using it
/// directly if the situation calls for it.
pub fn forward(&self) -> &DFA {
&self.forward
}
/// Return the underlying lazy DFA responsible for reverse matching.
///
/// This is useful for accessing the underlying lazy DFA and using it
/// directly if the situation calls for it.
pub fn reverse(&self) -> &DFA {
&self.reverse
}
/// Returns the total number of patterns matched by this regex.
///
/// # Example
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::hybrid::regex::Regex;
///
/// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
/// assert_eq!(3, re.pattern_len());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn pattern_len(&self) -> usize {
assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
self.forward().pattern_len()
}
}
/// An iterator over all non-overlapping matches for an infallible search.
///
/// The iterator yields a [`Match`] value until no more matches could be found.
/// If the underlying regex engine returns an error, then a panic occurs.
///
/// The lifetime parameters are as follows:
///
/// * `'r` represents the lifetime of the regex object.
/// * `'h` represents the lifetime of the haystack being searched.
/// * `'c` represents the lifetime of the regex cache.
///
/// This iterator can be created with the [`Regex::find_iter`] method.
#[derive(Debug)]
pub struct FindMatches<'r, 'c, 'h> {
re: &'r Regex,
cache: &'c mut Cache,
it: iter::Searcher<'h>,
}
impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
type Item = Match;
#[inline]
fn next(&mut self) -> Option<Match> {
let FindMatches { re, ref mut cache, ref mut it } = *self;
it.advance(|input| re.try_search(cache, input))
}
}
/// A cache represents a partially computed forward and reverse DFA.
///
/// A cache is the key component that differentiates a classical DFA and a
/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
/// complete transition table that can handle all possible inputs, a hybrid
/// NFA/DFA starts with an empty transition table and builds only the parts
/// required during search. The parts that are built are stored in a cache. For
/// this reason, a cache is a required parameter for nearly every operation on
/// a [`Regex`].
///
/// Caches can be created from their corresponding `Regex` via
/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`
/// that created it, or the `Regex` that was most recently used to reset it
/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in
/// panics or incorrect results.
#[derive(Debug, Clone)]
pub struct Cache {
forward: dfa::Cache,
reverse: dfa::Cache,
}
impl Cache {
/// Create a new cache for the given `Regex`.
///
/// The cache returned should only be used for searches for the given
/// `Regex`. If you want to reuse the cache for another `Regex`, then you
/// must call [`Cache::reset`] with that `Regex`.
pub fn new(re: &Regex) -> Cache {
let forward = dfa::Cache::new(re.forward());
let reverse = dfa::Cache::new(re.reverse());
Cache { forward, reverse }
}
/// Reset this cache such that it can be used for searching with the given
/// `Regex` (and only that `Regex`).
///
/// A cache reset permits reusing memory already allocated in this cache
/// with a different `Regex`.
///
/// Resetting a cache sets its "clear count" to 0. This is relevant if the
/// `Regex` has been configured to "give up" after it has cleared the cache
/// a certain number of times.
///
/// # Example
///
/// This shows how to re-purpose a cache for use with a different `Regex`.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re1 = Regex::new(r"\w")?;
/// let re2 = Regex::new(r"\W")?;
///
/// let mut cache = re1.create_cache();
/// assert_eq!(
/// Some(Match::must(0, 0..2)),
/// re1.find(&mut cache, "Δ"),
/// );
///
/// // Using 'cache' with re2 is not allowed. It may result in panics or
/// // incorrect results. In order to re-purpose the cache, we must reset
/// // it with the Regex we'd like to use it with.
/// //
/// // Similarly, after this reset, using the cache with 're1' is also not
/// // allowed.
/// cache.reset(&re2);
/// assert_eq!(
/// Some(Match::must(0, 0..3)),
/// re2.find(&mut cache, "☃"),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn reset(&mut self, re: &Regex) {
self.forward.reset(re.forward());
self.reverse.reset(re.reverse());
}
/// Return a reference to the forward cache.
pub fn forward(&mut self) -> &dfa::Cache {
&self.forward
}
/// Return a reference to the reverse cache.
pub fn reverse(&mut self) -> &dfa::Cache {
&self.reverse
}
/// Return a mutable reference to the forward cache.
///
/// If you need mutable references to both the forward and reverse caches,
/// then use [`Cache::as_parts_mut`].
pub fn forward_mut(&mut self) -> &mut dfa::Cache {
&mut self.forward
}
/// Return a mutable reference to the reverse cache.
///
/// If you need mutable references to both the forward and reverse caches,
/// then use [`Cache::as_parts_mut`].
pub fn reverse_mut(&mut self) -> &mut dfa::Cache {
&mut self.reverse
}
/// Return references to the forward and reverse caches, respectively.
pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) {
(&self.forward, &self.reverse)
}
/// Return mutable references to the forward and reverse caches,
/// respectively.
pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) {
(&mut self.forward, &mut self.reverse)
}
/// Returns the heap memory usage, in bytes, as a sum of the forward and
/// reverse lazy DFA caches.
///
/// This does **not** include the stack size used up by this cache. To
/// compute that, use `std::mem::size_of::<Cache>()`.
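///
/// # Example
///
/// A tiny sketch (not from the upstream docs) showing how the heap and
/// stack costs of a cache can be combined into a total footprint:
///
/// ```
/// use regex_automata::hybrid::regex::{Cache, Regex};
///
/// let re = Regex::new("foo")?;
/// let cache = re.create_cache();
/// // Heap usage of both the forward and reverse lazy DFA caches...
/// let heap = cache.memory_usage();
/// // ...plus the stack size of the Cache value itself.
/// let total = heap + std::mem::size_of::<Cache>();
/// assert!(total > 0);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```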
pub fn memory_usage(&self) -> usize {
self.forward.memory_usage() + self.reverse.memory_usage()
}
}
/// A builder for a regex based on a hybrid NFA/DFA.
///
/// This builder permits configuring options for the syntax of a pattern, the
/// NFA construction, the lazy DFA construction and finally the regex searching
/// itself. This builder is different from a general purpose regex builder
/// in that it permits fine-grained configuration of the construction process.
/// The trade-off for this is complexity, and the possibility of setting a
/// configuration that might not make sense. For example, there are two
/// different UTF-8 modes:
///
/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
/// whether the pattern itself can contain sub-expressions that match invalid
/// UTF-8.
/// * [`thompson::Config::utf8`] controls how the regex iterators themselves
/// advance the starting position of the next search when a match with zero
/// length is found.
///
/// Generally speaking, callers will want to either enable all of these or
/// disable all of these.
///
/// Internally, building a regex requires building two hybrid NFA/DFAs,
/// where one is responsible for finding the end of a match and the other is
/// responsible for finding the start of a match. If you only need to detect
/// whether something matched, or only the end of a match, then you should use
/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper
/// than building two of them.
///
/// # Example
///
/// This example shows how to disable UTF-8 mode in the syntax and the regex
/// itself. This is generally what you want for matching on arbitrary bytes.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
/// hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
/// };
///
/// let re = Regex::builder()
/// .syntax(syntax::Config::new().utf8(false))
/// .thompson(thompson::Config::new().utf8(false))
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let mut cache = re.create_cache();
///
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
/// let expected = Some(Match::must(0, 1..9));
/// let got = re.find(&mut cache, haystack);
/// assert_eq!(expected, got);
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
/// // but the subsequent `.*` does not! Disabling UTF-8
/// // on the syntax permits this.
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct Builder {
dfa: dfa::Builder,
}
impl Builder {
/// Create a new regex builder with the default configuration.
pub fn new() -> Builder {
Builder { dfa: DFA::builder() }
}
/// Build a regex from the given pattern.
///
/// If there was a problem parsing or compiling the pattern, then an error
/// is returned.
#[cfg(feature = "syntax")]
pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
self.build_many(&[pattern])
}
/// Build a regex from the given patterns.
#[cfg(feature = "syntax")]
pub fn build_many<P: AsRef<str>>(
&self,
patterns: &[P],
) -> Result<Regex, BuildError> {
let forward = self.dfa.build_many(patterns)?;
let reverse = self
.dfa
.clone()
.configure(
DFA::config()
.prefilter(None)
.specialize_start_states(false)
.match_kind(MatchKind::All),
)
.thompson(thompson::Config::new().reverse(true))
.build_many(patterns)?;
Ok(self.build_from_dfas(forward, reverse))
}
/// Build a regex from its component forward and reverse hybrid NFA/DFAs.
///
/// This is useful when you've built a forward and reverse lazy DFA
/// separately, and want to combine them into a single regex. Once built,
/// the individual DFAs given can still be accessed via [`Regex::forward`]
/// and [`Regex::reverse`].
///
/// It is important that the reverse lazy DFA be compiled under the
/// following conditions:
///
/// * It should use [`MatchKind::All`] semantics.
/// * It should match in reverse.
/// * Otherwise, its configuration should match the forward DFA.
///
/// If these conditions aren't satisfied, then the behavior of searches is
/// unspecified.
///
/// Note that when using this constructor, no configuration is applied.
/// Since this routine provides the DFAs to the builder, there is no
/// opportunity to apply other configuration options.
///
/// # Example
///
/// This shows how to build individual lazy forward and reverse DFAs, and
/// then combine them into a single `Regex`.
///
/// ```
/// use regex_automata::{
/// hybrid::{dfa::DFA, regex::Regex},
/// nfa::thompson,
/// MatchKind,
/// };
///
/// let fwd = DFA::new(r"foo[0-9]+")?;
/// let rev = DFA::builder()
/// .configure(DFA::config().match_kind(MatchKind::All))
/// .thompson(thompson::Config::new().reverse(true))
/// .build(r"foo[0-9]+")?;
///
/// let re = Regex::builder().build_from_dfas(fwd, rev);
/// let mut cache = re.create_cache();
/// assert_eq!(true, re.is_match(&mut cache, "foo123"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {
Regex { forward, reverse }
}
/// Set the syntax configuration for this builder using
/// [`syntax::Config`](crate::util::syntax::Config).
///
/// This permits setting things like case insensitivity, Unicode and multi
/// line mode.
#[cfg(feature = "syntax")]
pub fn syntax(
&mut self,
config: crate::util::syntax::Config,
) -> &mut Builder {
self.dfa.syntax(config);
self
}
/// Set the Thompson NFA configuration for this builder using
/// [`nfa::thompson::Config`](thompson::Config).
///
/// This permits setting things like whether additional time should be
/// spent shrinking the size of the NFA.
#[cfg(feature = "syntax")]
pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
self.dfa.thompson(config);
self
}
/// Set the lazy DFA compilation configuration for this builder using
/// [`dfa::Config`].
///
/// This permits setting things like whether Unicode word boundaries should
/// be heuristically supported or how the cache should behave.
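///
/// # Example
///
/// A small sketch (not from the upstream docs) that enables heuristic
/// Unicode word boundary support. Note that with this setting, if the
/// pattern contains a Unicode word boundary, then a search will quit (and
/// infallible routines like [`Regex::find`] will panic) when a non-ASCII
/// byte is seen:
///
/// ```
/// use regex_automata::{hybrid::{dfa, regex::Regex}, Match};
///
/// let re = Regex::builder()
///     .dfa(dfa::Config::new().unicode_word_boundary(true))
///     .build(r"\b\w+\b")?;
/// let mut cache = re.create_cache();
/// // The haystack is all ASCII, so the heuristic never gives up here.
/// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "foo bar"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```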
pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder {
self.dfa.configure(config);
self
}
}
impl Default for Builder {
fn default() -> Builder {
Builder::new()
}
}

802
vendor/regex-automata/src/hybrid/search.rs vendored Normal file
View File

@@ -0,0 +1,802 @@
use crate::{
hybrid::{
dfa::{Cache, OverlappingState, DFA},
id::LazyStateID,
},
util::{
prefilter::Prefilter,
search::{HalfMatch, Input, MatchError, Span},
},
};
#[inline(never)]
pub(crate) fn find_fwd(
dfa: &DFA,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
if input.is_done() {
return Ok(None);
}
let pre = if input.get_anchored().is_anchored() {
None
} else {
dfa.get_config().get_prefilter()
};
// So what we do here is specialize four different versions of 'find_fwd':
// one for each of the combinations for 'has prefilter' and 'is earliest
// search'. The reason for doing this is that both of these things require
// branches and special handling in some code that can be very hot,
// and shaving off as much as we can when we don't need it tends to be
// beneficial in ad hoc benchmarks. To see these differences, you often
// need a query with a high match count. In other words, specializing these
// four routines *tends* to help latency more than throughput.
if pre.is_some() {
if input.get_earliest() {
find_fwd_imp(dfa, cache, input, pre, true)
} else {
find_fwd_imp(dfa, cache, input, pre, false)
}
} else {
if input.get_earliest() {
find_fwd_imp(dfa, cache, input, None, true)
} else {
find_fwd_imp(dfa, cache, input, None, false)
}
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_fwd_imp(
dfa: &DFA,
cache: &mut Cache,
input: &Input<'_>,
pre: Option<&'_ Prefilter>,
earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
// See 'prefilter_restart' docs for explanation.
let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty();
let mut mat = None;
let mut sid = init_fwd(dfa, cache, input)?;
let mut at = input.start();
// This could just be a closure, but then I think it would be unsound
// because it would need to be safe to invoke. This way, the lack of safety
// is clearer in the code below.
macro_rules! next_unchecked {
($sid:expr, $at:expr) => {{
let byte = *input.haystack().get_unchecked($at);
dfa.next_state_untagged_unchecked(cache, $sid, byte)
}};
}
if let Some(ref pre) = pre {
let span = Span::from(at..input.end());
match pre.find(input.haystack(), span) {
None => return Ok(mat),
Some(ref span) => {
at = span.start;
if !universal_start {
sid = prefilter_restart(dfa, cache, &input, at)?;
}
}
}
}
cache.search_start(at);
while at < input.end() {
if sid.is_tagged() {
cache.search_update(at);
sid = dfa
.next_state(cache, sid, input.haystack()[at])
.map_err(|_| gave_up(at))?;
} else {
// SAFETY: There are two safety invariants we need to uphold
// here in the loops below: that 'sid' and 'prev_sid' are valid
// state IDs for this DFA, and that 'at' is a valid index into
// 'haystack'. For the former, we rely on the invariant that
// next_state* and start_state_forward always returns a valid state
// ID (given a valid state ID in the former case), and that we are
// only at this place in the code if 'sid' is untagged. Moreover,
// every call to next_state_untagged_unchecked below is guarded by
// a check that sid is untagged. For the latter safety invariant,
// we always guard unchecked access with a check that 'at' is less
// than 'end', where 'end <= haystack.len()'. In the unrolled loop
// below, we ensure that 'at' is always in bounds.
//
// PERF: For justification of omitting bounds checks, it gives us a
// ~10% bump in search time. This was used for a benchmark:
//
// regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile
//
// PERF: For justification for the loop unrolling, we use a few
// different tests:
//
// regex-cli find half hybrid -p '\w{50}' -UBb bigfile
// regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile
// regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile
//
// And there are three different configurations:
//
// nounroll: this entire 'else' block vanishes and we just
// always use 'dfa.next_state(..)'.
// unroll1: just the outer loop below
// unroll2: just the inner loop below
// unroll3: both the outer and inner loops below
//
// This results in a matrix of timings for each of the above
// regexes with each of the above unrolling configurations:
//
// '\w{50}' '(?m)^.+$' 'ZQZQZQZQ'
// nounroll 1.51s 2.34s 1.51s
// unroll1 1.53s 2.32s 1.56s
// unroll2 2.22s 1.50s 0.61s
// unroll3 1.67s 1.45s 0.61s
//
// Ideally we'd be able to find a configuration that yields the
// best time for all regexes, but alas we settle for unroll3 that
// gives us *almost* the best for '\w{50}' and the best for the
// other two regexes.
//
// So what exactly is going on here? The first unrolling (grouping
// together runs of untagged transitions) specifically targets
// our choice of representation. The second unrolling (grouping
// together runs of self-transitions) specifically targets a common
// DFA topology. Let's dig in a little bit by looking at our
// regexes:
//
// '\w{50}': This regex spends a lot of time outside of the DFA's
// start state matching some part of the '\w' repetition. This
// means that it's a bit of a worst case for loop unrolling that
// targets self-transitions since the self-transitions in '\w{50}'
// are not particularly active for this haystack. However, the
// first unrolling (grouping together untagged transitions)
// does apply quite well here since very few transitions hit
// match/dead/quit/unknown states. It is however worth mentioning
// that if start states are configured to be tagged (which you
// typically want to do if you have a prefilter), then this regex
// actually slows way down because it is constantly ping-ponging
// out of the unrolled loop and into the handling of a tagged start
// state below. But when start states aren't tagged, the unrolled
// loop stays hot. (This is why it's imperative that start state
// tagging be disabled when there isn't a prefilter!)
//
// '(?m)^.+$': There are two important aspects of this regex: 1)
// on this haystack, its match count is very high, much higher
// than the other two regex and 2) it spends the vast majority
// of its time matching '.+'. Since Unicode mode is disabled,
// this corresponds to repeatedly following self transitions for
// the vast majority of the input. This does benefit from the
// untagged unrolling since most of the transitions will be to
// untagged states, but the untagged unrolling does more work than
// what is actually required. Namely, it has to keep track of the
// previous and next state IDs, which I guess requires a bit more
// shuffling. This is supported by the fact that nounroll+unroll1
// are both slower than unroll2+unroll3, where the latter has a
// loop unrolling that specifically targets self-transitions.
//
// 'ZQZQZQZQ': This one is very similar to '(?m)^.+$' because it
// spends the vast majority of its time in self-transitions for
// the (implicit) unanchored prefix. The main difference with
// '(?m)^.+$' is that it has a much lower match count. So there
// isn't much time spent in the overhead of reporting matches. This
// is the primary explainer in the perf difference here. We include
// this regex and the former to make sure we have comparison points
// with high and low match counts.
//
// NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'.
//
// NOTE: In a follow-up, it turns out that the "inner" loop
// mentioned above was a pretty big pessimization in some other
// cases. Namely, it resulted in too much ping-ponging into and out
// of the loop, which resulted in nearly ~2x regressions in search
// time when compared to the original lazy DFA in the regex crate.
// So I've removed the second loop unrolling that targets the
// self-transition case.
let mut prev_sid = sid;
while at < input.end() {
prev_sid = unsafe { next_unchecked!(sid, at) };
if prev_sid.is_tagged() || at + 3 >= input.end() {
core::mem::swap(&mut prev_sid, &mut sid);
break;
}
at += 1;
sid = unsafe { next_unchecked!(prev_sid, at) };
if sid.is_tagged() {
break;
}
at += 1;
prev_sid = unsafe { next_unchecked!(sid, at) };
if prev_sid.is_tagged() {
core::mem::swap(&mut prev_sid, &mut sid);
break;
}
at += 1;
sid = unsafe { next_unchecked!(prev_sid, at) };
if sid.is_tagged() {
break;
}
at += 1;
}
// If we quit out of the code above with an unknown state ID at
// any point, then we need to re-compute that transition using
// 'next_state', which will do NFA powerset construction for us.
if sid.is_unknown() {
cache.search_update(at);
sid = dfa
.next_state(cache, prev_sid, input.haystack()[at])
.map_err(|_| gave_up(at))?;
}
}
if sid.is_tagged() {
if sid.is_start() {
if let Some(ref pre) = pre {
let span = Span::from(at..input.end());
match pre.find(input.haystack(), span) {
None => {
cache.search_finish(span.end);
return Ok(mat);
}
Some(ref span) => {
// We want to skip any update to 'at' below
// at the end of this iteration and just
// jump immediately back to the next state
// transition at the leading position of the
// candidate match.
//
// ... but only if we actually made progress
// with our prefilter, otherwise if the start
// state has a self-loop, we can get stuck.
if span.start > at {
at = span.start;
if !universal_start {
sid = prefilter_restart(
dfa, cache, &input, at,
)?;
}
continue;
}
}
}
}
} else if sid.is_match() {
let pattern = dfa.match_pattern(cache, sid, 0);
// Since slice ranges are inclusive at the beginning and
// exclusive at the end, and since forward searches report
// the end, we can return 'at' as-is. This only works because
// matches are delayed by 1 byte. So by the time we observe a
// match, 'at' has already been set to 1 byte past the actual
// match location, which is precisely the exclusive ending
// bound of the match.
mat = Some(HalfMatch::new(pattern, at));
if earliest {
cache.search_finish(at);
return Ok(mat);
}
} else if sid.is_dead() {
cache.search_finish(at);
return Ok(mat);
} else if sid.is_quit() {
cache.search_finish(at);
return Err(MatchError::quit(input.haystack()[at], at));
} else {
debug_assert!(sid.is_unknown());
unreachable!("sid being unknown is a bug");
}
}
at += 1;
}
eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?;
cache.search_finish(input.end());
Ok(mat)
}
#[inline(never)]
pub(crate) fn find_rev(
dfa: &DFA,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
if input.is_done() {
return Ok(None);
}
if input.get_earliest() {
find_rev_imp(dfa, cache, input, true)
} else {
find_rev_imp(dfa, cache, input, false)
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_rev_imp(
dfa: &DFA,
cache: &mut Cache,
input: &Input<'_>,
earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
let mut mat = None;
let mut sid = init_rev(dfa, cache, input)?;
// In reverse search, the loop below can't handle the case of searching an
// empty slice. Ideally we could write something congruent to the forward
// search, i.e., 'while at >= start', but 'start' might be 0. Since we use
// an unsigned offset, 'at >= 0' is trivially always true. We could avoid
// this extra case handling by using a signed offset, but Rust makes it
// annoying to do. So... We just handle the empty case separately.
if input.start() == input.end() {
eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
return Ok(mat);
}
let mut at = input.end() - 1;
macro_rules! next_unchecked {
($sid:expr, $at:expr) => {{
let byte = *input.haystack().get_unchecked($at);
dfa.next_state_untagged_unchecked(cache, $sid, byte)
}};
}
cache.search_start(at);
loop {
if sid.is_tagged() {
cache.search_update(at);
sid = dfa
.next_state(cache, sid, input.haystack()[at])
.map_err(|_| gave_up(at))?;
} else {
// SAFETY: See comments in 'find_fwd' for a safety argument.
//
// PERF: The comments in 'find_fwd' also provide a justification
// from a performance perspective as to 1) why we elide bounds
// checks and 2) why we do a specialized version of unrolling
// below. The reverse search does have a slightly different
// consideration in that most reverse searches tend to be
// anchored and on shorter haystacks. However, this still makes a
// difference. Take this command for example:
//
// regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile
//
// (Notice that we use 'find hybrid regex', not 'find hybrid dfa'
// like in the justification for the forward direction. The 'regex'
// sub-command will find start-of-match and thus run the reverse
// direction.)
//
// Without unrolling below, the above command takes around 3.76s.
// But with the unrolling below, we get down to 2.55s. If we keep
// the unrolling but add in bounds checks, then we get 2.86s.
//
// NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'.
let mut prev_sid = sid;
while at >= input.start() {
prev_sid = unsafe { next_unchecked!(sid, at) };
if prev_sid.is_tagged()
|| at <= input.start().saturating_add(3)
{
core::mem::swap(&mut prev_sid, &mut sid);
break;
}
at -= 1;
sid = unsafe { next_unchecked!(prev_sid, at) };
if sid.is_tagged() {
break;
}
at -= 1;
prev_sid = unsafe { next_unchecked!(sid, at) };
if prev_sid.is_tagged() {
core::mem::swap(&mut prev_sid, &mut sid);
break;
}
at -= 1;
sid = unsafe { next_unchecked!(prev_sid, at) };
if sid.is_tagged() {
break;
}
at -= 1;
}
// If we quit out of the code above with an unknown state ID at
// any point, then we need to re-compute that transition using
// 'next_state', which will do NFA powerset construction for us.
if sid.is_unknown() {
cache.search_update(at);
sid = dfa
.next_state(cache, prev_sid, input.haystack()[at])
.map_err(|_| gave_up(at))?;
}
}
if sid.is_tagged() {
if sid.is_start() {
// do nothing
} else if sid.is_match() {
let pattern = dfa.match_pattern(cache, sid, 0);
// Since matches are delayed by 1 byte, 'at' points to the
// position just before the match's actual (inclusive) starting
// offset. So we add 1 to report the correct start of the match.
mat = Some(HalfMatch::new(pattern, at + 1));
if earliest {
cache.search_finish(at);
return Ok(mat);
}
} else if sid.is_dead() {
cache.search_finish(at);
return Ok(mat);
} else if sid.is_quit() {
cache.search_finish(at);
return Err(MatchError::quit(input.haystack()[at], at));
} else {
debug_assert!(sid.is_unknown());
unreachable!("sid being unknown is a bug");
}
}
if at == input.start() {
break;
}
at -= 1;
}
cache.search_finish(input.start());
eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
Ok(mat)
}
#[inline(never)]
pub(crate) fn find_overlapping_fwd(
dfa: &DFA,
cache: &mut Cache,
input: &Input<'_>,
state: &mut OverlappingState,
) -> Result<(), MatchError> {
state.mat = None;
if input.is_done() {
return Ok(());
}
let pre = if input.get_anchored().is_anchored() {
None
} else {
dfa.get_config().get_prefilter()
};
if pre.is_some() {
find_overlapping_fwd_imp(dfa, cache, input, pre, state)
} else {
find_overlapping_fwd_imp(dfa, cache, input, None, state)
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_overlapping_fwd_imp(
dfa: &DFA,
cache: &mut Cache,
input: &Input<'_>,
pre: Option<&'_ Prefilter>,
state: &mut OverlappingState,
) -> Result<(), MatchError> {
// See 'prefilter_restart' docs for explanation.
let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty();
let mut sid = match state.id {
None => {
state.at = input.start();
init_fwd(dfa, cache, input)?
}
Some(sid) => {
if let Some(match_index) = state.next_match_index {
let match_len = dfa.match_len(cache, sid);
if match_index < match_len {
state.next_match_index = Some(match_index + 1);
let pattern = dfa.match_pattern(cache, sid, match_index);
state.mat = Some(HalfMatch::new(pattern, state.at));
return Ok(());
}
}
// Once we've reported all matches at a given position, we need to
// advance the search to the next position.
state.at += 1;
if state.at > input.end() {
return Ok(());
}
sid
}
};
// NOTE: We don't optimize the crap out of this routine primarily because
// it seems like most overlapping searches will have higher match counts,
// and thus, throughput is perhaps not as important. But if you have a use
// case for something faster, feel free to file an issue.
cache.search_start(state.at);
while state.at < input.end() {
sid = dfa
.next_state(cache, sid, input.haystack()[state.at])
.map_err(|_| gave_up(state.at))?;
if sid.is_tagged() {
state.id = Some(sid);
if sid.is_start() {
if let Some(ref pre) = pre {
let span = Span::from(state.at..input.end());
match pre.find(input.haystack(), span) {
None => return Ok(()),
Some(ref span) => {
if span.start > state.at {
state.at = span.start;
if !universal_start {
sid = prefilter_restart(
dfa, cache, &input, state.at,
)?;
}
continue;
}
}
}
}
} else if sid.is_match() {
state.next_match_index = Some(1);
let pattern = dfa.match_pattern(cache, sid, 0);
state.mat = Some(HalfMatch::new(pattern, state.at));
cache.search_finish(state.at);
return Ok(());
} else if sid.is_dead() {
cache.search_finish(state.at);
return Ok(());
} else if sid.is_quit() {
cache.search_finish(state.at);
return Err(MatchError::quit(
input.haystack()[state.at],
state.at,
));
} else {
debug_assert!(sid.is_unknown());
unreachable!("sid being unknown is a bug");
}
}
state.at += 1;
cache.search_update(state.at);
}
let result = eoi_fwd(dfa, cache, input, &mut sid, &mut state.mat);
state.id = Some(sid);
if state.mat.is_some() {
// '1' is always correct here since if we get to this point, this
// always corresponds to the first (index '0') match discovered at
// this position. So the next match to report at this position (if
// it exists) is at index '1'.
state.next_match_index = Some(1);
}
cache.search_finish(input.end());
result
}
#[inline(never)]
pub(crate) fn find_overlapping_rev(
dfa: &DFA,
cache: &mut Cache,
input: &Input<'_>,
state: &mut OverlappingState,
) -> Result<(), MatchError> {
state.mat = None;
if input.is_done() {
return Ok(());
}
let mut sid = match state.id {
None => {
let sid = init_rev(dfa, cache, input)?;
state.id = Some(sid);
if input.start() == input.end() {
state.rev_eoi = true;
} else {
state.at = input.end() - 1;
}
sid
}
Some(sid) => {
if let Some(match_index) = state.next_match_index {
let match_len = dfa.match_len(cache, sid);
if match_index < match_len {
state.next_match_index = Some(match_index + 1);
let pattern = dfa.match_pattern(cache, sid, match_index);
state.mat = Some(HalfMatch::new(pattern, state.at));
return Ok(());
}
}
// Once we've reported all matches at a given position, we need
// to advance the search to the next position. However, if we've
// already followed the EOI transition, then we know we're done
// with the search and there cannot be any more matches to report.
if state.rev_eoi {
return Ok(());
} else if state.at == input.start() {
// At this point, we should follow the EOI transition. This
// will cause us to skip the main loop below and fall through
// to the final 'eoi_rev' transition.
state.rev_eoi = true;
} else {
// We haven't hit the end of the search yet, so move on.
state.at -= 1;
}
sid
}
};
cache.search_start(state.at);
while !state.rev_eoi {
sid = dfa
.next_state(cache, sid, input.haystack()[state.at])
.map_err(|_| gave_up(state.at))?;
if sid.is_tagged() {
state.id = Some(sid);
if sid.is_start() {
// do nothing
} else if sid.is_match() {
state.next_match_index = Some(1);
let pattern = dfa.match_pattern(cache, sid, 0);
state.mat = Some(HalfMatch::new(pattern, state.at + 1));
cache.search_finish(state.at);
return Ok(());
} else if sid.is_dead() {
cache.search_finish(state.at);
return Ok(());
} else if sid.is_quit() {
cache.search_finish(state.at);
return Err(MatchError::quit(
input.haystack()[state.at],
state.at,
));
} else {
debug_assert!(sid.is_unknown());
unreachable!("sid being unknown is a bug");
}
}
if state.at == input.start() {
break;
}
state.at -= 1;
cache.search_update(state.at);
}
let result = eoi_rev(dfa, cache, input, &mut sid, &mut state.mat);
state.rev_eoi = true;
state.id = Some(sid);
if state.mat.is_some() {
// '1' is always correct here since if we get to this point, this
// always corresponds to the first (index '0') match discovered at
// this position. So the next match to report at this position (if
// it exists) is at index '1'.
state.next_match_index = Some(1);
}
cache.search_finish(input.start());
result
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_fwd(
dfa: &DFA,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<LazyStateID, MatchError> {
let sid = dfa.start_state_forward(cache, input)?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
debug_assert!(!sid.is_match());
Ok(sid)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_rev(
dfa: &DFA,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<LazyStateID, MatchError> {
let sid = dfa.start_state_reverse(cache, input)?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
debug_assert!(!sid.is_match());
Ok(sid)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_fwd(
dfa: &DFA,
cache: &mut Cache,
input: &Input<'_>,
sid: &mut LazyStateID,
mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
let sp = input.get_span();
match input.haystack().get(sp.end) {
Some(&b) => {
*sid =
dfa.next_state(cache, *sid, b).map_err(|_| gave_up(sp.end))?;
if sid.is_match() {
let pattern = dfa.match_pattern(cache, *sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.end));
} else if sid.is_quit() {
return Err(MatchError::quit(b, sp.end));
}
}
None => {
*sid = dfa
.next_eoi_state(cache, *sid)
.map_err(|_| gave_up(input.haystack().len()))?;
if sid.is_match() {
let pattern = dfa.match_pattern(cache, *sid, 0);
*mat = Some(HalfMatch::new(pattern, input.haystack().len()));
}
// N.B. We don't have to check 'is_quit' here because the EOI
// transition can never lead to a quit state.
debug_assert!(!sid.is_quit());
}
}
Ok(())
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_rev(
dfa: &DFA,
cache: &mut Cache,
input: &Input<'_>,
sid: &mut LazyStateID,
mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
let sp = input.get_span();
if sp.start > 0 {
let byte = input.haystack()[sp.start - 1];
*sid = dfa
.next_state(cache, *sid, byte)
.map_err(|_| gave_up(sp.start))?;
if sid.is_match() {
let pattern = dfa.match_pattern(cache, *sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.start));
} else if sid.is_quit() {
return Err(MatchError::quit(byte, sp.start - 1));
}
} else {
*sid =
dfa.next_eoi_state(cache, *sid).map_err(|_| gave_up(sp.start))?;
if sid.is_match() {
let pattern = dfa.match_pattern(cache, *sid, 0);
*mat = Some(HalfMatch::new(pattern, 0));
}
// N.B. We don't have to check 'is_quit' here because the EOI
// transition can never lead to a quit state.
debug_assert!(!sid.is_quit());
}
Ok(())
}
/// Re-compute the starting state that a DFA should be in after finding a
/// prefilter candidate match at the position `at`.
///
/// It is always correct to call this, but not always necessary. Namely,
/// whenever the DFA has a universal start state, the DFA can remain in the
/// start state that it was in when it ran the prefilter. Why? Because in that
/// case, there is only one start state.
///
/// When does a DFA have a universal start state? Precisely in cases where
/// it has no look-around assertions in its prefix. So for example, `\bfoo`
/// does not have a universal start state because the start state depends on
/// whether the byte immediately before the start position is a word byte or
/// not. However, `foo\b` does have a universal start state because the word
/// boundary does not appear in the pattern's prefix.
///
/// So... most cases don't need this, but when a pattern doesn't have a
/// universal start state, then after a prefilter candidate has been found, the
/// current state *must* be re-litigated as if computing the start state at the
/// beginning of the search because it might change. That is, not all start
/// states are created equal.
///
/// Why avoid it? Because while it's not super expensive, it isn't a trivial
/// operation to compute the start state. It is much better to avoid it and
/// just stay in the current state if you know it to be correct.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn prefilter_restart(
dfa: &DFA,
cache: &mut Cache,
input: &Input<'_>,
at: usize,
) -> Result<LazyStateID, MatchError> {
let mut input = input.clone();
input.set_start(at);
init_fwd(dfa, cache, &input)
}
/// A convenience routine for constructing a "gave up" match error.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn gave_up(offset: usize) -> MatchError {
MatchError::gave_up(offset)
}

652
vendor/regex-automata/src/lib.rs vendored Normal file

@@ -0,0 +1,652 @@
/*!
This crate exposes a variety of regex engines used by the `regex` crate.
It provides a vast, sprawling and "expert" level API to each regex engine.
The regex engines provided by this crate focus heavily on finite automata
implementations and specifically guarantee worst case `O(m * n)` time
complexity for all searches. (Where `m ~ len(regex)` and `n ~ len(haystack)`.)
The primary goal of this crate is to serve as an implementation detail for the
`regex` crate. A secondary goal is to make its internals available for use by
others.
# Table of contents
* [Should I be using this crate?](#should-i-be-using-this-crate) gives some
reasons for and against using this crate.
* [Examples](#examples) provides a small selection of things you can do with
this crate.
* [Available regex engines](#available-regex-engines) provides a hyperlinked
list of all regex engines in this crate.
* [API themes](#api-themes) discusses common elements used throughout this
crate.
* [Crate features](#crate-features) documents the extensive list of Cargo
features available.
# Should I be using this crate?
If you find yourself here because you just want to use regexes, then you should
first check out whether the [`regex` crate](https://docs.rs/regex) meets
your needs. It provides a streamlined and difficult-to-misuse API for regex
searching.
If you're here because there is something specific you want to do that can't
be easily done with the `regex` crate, then you are perhaps in the right place.
It's most likely that the first stop you'll want to make is to explore the
[`meta` regex APIs](meta). Namely, the `regex` crate is just a light wrapper
over a [`meta::Regex`], so its API will probably be the easiest to transition
to. In contrast to the `regex` crate, the `meta::Regex` API supports more
search parameters and does multi-pattern searches. However, it isn't quite as
ergonomic.
Otherwise, the following is an inexhaustive list of reasons to use this crate:
* You want to analyze or use a [Thompson `NFA`](nfa::thompson::NFA) directly.
* You want more powerful multi-pattern search than what is provided by
`RegexSet` in the `regex` crate. All regex engines in this crate support
multi-pattern searches.
* You want to use one of the `regex` crate's internal engines directly because
of some interesting configuration that isn't possible via the `regex` crate.
For example, a [lazy DFA's configuration](hybrid::dfa::Config) exposes a
dizzying number of options for controlling its execution.
* You want to use the lower level search APIs. For example, both the [lazy
DFA](hybrid::dfa) and [fully compiled DFAs](dfa) support searching by exploring
the automaton one state at a time. This might be useful, for example, for
stream searches or searches of strings stored non-contiguously in memory.
* You want to build a fully compiled DFA and then [use zero-copy
deserialization](dfa::dense::DFA::from_bytes) to load it into memory and use
it for searching. This use case is supported in core-only no-std/no-alloc
environments.
* You want to run [anchored searches](Input::anchored) without using the `^`
anchor in your regex pattern.
* You need to work around contention issues with
sharing a regex across multiple threads. The
[`meta::Regex::search_with`](meta::Regex::search_with) API permits bypassing
any kind of synchronization at all by requiring the caller to provide the
mutable scratch space needed during a search.
* You want to build your own regex engine on top of the `regex` crate's
infrastructure.
# Examples
This section tries to identify a few interesting things you can do with this
crate and demonstrates them.
### Multi-pattern searches with capture groups
One of the more frustrating limitations of `RegexSet` in the `regex` crate
(at the time of writing) is that it doesn't report match positions. With this
crate, multi-pattern support was intentionally designed in from the beginning,
which means it works in all regex engines and even for capture groups as well.
This example shows how to search for matches of multiple regexes, where each
regex uses the same capture group names to parse different key-value formats.
```
use regex_automata::{meta::Regex, PatternID};
let re = Regex::new_many(&[
r#"(?m)^(?<key>[[:word:]]+)=(?<val>[[:word:]]+)$"#,
r#"(?m)^(?<key>[[:word:]]+)="(?<val>[^"]+)"$"#,
r#"(?m)^(?<key>[[:word:]]+)='(?<val>[^']+)'$"#,
r#"(?m)^(?<key>[[:word:]]+):\s*(?<val>[[:word:]]+)$"#,
])?;
let hay = r#"
best_album="Blow Your Face Out"
best_quote='"then as it was, then again it will be"'
best_year=1973
best_simpsons_episode: HOMR
"#;
let mut kvs = vec![];
for caps in re.captures_iter(hay) {
// N.B. One could use capture indices '1' and '2' here
// as well. Capture indices are local to each pattern.
// (Just like names are.)
let key = &hay[caps.get_group_by_name("key").unwrap()];
let val = &hay[caps.get_group_by_name("val").unwrap()];
kvs.push((key, val));
}
assert_eq!(kvs, vec![
("best_album", "Blow Your Face Out"),
("best_quote", "\"then as it was, then again it will be\""),
("best_year", "1973"),
("best_simpsons_episode", "HOMR"),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
### Build a full DFA and walk it manually
One of the regex engines in this crate is a fully compiled DFA. It takes worst
case exponential time to build, but once built, it can be easily explored and
used for searches. Here's a simple example that uses its lower level APIs to
implement a simple anchored search by hand.
```
use regex_automata::{dfa::{Automaton, dense}, Input};
let dfa = dense::DFA::new(r"(?-u)\b[A-Z]\w+z\b")?;
let haystack = "Quartz";
// The start state is determined by inspecting the position and the
// initial bytes of the haystack.
let mut state = dfa.start_state_forward(&Input::new(haystack))?;
// Walk all the bytes in the haystack.
for &b in haystack.as_bytes().iter() {
state = dfa.next_state(state, b);
}
// DFAs in this crate require an explicit
// end-of-input transition if a search reaches
// the end of a haystack.
state = dfa.next_eoi_state(state);
assert!(dfa.is_match_state(state));
# Ok::<(), Box<dyn std::error::Error>>(())
```
Or do the same with a lazy DFA that avoids exponential worst case compile time,
but requires mutable scratch space to lazily build the DFA during the search.
```
use regex_automata::{hybrid::dfa::DFA, Input};
let dfa = DFA::new(r"(?-u)\b[A-Z]\w+z\b")?;
let mut cache = dfa.create_cache();
let hay = "Quartz";
// The start state is determined by inspecting the position and the
// initial bytes of the haystack.
let mut state = dfa.start_state_forward(&mut cache, &Input::new(hay))?;
// Walk all the bytes in the haystack.
for &b in hay.as_bytes().iter() {
state = dfa.next_state(&mut cache, state, b)?;
}
// DFAs in this crate require an explicit
// end-of-input transition if a search reaches
// the end of a haystack.
state = dfa.next_eoi_state(&mut cache, state)?;
assert!(state.is_match());
# Ok::<(), Box<dyn std::error::Error>>(())
```
### Find all overlapping matches
This example shows how to build a DFA and use it to find all possible matches,
including overlapping matches. A similar example will work with a lazy DFA as
well. This also works with multiple patterns and will report all matches at the
same position where multiple patterns match.
```
use regex_automata::{
dfa::{dense, Automaton, OverlappingState},
Input, MatchKind,
};
let dfa = dense::DFA::builder()
.configure(dense::DFA::config().match_kind(MatchKind::All))
.build(r"(?-u)\w{3,}")?;
let input = Input::new("homer marge bart lisa maggie");
let mut state = OverlappingState::start();
let mut matches = vec![];
while let Some(hm) = {
dfa.try_search_overlapping_fwd(&input, &mut state)?;
state.get_match()
} {
matches.push(hm.offset());
}
assert_eq!(matches, vec![
3, 4, 5, // hom, home, homer
9, 10, 11, // mar, marg, marge
15, 16, // bar, bart
20, 21, // lis, lisa
25, 26, 27, 28, // mag, magg, maggi, maggie
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
# Available regex engines
The following is a complete list of all regex engines provided by this crate,
along with a very brief description of it and why you might want to use it.
* [`dfa::regex::Regex`] is a regex engine that works on top of either
[dense](dfa::dense) or [sparse](dfa::sparse) fully compiled DFAs. You might
use a DFA if you need the fastest possible regex engine in this crate and can
afford the exorbitant memory usage usually required by DFAs. Low level APIs on
fully compiled DFAs are provided by the [`Automaton` trait](dfa::Automaton).
Fully compiled dense DFAs can handle all regexes except for searching a regex
with a Unicode word boundary on non-ASCII haystacks. A fully compiled DFA based
regex can only report the start and end of each match.
* [`hybrid::regex::Regex`] is a regex engine that works on top of a lazily
built DFA. Its performance profile is very similar to that of fully compiled
DFAs, but can be slower in some pathological cases. Fully compiled DFAs are
also amenable to more optimizations, such as state acceleration, that aren't
available in a lazy DFA. You might use this lazy DFA if you can't abide the
worst case exponential compile time of a full DFA, but still want the DFA
search performance in the vast majority of cases. A lazy DFA based regex can
only report the start and end of each match.
* [`dfa::onepass::DFA`] is a regex engine that is implemented as a DFA, but
can report the matches of each capture group in addition to the start and end
of each match. The catch is that it only works on a somewhat small subset of
regexes known as "one-pass." You'll want to use this for cases when you need
capture group matches and the regex is one-pass since it is likely to be faster
than any alternative. A one-pass DFA can handle all types of regexes, but does
have some reasonable limits on the number of capture groups it can handle.
* [`nfa::thompson::backtrack::BoundedBacktracker`] is a regex engine that uses
backtracking, but keeps track of the work it has done to avoid catastrophic
backtracking. Like the one-pass DFA, it provides the matches of each capture
group. It retains the `O(m * n)` worst case time bound. This tends to be slower
than the one-pass DFA regex engine, but faster than the PikeVM. It can handle
all types of regexes, but usually only works well with small haystacks and
small regexes due to the memory required to avoid redoing work.
* [`nfa::thompson::pikevm::PikeVM`] is a regex engine that can handle all
regexes, of all sizes, and provides capture group matches. It tends to be a
tool of last resort because it is also usually the slowest regex engine. (A
brief sketch of its use appears after this list.)
* [`meta::Regex`] is the meta regex engine that combines *all* of the above
engines into one. The reason for this is that each of the engines above have
their own caveats such as, "only handles a subset of regexes" or "is generally
slow." The meta regex engine accounts for all of these caveats and composes
the engines in a way that attempts to mitigate each engine's weaknesses while
emphasizing its strengths. For example, it will attempt to run a lazy DFA even
if it might fail. In which case, it will restart the search with a likely
slower but more capable regex engine. The meta regex engine is what you should
default to. Use one of the above engines directly only if you have a specific
reason to.
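To give a rough feel for what the lower level engines look like in use, here
is a small sketch using the PikeVM directly. (The pattern and haystack are
invented for illustration.)

```
use regex_automata::nfa::thompson::pikevm::PikeVM;

let re = PikeVM::new(r"(?<y>[0-9]{4})-(?<m>[0-9]{2})")?;
// The PikeVM needs mutable scratch space and a place to write capture
// group offsets for each search.
let mut cache = re.create_cache();
let mut caps = re.create_captures();
let hay = "released 2023-04";
re.captures(&mut cache, hay, &mut caps);
assert!(caps.is_match());
assert_eq!("2023", &hay[caps.get_group_by_name("y").unwrap()]);
# Ok::<(), Box<dyn std::error::Error>>(())
```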
# API themes
While each regex engine has its own APIs and configuration options, there are
some general themes followed by all of them.
### The `Input` abstraction
Most search routines in this crate accept anything that implements
`Into<Input>`. Both `&str` and `&[u8]` haystacks satisfy this constraint, which
means that things like `engine.search("foo")` will work as you would expect.
By virtue of accepting an `Into<Input>` though, callers can provide more than
just a haystack. Indeed, the [`Input`] type has more details, but briefly,
callers can use it to configure various aspects of the search:
* The span of the haystack to search via [`Input::span`] or [`Input::range`],
which might be a substring of the haystack.
* Whether to run an anchored search or not via [`Input::anchored`]. This
permits one to require matches to start at the same offset that the search
started.
* Whether to ask the regex engine to stop as soon as a match is seen via
[`Input::earliest`]. This can be used to find the offset of a match as soon
as it is known without waiting for the full leftmost-first match to be found.
This can also be used to avoid the worst case `O(m * n^2)` time complexity
of iteration.
Some lower level search routines accept an `&Input` for performance reasons.
In which case, `&Input::new("haystack")` can be used for a simple search.
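For example, here is a small sketch (with an invented pattern and haystack)
that restricts the search to a sub-range of the haystack and requires the
match to start exactly where the search starts:

```
use regex_automata::{meta::Regex, Anchored, Input, Match};

let re = Regex::new(r"\w+")?;
let hay = "foo bar";
// Only search 'bar' and require the match to begin at offset 4.
let input = Input::new(hay).range(4..).anchored(Anchored::Yes);
assert_eq!(Some(Match::must(0, 4..7)), re.search(&input));
# Ok::<(), Box<dyn std::error::Error>>(())
```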
### Error reporting
Most, but not all, regex engines in this crate can fail to execute a search.
When a search fails, callers cannot determine whether or not a match exists.
That is, the result is indeterminate.
Search failure, in all cases in this crate, is represented by a [`MatchError`].
Routines that can fail start with the `try_` prefix in their name. For example,
[`hybrid::regex::Regex::try_search`] can fail for a number of reasons.
Conversely, routines that either can't fail or can panic on failure lack the
`try_` prefix. For example, [`hybrid::regex::Regex::find`] will panic in
cases where [`hybrid::regex::Regex::try_search`] would return an error, and
[`meta::Regex::find`] will never panic. Therefore, callers need to pay close
attention to the panicking conditions in the documentation.
In most cases, the reasons that a search fails are either predictable or
configurable, albeit at some additional cost.
An example of predictable failure is
[`BoundedBacktracker::try_search`](nfa::thompson::backtrack::BoundedBacktracker::try_search).
Namely, it fails whenever the multiplication of the haystack, the regex and some
constant exceeds the
[configured visited capacity](nfa::thompson::backtrack::Config::visited_capacity).
Callers can predict the failure in terms of haystack length via the
[`BoundedBacktracker::max_haystack_len`](nfa::thompson::backtrack::BoundedBacktracker::max_haystack_len)
method. While this form of failure is technically avoidable by increasing the
visited capacity, it isn't practical to do so for all inputs because the
memory usage required for larger haystacks becomes impractically large. So in
practice, if one is using the bounded backtracker, you really do have to deal
with the failure.
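As a sketch of what that looks like in practice (the pattern here is
arbitrary), a haystack longer than the reported maximum makes the fallible
search routines return an error:

```
use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;

let re = BoundedBacktracker::new(r"\w+")?;
let mut cache = re.create_cache();
// Searching a haystack longer than this results in an error rather than
// a panic or a silently wrong answer.
let max = re.max_haystack_len();
let hay = "a".repeat(max + 1);
assert!(re.try_find(&mut cache, hay.as_str()).is_err());
# Ok::<(), Box<dyn std::error::Error>>(())
```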
An example of configurable failure happens when one enables heuristic support
for Unicode word boundaries in a DFA. Namely, since the DFAs in this crate
(except for the one-pass DFA) do not support Unicode word boundaries on
non-ASCII haystacks, building a DFA from an NFA that contains a Unicode word
boundary will itself fail. However, one can configure DFAs to still be built in
this case by
[configuring heuristic support for Unicode word boundaries](hybrid::dfa::Config::unicode_word_boundary).
If the NFA the DFA is built from contains a Unicode word boundary, then the
DFA will still be built, but special transitions will be added to every state
that cause the DFA to fail if any non-ASCII byte is seen. This failure happens
at search time and it requires the caller to opt into this.
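Here is a sketch of that heuristic in action with the lazy DFA. (The pattern
and haystacks are illustrative.)

```
use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};

let dfa = DFA::builder()
    .configure(DFA::config().unicode_word_boundary(true))
    .build(r"\b\w+\b")?;
let mut cache = dfa.create_cache();
// ASCII haystacks are searched as usual...
let expected = Some(HalfMatch::must(0, 3));
assert_eq!(expected, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?);
// ...but seeing a non-ASCII byte makes the search quit with an error.
assert!(dfa.try_search_fwd(&mut cache, &Input::new("β")).is_err());
# Ok::<(), Box<dyn std::error::Error>>(())
```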
There are other ways for regex engines to fail in this crate, but the above
two should represent the general theme of failures one can find. Dealing
with these failures is, in part, one of the responsibilities of the [meta regex
engine](meta). Notice, for example, that the meta regex engine exposes an API
that never returns an error nor panics. It carefully manages all of the ways
in which the regex engines can fail and either avoids the predictable ones
entirely (e.g., the bounded backtracker) or reacts to configured failures by
falling back to a different engine (e.g., the lazy DFA quitting because it saw
a non-ASCII byte).
### Configuration and Builders
Most of the regex engines in this crate come with two types to facilitate
building the regex engine: a `Config` and a `Builder`. A `Config` is usually
specific to that particular regex engine, but other objects such as parsing and
NFA compilation have `Config` types too. A `Builder` is the thing responsible
for taking inputs (either pattern strings or already-parsed patterns or even
NFAs directly) and turning them into an actual regex engine that can be used
for searching.
The main reason why building a regex engine is a bit complicated is because
of the desire to permit composition with de-coupled components. For example,
you might want to [manually construct a Thompson NFA](nfa::thompson::Builder)
and then build a regex engine from it without ever using a regex parser
at all. On the other hand, you might also want to build a regex engine directly
from the concrete syntax. This demonstrates why regex engine construction is
so flexible: it needs to support not just convenient construction, but also
construction from parts built elsewhere.
This is also in turn why there are many different `Config` structs in this
crate. Let's look more closely at an example: [`hybrid::regex::Builder`]. It
accepts three different `Config` types for configuring construction of a lazy
DFA regex:
* [`hybrid::regex::Builder::syntax`] accepts a
[`util::syntax::Config`] for configuring the options found in the
[`regex-syntax`](regex_syntax) crate. For example, whether to match
case insensitively.
* [`hybrid::regex::Builder::thompson`] accepts a [`nfa::thompson::Config`] for
configuring construction of a [Thompson NFA](nfa::thompson::NFA). For example,
whether to build an NFA that matches the reverse language described by the
regex.
* [`hybrid::regex::Builder::dfa`] accepts a [`hybrid::dfa::Config`] for
configuring construction of the pair of underlying lazy DFAs that make up the
lazy DFA regex engine. For example, changing the capacity of the cache used to
store the transition table.
The lazy DFA regex engine uses all three of those configuration objects for
methods like [`hybrid::regex::Builder::build`], which accepts a pattern
string containing the concrete syntax of your regex. It uses the syntax
configuration to parse it into an AST and translate it into an HIR. Then the
NFA configuration when compiling the HIR into an NFA. And then finally the DFA
configuration when lazily determinizing the NFA into a DFA.
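For example, here is a sketch (the specific option values are arbitrary) that
threads all three configurations through the builder:

```
use regex_automata::{
    hybrid::{dfa, regex::Regex},
    nfa::thompson,
    util::syntax,
};

let re = Regex::builder()
    // Parse the pattern case insensitively.
    .syntax(syntax::Config::new().case_insensitive(true))
    // Cap the size of the compiled Thompson NFA.
    .thompson(thompson::Config::new().nfa_size_limit(Some(10 * (1 << 20))))
    // Shrink the lazy DFA's transition table cache.
    .dfa(dfa::Config::new().cache_capacity(64 * (1 << 10)))
    .build(r"foo[0-9]+bar")?;
let mut cache = re.create_cache();
assert!(re.is_match(&mut cache, "FOO123bar"));
# Ok::<(), Box<dyn std::error::Error>>(())
```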
Notice though that the builder also has a
[`hybrid::regex::Builder::build_from_dfas`] constructor. This permits callers
to build the underlying pair of lazy DFAs themselves (one for the forward
searching to find the end of a match and one for the reverse searching to find
the start of a match), and then build the regex engine from them. The lazy
DFAs, in turn, have their own builder that permits [construction directly from
a Thompson NFA](hybrid::dfa::Builder::build_from_nfa). Continuing down the
rabbit hole, a Thompson NFA has its own compiler that permits [construction
directly from an HIR](nfa::thompson::Compiler::build_from_hir). The lazy DFA
regex engine builder lets you follow this rabbit hole all the way down, but
also provides convenience routines that do it for you when you don't need
precise control over every component.
The [meta regex engine](meta) is a good example of something that utilizes the
full flexibility of these builders. It often needs not only precise control
over each component, but also shares them across multiple regex engines.
(Most sharing is done by internal reference accounting. For example, an
[`NFA`](nfa::thompson::NFA) is reference counted internally which makes cloning
cheap.)
### Size limits
Unlike the `regex` crate, the `regex-automata` crate specifically does not
enable any size limits by default. That means users of this crate need to
be quite careful when using untrusted patterns. Namely, because bounded
repetitions can grow exponentially by stacking them, it is possible to build a
very large internal regex object from just a small pattern string. For example,
the NFA built from the pattern `a{10}{10}{10}{10}{10}{10}{10}` is over 240MB.
There are multiple size limit options in this crate. If one or more size limits
are relevant for the object you're building, they will be configurable via
methods on a corresponding `Config` type.
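For example, here is a sketch (the limit chosen is arbitrary) of capping the
size of the Thompson NFA when building a meta regex, so that the pattern above
fails to build instead of consuming an enormous amount of memory:

```
use regex_automata::meta::Regex;

let result = Regex::builder()
    .configure(Regex::config().nfa_size_limit(Some(1 << 20)))
    .build(r"a{10}{10}{10}{10}{10}{10}{10}");
assert!(result.is_err());
```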
# Crate features
This crate has a dizzying number of features. The main idea is to be able to
control how much stuff you pull in for your specific use case, since the full
crate is quite large and can dramatically increase compile times and binary
size.
The most barebones but useful configuration is to disable all default features
and enable only `dfa-search`. This will bring in just the DFA deserialization
and search routines without any dependency on `std` or `alloc`. This does
require generating and serializing a DFA, and then storing it somewhere, but
it permits regex searches in freestanding or embedded environments.
Because there are so many features, they are split into a few groups.
The default set of features is: `std`, `syntax`, `perf`, `unicode`, `meta`,
`nfa`, `dfa` and `hybrid`. Basically, the default is to enable everything
except for development related features like `logging`.
### Ecosystem features
* **std** - Enables use of the standard library. In terms of APIs, this usually
just means that error types implement the `std::error::Error` trait. Otherwise,
`std` sometimes enables the code to be faster, for example, using a `HashMap`
instead of a `BTreeMap`. (The `std` feature matters more for dependencies like
`aho-corasick` and `memchr`, where `std` is required to enable certain classes
of SIMD optimizations.) Enabling `std` automatically enables `alloc`.
* **alloc** - Enables use of the `alloc` library. This is required for most
APIs in this crate. The main exception is deserializing and searching with
fully compiled DFAs.
* **logging** - Adds a dependency on the `log` crate and makes this crate emit
log messages of varying degrees of utility. The log messages are especially
useful in trying to understand what the meta regex engine is doing.
### Performance features
**Note**:
To get the performance benefits offered by SIMD, `std` must be enabled.
None of the `perf-*` features will enable `std` implicitly.
* **perf** - Enables all of the below features.
* **perf-inline** - When enabled, `inline(always)` is used in (many) strategic
locations to help performance at the expense of longer compile times and
increased binary size.
* **perf-literal** - Enables all literal related optimizations.
* **perf-literal-substring** - Enables all single substring literal
optimizations. This includes adding a dependency on the `memchr` crate.
* **perf-literal-multisubstring** - Enables all multiple substring literal
optimizations. This includes adding a dependency on the `aho-corasick`
crate.
### Unicode features
* **unicode** -
Enables all Unicode features. This feature is enabled by default, and will
always cover all Unicode features, even if more are added in the future.
* **unicode-age** -
Provide the data for the
[Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age).
This makes it possible to use classes like `\p{Age:6.0}` to refer to all
codepoints first introduced in Unicode 6.0.
* **unicode-bool** -
Provide the data for numerous Unicode boolean properties. The full list
is not included here, but contains properties like `Alphabetic`, `Emoji`,
`Lowercase`, `Math`, `Uppercase` and `White_Space`.
* **unicode-case** -
Provide the data for case insensitive matching using
[Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
* **unicode-gencat** -
Provide the data for
[Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
This includes, but is not limited to, `Decimal_Number`, `Letter`,
`Math_Symbol`, `Number` and `Punctuation`.
* **unicode-perl** -
Provide the data for supporting the Unicode-aware Perl character classes,
corresponding to `\w`, `\s` and `\d`. This is also necessary for using
Unicode-aware word boundary assertions. Note that if this feature is
disabled, the `\s` and `\d` character classes are still available if the
`unicode-bool` and `unicode-gencat` features are enabled, respectively.
* **unicode-script** -
Provide the data for
[Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/).
This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`,
`Latin` and `Thai`.
* **unicode-segment** -
Provide the data necessary to provide the properties used to implement the
[Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/).
This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and
`\p{sb=ATerm}`.
* **unicode-word-boundary** -
Enables support for Unicode word boundaries, i.e., `\b`, in regexes. When
this and `unicode-perl` are enabled, then data tables from `regex-syntax` are
used to implement Unicode word boundaries. However, if `regex-syntax` isn't
enabled as a dependency then one can still enable this feature. It will
cause `regex-automata` to bundle its own data table that would otherwise be
redundant with `regex-syntax`'s table.
### Regex engine features
* **syntax** - Enables a dependency on `regex-syntax`. This makes APIs
for building regex engines from pattern strings available. Without the
`regex-syntax` dependency, the only way to build a regex engine is generally
to deserialize a previously built DFA or to hand assemble an NFA using its
[builder API](nfa::thompson::Builder). Once you have an NFA, you can build any
of the regex engines in this crate. The `syntax` feature also enables `alloc`.
* **meta** - Enables the meta regex engine. This also enables the `syntax` and
`nfa-pikevm` features, as both are the minimal requirements needed. The meta
regex engine benefits from enabling any of the other regex engines and will
use them automatically when appropriate.
* **nfa** - Enables all NFA related features below.
* **nfa-thompson** - Enables the Thompson NFA APIs. This enables `alloc`.
* **nfa-pikevm** - Enables the PikeVM regex engine. This enables
`nfa-thompson`.
* **nfa-backtrack** - Enables the bounded backtracker regex engine. This
enables `nfa-thompson`.
* **dfa** - Enables all DFA related features below.
* **dfa-build** - Enables APIs for determinizing DFAs from NFAs. This
enables `nfa-thompson` and `dfa-search`.
* **dfa-search** - Enables APIs for searching with DFAs.
* **dfa-onepass** - Enables the one-pass DFA API. This enables
`nfa-thompson`.
* **hybrid** - Enables the hybrid NFA/DFA or "lazy DFA" regex engine. This
enables `alloc` and `nfa-thompson`.
*/
// We are no_std.
#![no_std]
// All APIs need docs!
#![deny(missing_docs)]
// Some intra-doc links are broken when certain features are disabled, so we
// only bleat about it when most (all?) features are enabled. But when we do,
// we block the build. Links need to work.
#![cfg_attr(
all(
feature = "std",
feature = "nfa",
feature = "dfa",
feature = "hybrid"
),
deny(rustdoc::broken_intra_doc_links)
)]
// Broken rustdoc links are very easy to come by when you start disabling
// features. Namely, features tend to change imports, and imports change what's
// available to link to.
//
// Basically, we just don't support rustdoc for anything other than the maximal
// feature configuration. Other configurations will work, they just won't be
// perfect.
//
// So here, we specifically allow them so we don't even get warned about them.
#![cfg_attr(
not(all(
feature = "std",
feature = "nfa",
feature = "dfa",
feature = "hybrid"
)),
allow(rustdoc::broken_intra_doc_links)
)]
// Kinda similar, but eliminating all of the dead code and unused import
// warnings for every feature combo is a fool's errand. Instead, we just
// suppress those, but still let them through in a common configuration when we
// build most of everything.
//
// This does actually suggest that when features are disabled, we are actually
// compiling more code than we need to be. And this is perhaps not so great
// because disabling features is usually done in order to reduce compile times
// by reducing the amount of code one compiles... However, usually, most of the
// time this dead code is a relatively small amount from the 'util' module.
// But... I confess... There isn't a ton of visibility on this.
//
// I'm happy to try to address this in a different way, but "let's annotate
// every function in 'util' with some non-local combination of features" just
// cannot be the way forward.
#![cfg_attr(
not(all(
feature = "std",
feature = "nfa",
feature = "dfa",
feature = "hybrid",
feature = "perf-literal-substring",
feature = "perf-literal-multisubstring",
)),
allow(dead_code, unused_imports, unused_variables)
)]
// We generally want all types to impl Debug.
#![warn(missing_debug_implementations)]
// No clue why this thing is still unstable because it's pretty amazing. This
// adds Cargo feature annotations to items in the rustdoc output. Which is
// sadly hugely beneficial for this crate due to the number of features.
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
// I have literally never tested this crate on 16-bit, so it is quite
// suspicious to advertise support for it. But... the regex crate, at time
// of writing, at least claims to support it by not doing any conditional
// compilation based on the target pointer width. So I guess I remain
// consistent with that here.
//
// If you are here because you're on a 16-bit system and you were somehow using
// the regex crate previously, please file an issue. Please be prepared to
// provide some kind of reproduction or carve out some path to getting 16-bit
// working in CI. (Via qemu?)
#[cfg(not(any(
target_pointer_width = "16",
target_pointer_width = "32",
target_pointer_width = "64"
)))]
compile_error!("not supported on non-{16,32,64}, please file an issue");
#[cfg(any(test, feature = "std"))]
extern crate std;
#[cfg(feature = "alloc")]
extern crate alloc;
#[cfg(doctest)]
doc_comment::doctest!("../README.md");
#[doc(inline)]
pub use crate::util::primitives::PatternID;
pub use crate::util::search::*;
#[macro_use]
mod macros;
#[cfg(any(feature = "dfa-search", feature = "dfa-onepass"))]
pub mod dfa;
#[cfg(feature = "hybrid")]
pub mod hybrid;
#[cfg(feature = "meta")]
pub mod meta;
#[cfg(feature = "nfa-thompson")]
pub mod nfa;
pub mod util;

20
vendor/regex-automata/src/macros.rs vendored Normal file

@@ -0,0 +1,20 @@
// Some feature combinations result in some of these macros never being used.
// Which is fine. Just squash the warnings.
#![allow(unused_macros)]
macro_rules! log {
($($tt:tt)*) => {
#[cfg(feature = "logging")]
{
$($tt)*
}
}
}
macro_rules! debug {
($($tt:tt)*) => { log!(log::debug!($($tt)*)) }
}
macro_rules! trace {
($($tt:tt)*) => { log!(log::trace!($($tt)*)) }
}

241
vendor/regex-automata/src/meta/error.rs vendored Normal file

@@ -0,0 +1,241 @@
use regex_syntax::{ast, hir};
use crate::{nfa, util::search::MatchError, PatternID};
/// An error that occurs when construction of a `Regex` fails.
///
/// A build error is generally a result of one of two possible failure
/// modes. First is a parse or syntax error in the concrete syntax of a
/// pattern. Second is that the construction of the underlying regex matcher
/// fails, usually because it gets too big with respect to limits like
/// [`Config::nfa_size_limit`](crate::meta::Config::nfa_size_limit).
///
/// This error provides very little introspection capabilities. You can:
///
/// * Ask for the [`PatternID`] of the pattern that caused an error, if one
/// is available. This is available for things like syntax errors, but not for
/// cases where build limits are exceeded.
/// * Ask for the underlying syntax error, but only if the error is a syntax
/// error.
/// * Ask for a human readable message corresponding to the underlying error.
/// * The `BuildError::source` method (from the `std::error::Error`
/// trait implementation) may be used to query for an underlying error if one
/// exists. There are no API guarantees about which error is returned.
///
/// When the `std` feature is enabled, this implements `std::error::Error`.
#[derive(Clone, Debug)]
pub struct BuildError {
kind: BuildErrorKind,
}
#[derive(Clone, Debug)]
enum BuildErrorKind {
Syntax { pid: PatternID, err: regex_syntax::Error },
NFA(nfa::thompson::BuildError),
}
impl BuildError {
/// If it is known which pattern ID caused this build error to occur, then
/// this method returns it.
///
/// Some errors are not associated with a particular pattern. However, any
/// errors that occur as part of parsing a pattern are guaranteed to be
/// associated with a pattern ID.
///
/// # Example
///
/// ```
/// use regex_automata::{meta::Regex, PatternID};
///
/// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err();
/// assert_eq!(Some(PatternID::must(2)), err.pattern());
/// ```
pub fn pattern(&self) -> Option<PatternID> {
match self.kind {
BuildErrorKind::Syntax { pid, .. } => Some(pid),
_ => None,
}
}
/// If this error occurred because the regex exceeded the configured size
/// limit before being built, then this returns the configured size limit.
///
/// The limit returned is what was configured, and corresponds to the
/// maximum amount of heap usage in bytes.
pub fn size_limit(&self) -> Option<usize> {
match self.kind {
BuildErrorKind::NFA(ref err) => err.size_limit(),
_ => None,
}
}
/// If this error corresponds to a syntax error, then a reference to it is
/// returned by this method.
pub fn syntax_error(&self) -> Option<&regex_syntax::Error> {
match self.kind {
BuildErrorKind::Syntax { ref err, .. } => Some(err),
_ => None,
}
}
pub(crate) fn ast(pid: PatternID, err: ast::Error) -> BuildError {
let err = regex_syntax::Error::from(err);
BuildError { kind: BuildErrorKind::Syntax { pid, err } }
}
pub(crate) fn hir(pid: PatternID, err: hir::Error) -> BuildError {
let err = regex_syntax::Error::from(err);
BuildError { kind: BuildErrorKind::Syntax { pid, err } }
}
pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError {
BuildError { kind: BuildErrorKind::NFA(err) }
}
}
#[cfg(feature = "std")]
impl std::error::Error for BuildError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self.kind {
BuildErrorKind::Syntax { ref err, .. } => Some(err),
BuildErrorKind::NFA(ref err) => Some(err),
}
}
}
impl core::fmt::Display for BuildError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self.kind {
BuildErrorKind::Syntax { pid, .. } => {
write!(f, "error parsing pattern {}", pid.as_usize())
}
BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
}
}
}
/// An error that occurs when a search should be retried.
///
/// This retry error distinguishes between two different failure modes.
///
/// The first is one where potential quadratic behavior has been detected.
/// In this case, whatever optimization that led to this behavior should be
/// stopped, and the next best strategy should be used.
///
/// The second indicates that the underlying regex engine has failed for some
/// reason. This usually occurs because either a lazy DFA's cache has become
/// ineffective or because a non-ASCII byte has been seen *and* a Unicode word
/// boundary was used in one of the patterns. In this failure case, a different
/// regex engine that won't fail in these ways (PikeVM, backtracker or the
/// one-pass DFA) should be used.
///
/// This is an internal error only and should never bleed into the public
/// API.
#[derive(Debug)]
pub(crate) enum RetryError {
Quadratic(RetryQuadraticError),
Fail(RetryFailError),
}
#[cfg(feature = "std")]
impl std::error::Error for RetryError {}
impl core::fmt::Display for RetryError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match *self {
RetryError::Quadratic(ref err) => err.fmt(f),
RetryError::Fail(ref err) => err.fmt(f),
}
}
}
impl From<MatchError> for RetryError {
fn from(merr: MatchError) -> RetryError {
RetryError::Fail(RetryFailError::from(merr))
}
}
/// An error that occurs when potential quadratic behavior has been detected
/// when applying either the "reverse suffix" or "reverse inner" optimizations.
///
/// When this error occurs, callers should abandon the "reverse" optimization
/// and use a normal forward search.
#[derive(Debug)]
pub(crate) struct RetryQuadraticError(());
impl RetryQuadraticError {
pub(crate) fn new() -> RetryQuadraticError {
RetryQuadraticError(())
}
}
#[cfg(feature = "std")]
impl std::error::Error for RetryQuadraticError {}
impl core::fmt::Display for RetryQuadraticError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(f, "regex engine gave up to avoid quadratic behavior")
}
}
impl From<RetryQuadraticError> for RetryError {
fn from(err: RetryQuadraticError) -> RetryError {
RetryError::Quadratic(err)
}
}
/// An error that occurs when a regex engine "gives up" for some reason before
/// finishing a search. Usually this occurs because of heuristic Unicode word
/// boundary support or because of ineffective cache usage in the lazy DFA.
///
/// When this error occurs, callers should retry the regex search with a
/// different regex engine.
///
/// Note that this has convenient `From` impls that will automatically
/// convert a `MatchError` into this error. This works because the meta
/// regex engine internals guarantee that errors like `HaystackTooLong` and
/// `UnsupportedAnchored` will never occur. The only errors left are `Quit` and
/// `GaveUp`, which both correspond to this "failure" error.
#[derive(Debug)]
pub(crate) struct RetryFailError {
offset: usize,
}
impl RetryFailError {
pub(crate) fn from_offset(offset: usize) -> RetryFailError {
RetryFailError { offset }
}
}
#[cfg(feature = "std")]
impl std::error::Error for RetryFailError {}
impl core::fmt::Display for RetryFailError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(f, "regex engine failed at offset {:?}", self.offset)
}
}
impl From<RetryFailError> for RetryError {
fn from(err: RetryFailError) -> RetryError {
RetryError::Fail(err)
}
}
impl From<MatchError> for RetryFailError {
fn from(merr: MatchError) -> RetryFailError {
use crate::util::search::MatchErrorKind::*;
match *merr.kind() {
Quit { offset, .. } => RetryFailError::from_offset(offset),
GaveUp { offset } => RetryFailError::from_offset(offset),
// These can never occur because we avoid them by construction
// or with higher level control flow logic. For example, the
// backtracker's wrapper will never hand out a backtracker engine
// when the haystack would be too long.
HaystackTooLong { .. } | UnsupportedAnchored { .. } => {
unreachable!("found impossible error in meta engine: {merr}")
}
}
}
}


@@ -0,0 +1,251 @@
/*!
This module defines two bespoke reverse DFA searching routines. (One for the
lazy DFA and one for the fully compiled DFA.) These routines differ from the
usual ones by permitting the caller to specify a minimum starting position.
That is, the search will begin at `input.end()` and will usually stop at
`input.start()`, unless `min_start > input.start()`, in which case, the search
will stop at `min_start`.
In other words, this lets you say, "no, the search must not extend past this
point, even if it's within the bounds of the given `Input`." And if the search
*does* want to go past that point, it stops and returns a "may be quadratic"
error, which indicates that the caller should retry using some other technique.
These routines specifically exist to protect against quadratic behavior when
employing the "reverse suffix" and "reverse inner" optimizations. Without the
backstop these routines provide, it is possible for parts of the haystack to
get re-scanned over and over again. The backstop not only prevents this, but
*tells you when it is happening* so that you can change the strategy.
Why can't we just use the normal search routines? We could use the normal
search routines and just set the start bound on the provided `Input` to our
`min_start` position. The problem here is that it's impossible to distinguish
between "no match because we reached the end of input" and "determined there
was no match well before the end of input." The former case is what we care
about with respect to quadratic behavior. The latter case is totally fine.
Why don't we modify the normal search routines to report the position at which
the search stops? I considered this, and I still wonder if it is indeed the
right thing to do. However, I think the straight-forward thing to do there
would be to complicate the return type signature of almost every search routine
in this crate, which I really do not want to do. It therefore might make more
sense to provide a richer way for search routines to report meta data, but that
was beyond my bandwidth to work on at the time of writing.
See the 'opt/reverse-inner' and 'opt/reverse-suffix' benchmarks in rebar for a
real demonstration of how quadratic behavior is mitigated.
*/
use crate::{
meta::error::{RetryError, RetryQuadraticError},
HalfMatch, Input, MatchError,
};
#[cfg(feature = "dfa-build")]
pub(crate) fn dfa_try_search_half_rev(
dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
input: &Input<'_>,
min_start: usize,
) -> Result<Option<HalfMatch>, RetryError> {
use crate::dfa::Automaton;
let mut mat = None;
let mut sid = dfa.start_state_reverse(input)?;
if input.start() == input.end() {
dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?;
return Ok(mat);
}
let mut at = input.end() - 1;
loop {
sid = dfa.next_state(sid, input.haystack()[at]);
if dfa.is_special_state(sid) {
if dfa.is_match_state(sid) {
let pattern = dfa.match_pattern(sid, 0);
// Since reverse searches report the beginning of a
// match and the beginning is inclusive (not exclusive
// like the end of a match), we add 1 to make it
// inclusive.
mat = Some(HalfMatch::new(pattern, at + 1));
} else if dfa.is_dead_state(sid) {
return Ok(mat);
} else if dfa.is_quit_state(sid) {
return Err(MatchError::quit(input.haystack()[at], at).into());
}
}
if at == input.start() {
break;
}
at -= 1;
if at < min_start {
trace!(
"reached position {at} which is before the previous literal \
match, quitting to avoid quadratic behavior",
);
return Err(RetryError::Quadratic(RetryQuadraticError::new()));
}
}
let was_dead = dfa.is_dead_state(sid);
dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?;
// If we reach the beginning of the search and we could otherwise still
// potentially keep matching if there was more to match, then we actually
// return an error to indicate giving up on this optimization. Why? Because
// we can't prove that the real match begins at where we would report it.
//
// This only happens when all of the following are true:
//
// 1) We reach the starting point of our search span.
// 2) The match we found is before the starting point.
// 3) The FSM reports we could possibly find a longer match.
//
// We need (1) because otherwise the search stopped before the starting
// point and there is no possible way to find a more leftmost position.
//
// We need (2) because if the match found has an offset equal to the minimum
// possible offset, then there is no possible more leftmost match.
//
// We need (3) because if the FSM couldn't continue anyway (i.e., it's in
// a dead state), then we know we couldn't find anything more leftmost
// than what we have. (We have to check the state we were in prior to the
// EOI transition since the EOI transition will usually bring us to a dead
// state by virtue of it representing the end-of-input.)
if at == input.start()
&& mat.map_or(false, |m| m.offset() > input.start())
&& !was_dead
{
trace!(
"reached beginning of search at offset {at} without hitting \
a dead state, quitting to avoid potential false positive match",
);
return Err(RetryError::Quadratic(RetryQuadraticError::new()));
}
Ok(mat)
}
#[cfg(feature = "hybrid")]
pub(crate) fn hybrid_try_search_half_rev(
dfa: &crate::hybrid::dfa::DFA,
cache: &mut crate::hybrid::dfa::Cache,
input: &Input<'_>,
min_start: usize,
) -> Result<Option<HalfMatch>, RetryError> {
let mut mat = None;
let mut sid = dfa.start_state_reverse(cache, input)?;
if input.start() == input.end() {
hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
return Ok(mat);
}
let mut at = input.end() - 1;
loop {
sid = dfa
.next_state(cache, sid, input.haystack()[at])
.map_err(|_| MatchError::gave_up(at))?;
if sid.is_tagged() {
if sid.is_match() {
let pattern = dfa.match_pattern(cache, sid, 0);
// Since reverse searches report the beginning of a
// match and the beginning is inclusive (not exclusive
// like the end of a match), we add 1 to make it
// inclusive.
mat = Some(HalfMatch::new(pattern, at + 1));
} else if sid.is_dead() {
return Ok(mat);
} else if sid.is_quit() {
return Err(MatchError::quit(input.haystack()[at], at).into());
}
}
if at == input.start() {
break;
}
at -= 1;
if at < min_start {
trace!(
"reached position {at} which is before the previous literal \
match, quitting to avoid quadratic behavior",
);
return Err(RetryError::Quadratic(RetryQuadraticError::new()));
}
}
let was_dead = sid.is_dead();
hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
// See the comments in the full DFA routine above for why we need this.
if at == input.start()
&& mat.map_or(false, |m| m.offset() > input.start())
&& !was_dead
{
trace!(
"reached beginning of search at offset {at} without hitting \
a dead state, quitting to avoid potential false positive match",
);
return Err(RetryError::Quadratic(RetryQuadraticError::new()));
}
Ok(mat)
}
#[cfg(feature = "dfa-build")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn dfa_eoi_rev(
dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
input: &Input<'_>,
sid: &mut crate::util::primitives::StateID,
mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
use crate::dfa::Automaton;
let sp = input.get_span();
if sp.start > 0 {
let byte = input.haystack()[sp.start - 1];
*sid = dfa.next_state(*sid, byte);
if dfa.is_match_state(*sid) {
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.start));
} else if dfa.is_quit_state(*sid) {
return Err(MatchError::quit(byte, sp.start - 1));
}
} else {
*sid = dfa.next_eoi_state(*sid);
if dfa.is_match_state(*sid) {
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, 0));
}
// N.B. We don't have to check 'is_quit' here because the EOI
// transition can never lead to a quit state.
debug_assert!(!dfa.is_quit_state(*sid));
}
Ok(())
}
#[cfg(feature = "hybrid")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn hybrid_eoi_rev(
dfa: &crate::hybrid::dfa::DFA,
cache: &mut crate::hybrid::dfa::Cache,
input: &Input<'_>,
sid: &mut crate::hybrid::LazyStateID,
mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
let sp = input.get_span();
if sp.start > 0 {
let byte = input.haystack()[sp.start - 1];
*sid = dfa
.next_state(cache, *sid, byte)
.map_err(|_| MatchError::gave_up(sp.start))?;
if sid.is_match() {
let pattern = dfa.match_pattern(cache, *sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.start));
} else if sid.is_quit() {
return Err(MatchError::quit(byte, sp.start - 1));
}
} else {
*sid = dfa
.next_eoi_state(cache, *sid)
.map_err(|_| MatchError::gave_up(sp.start))?;
if sid.is_match() {
let pattern = dfa.match_pattern(cache, *sid, 0);
*mat = Some(HalfMatch::new(pattern, 0));
}
// N.B. We don't have to check 'is_quit' here because the EOI
// transition can never lead to a quit state.
debug_assert!(!sid.is_quit());
}
Ok(())
}


@@ -0,0 +1,81 @@
use alloc::{vec, vec::Vec};
use regex_syntax::hir::Hir;
use crate::{meta::regex::RegexInfo, util::search::MatchKind};
/// Pull out an alternation of literals from the given sequence of HIR
/// expressions.
///
/// There are numerous ways for this to fail. Generally, this only applies
/// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there
/// are "too few" alternates, in which case, the regex engine is likely faster.
///
/// And currently, this only returns something when 'hirs.len() == 1'.
pub(crate) fn alternation_literals(
info: &RegexInfo,
hirs: &[&Hir],
) -> Option<Vec<Vec<u8>>> {
use regex_syntax::hir::{HirKind, Literal};
// Might as well skip the work below if we know we can't build an
// Aho-Corasick searcher.
if !cfg!(feature = "perf-literal-multisubstring") {
return None;
}
// This is pretty hacky, but basically, if `is_alternation_literal` is
// true, then we can make several assumptions about the structure of our
// HIR. This is what justifies the `unreachable!` statements below.
if hirs.len() != 1
|| !info.props()[0].look_set().is_empty()
|| info.props()[0].explicit_captures_len() > 0
|| !info.props()[0].is_alternation_literal()
|| info.config().get_match_kind() != MatchKind::LeftmostFirst
{
return None;
}
let hir = &hirs[0];
let alts = match *hir.kind() {
HirKind::Alternation(ref alts) => alts,
_ => return None, // one literal isn't worth it
};
let mut lits = vec![];
for alt in alts {
let mut lit = vec![];
match *alt.kind() {
HirKind::Literal(Literal(ref bytes)) => {
lit.extend_from_slice(bytes)
}
HirKind::Concat(ref exprs) => {
for e in exprs {
match *e.kind() {
HirKind::Literal(Literal(ref bytes)) => {
lit.extend_from_slice(bytes);
}
_ => unreachable!("expected literal, got {e:?}"),
}
}
}
_ => unreachable!("expected literal or concat, got {alt:?}"),
}
lits.push(lit);
}
// Why do this? Well, when the number of literals is small, it's likely
// that we'll use the lazy DFA which is in turn likely to be faster than
// Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have
// a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use
// the latter because it is so hungry (in time and space), and the former
// is decently fast, but not as fast as a well oiled lazy DFA.
//
// However, once the number starts getting large, the lazy DFA is likely
// to start thrashing because of the modest default cache size. When
// exactly does this happen? Dunno. But at whatever point that is (we make
// a guess below based on ad hoc benchmarking), we'll want to cut over to
// Aho-Corasick, where even the contiguous NFA is likely to do much better.
if lits.len() < 3000 {
debug!("skipping Aho-Corasick because there are too few literals");
return None;
}
Some(lits)
}

62
vendor/regex-automata/src/meta/mod.rs vendored Normal file

@@ -0,0 +1,62 @@
/*!
Provides a regex matcher that composes several other regex matchers
automatically.
This module is home to a meta [`Regex`], which provides a convenient high
level API for executing regular expressions in linear time.
# Comparison with the `regex` crate
A meta `Regex` is the implementation used directly by the `regex` crate.
Indeed, the `regex` crate API is essentially just a light wrapper over a meta
`Regex`. This means that if you need the full flexibility offered by this
API, then you should be able to switch to using this API directly without
any changes in match semantics or syntax. However, there are some API level
differences:
* The `regex` crate API returns match objects that include references to the
haystack itself, which in turn makes it easy to access the matching strings
without having to slice the haystack yourself. In contrast, a meta `Regex`
returns match objects that only have offsets in them.
* At time of writing, a meta `Regex` doesn't have some of the convenience
routines that the `regex` crate has, such as replacements. Note though that
[`Captures::interpolate_string`](crate::util::captures::Captures::interpolate_string)
will handle the replacement string interpolation for you.
* A meta `Regex` supports the [`Input`](crate::Input) abstraction, which
provides a way to configure a search in more ways than is supported by the
`regex` crate. For example, [`Input::anchored`](crate::Input::anchored) can
be used to run an anchored search, regardless of whether the pattern is itself
anchored with a `^`.
* A meta `Regex` supports multi-pattern searching everywhere.
Indeed, every [`Match`](crate::Match) returned by the search APIs
include a [`PatternID`](crate::PatternID) indicating which pattern
matched. In the single pattern case, all matches correspond to
[`PatternID::ZERO`](crate::PatternID::ZERO). In contrast, the `regex` crate
has distinct `Regex` and a `RegexSet` APIs. The former only supports a single
pattern, while the latter supports multiple patterns but cannot report the
offsets of a match.
* A meta `Regex` provides the explicit capability of bypassing its internal
memory pool for automatically acquiring mutable scratch space required by its
internal regex engines. Namely, a [`Cache`] can be explicitly provided to lower
level routines such as [`Regex::search_with`].
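As a sketch of that last point (the pattern and haystack are invented for
illustration):

```
use regex_automata::{meta::Regex, Input, Match};

let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
// Create the scratch space explicitly instead of letting the regex pull
// it from its internal pool on each search.
let mut cache = re.create_cache();
let input = Input::new("published on 2023-04-20");
assert_eq!(Some(Match::must(0, 13..23)), re.search_with(&mut cache, &input));
# Ok::<(), Box<dyn std::error::Error>>(())
```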
*/
pub use self::{
error::BuildError,
regex::{
Builder, Cache, CapturesMatches, Config, FindMatches, Regex, Split,
SplitN,
},
};
mod error;
#[cfg(any(feature = "dfa-build", feature = "hybrid"))]
mod limited;
mod literal;
mod regex;
mod reverse_inner;
#[cfg(any(feature = "dfa-build", feature = "hybrid"))]
mod stopat;
mod strategy;
mod wrappers;

3649
vendor/regex-automata/src/meta/regex.rs vendored Normal file

File diff suppressed because it is too large

@@ -0,0 +1,220 @@
/*!
A module dedicated to plucking inner literals out of a regex pattern, and
then constructing a prefilter for them. We also include a regex pattern
"prefix" that corresponds to the bits of the regex that need to match before
the literals do. The reverse inner optimization then proceeds by looking for
matches of the inner literal(s), and then doing a reverse search of the prefix
from the start of the literal match to find the overall start position of the
match.
The essential invariant we want to uphold here is that the literals we return
reflect a set where *at least* one of them must match in order for the overall
regex to match. We also need to maintain the invariant that the regex prefix
returned corresponds to the entirety of the regex up until the literals we
return.
This somewhat limits what we can do. That is, if we have a regex like
`\w+(@!|%%)\w+`, then we can pluck the `{@!, %%}` out and build a prefilter
from it. Then we just need to compile `\w+` in reverse. No fuss no muss. But if
we have a regex like `\d+@!|\w+%%`, then we get kind of stymied. Technically,
we could still extract `{@!, %%}`, and it is true that at least one of them must
match. But then, what is our regex prefix? Again, in theory, that could be
`\d+|\w+`, but that's not quite right, because the `\d+` only matches when `@!`
matches, and `\w+` only matches when `%%` matches.
All of that is technically possible to do, but it seemingly requires a lot of
sophistication and machinery. Probably the way to tackle that is with some kind
of formalism and approach this problem more generally.
For now, the code below basically just looks for a top-level concatenation.
And if it can find one, it looks for literals in each of the direct child
sub-expressions of that concatenation. If some good ones are found, we return
those and a concatenation of the Hir expressions seen up to that point.
*/
use alloc::vec::Vec;
use regex_syntax::hir::{self, literal, Hir, HirKind};
use crate::{util::prefilter::Prefilter, MatchKind};
/// Attempts to extract an "inner" prefilter from the given HIR expressions. If
/// one was found, then a concatenation of the HIR expressions that precede it
/// is returned.
///
/// The idea here is that the prefilter returned can be used to find candidate
/// matches. And then the HIR returned can be used to build a reverse regex
/// matcher, which will find the start of the candidate match. Finally, the
/// match still has to be confirmed with a normal anchored forward scan to find
/// the end position of the match.
///
/// Note that this assumes leftmost-first match semantics, so callers must
/// not call this otherwise.
pub(crate) fn extract(hirs: &[&Hir]) -> Option<(Hir, Prefilter)> {
if hirs.len() != 1 {
debug!(
"skipping reverse inner optimization since it only \
supports 1 pattern, {} were given",
hirs.len(),
);
return None;
}
let mut concat = match top_concat(hirs[0]) {
Some(concat) => concat,
None => {
debug!(
"skipping reverse inner optimization because a top-level \
                concatenation could not be found",
);
return None;
}
};
// We skip the first HIR because if it did have a prefix prefilter in it,
// we probably wouldn't be here looking for an inner prefilter.
for i in 1..concat.len() {
let hir = &concat[i];
let pre = match prefilter(hir) {
None => continue,
Some(pre) => pre,
};
        // Even if we got a prefilter, if it isn't considered "fast," then we
// probably don't want to bother with it. Namely, since the reverse
// inner optimization requires some overhead, it likely only makes
// sense if the prefilter scan itself is (believed) to be much faster
// than the regex engine.
if !pre.is_fast() {
debug!(
"skipping extracted inner prefilter because \
it probably isn't fast"
);
continue;
}
let concat_suffix = Hir::concat(concat.split_off(i));
let concat_prefix = Hir::concat(concat);
// Look for a prefilter again. Why? Because above we only looked for
// a prefilter on the individual 'hir', but we might be able to find
// something better and more discriminatory by looking at the entire
// suffix. We don't do this above to avoid making this loop worst case
// quadratic in the length of 'concat'.
let pre2 = match prefilter(&concat_suffix) {
None => pre,
Some(pre2) => {
if pre2.is_fast() {
pre2
} else {
pre
}
}
};
return Some((concat_prefix, pre2));
}
debug!(
"skipping reverse inner optimization because a top-level \
sub-expression with a fast prefilter could not be found"
);
None
}
/// Attempt to extract a prefilter from an HIR expression.
///
/// We do a little massaging here to do our best to ensure that the prefilter
/// we get out of this is *probably* fast. Basically, the false positive rate
/// has a much higher impact for things like the reverse inner optimization
/// because more work needs to potentially be done for each candidate match.
///
/// Note that this assumes leftmost-first match semantics, so callers must
/// not call this otherwise.
fn prefilter(hir: &Hir) -> Option<Prefilter> {
let mut extractor = literal::Extractor::new();
extractor.kind(literal::ExtractKind::Prefix);
let mut prefixes = extractor.extract(hir);
debug!(
"inner prefixes (len={:?}) extracted before optimization: {:?}",
prefixes.len(),
prefixes
);
// Since these are inner literals, we know they cannot be exact. But the
// extractor doesn't know this. We mark them as inexact because this might
// impact literal optimization. Namely, optimization weights "all literals
// are exact" as very high, because it presumes that any match results in
// an overall match. But of course, that is not the case here.
//
    // In practice, this avoids plucking out an ASCII-only \s as an alternation
// of single-byte whitespace characters.
prefixes.make_inexact();
prefixes.optimize_for_prefix_by_preference();
debug!(
"inner prefixes (len={:?}) extracted after optimization: {:?}",
prefixes.len(),
prefixes
);
prefixes
.literals()
.and_then(|lits| Prefilter::new(MatchKind::LeftmostFirst, lits))
}
/// Looks for a "top level" HirKind::Concat item in the given HIR. This will
/// try to return one even if it's embedded in a capturing group, but is
/// otherwise pretty conservative in what is returned.
///
/// The HIR returned is a complete copy of the concat with all capturing
/// groups removed. In effect, the concat returned is "flattened" with respect
/// to capturing groups. This makes the detection logic above for prefixes
/// a bit simpler, and it works because 1) capturing groups never influence
/// whether a match occurs or not and 2) capturing groups are not used when
/// doing the reverse inner search to find the start of the match.
fn top_concat(mut hir: &Hir) -> Option<Vec<Hir>> {
loop {
hir = match hir.kind() {
HirKind::Empty
| HirKind::Literal(_)
| HirKind::Class(_)
| HirKind::Look(_)
| HirKind::Repetition(_)
| HirKind::Alternation(_) => return None,
HirKind::Capture(hir::Capture { ref sub, .. }) => sub,
HirKind::Concat(ref subs) => {
// We are careful to only do the flattening/copy when we know
// we have a "top level" concat we can inspect. This avoids
// doing extra work in cases where we definitely won't use it.
// (This might still be wasted work if we can't go on to find
// some literals to extract.)
let concat =
Hir::concat(subs.iter().map(|h| flatten(h)).collect());
return match concat.into_kind() {
HirKind::Concat(xs) => Some(xs),
// It is actually possible for this case to occur, because
// 'Hir::concat' might simplify the expression to the point
// that concatenations are actually removed. One wonders
// whether this leads to other cases where we should be
// extracting literals, but in theory, I believe if we do
// get here, then it means that a "real" prefilter failed
// to be extracted and we should probably leave well enough
// alone. (A "real" prefilter is unbothered by "top-level
// concats" and "capturing groups.")
_ => return None,
};
}
};
}
}
/// Returns a copy of the given HIR but with all capturing groups removed.
fn flatten(hir: &Hir) -> Hir {
match hir.kind() {
HirKind::Empty => Hir::empty(),
HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()),
HirKind::Class(ref x) => Hir::class(x.clone()),
HirKind::Look(ref x) => Hir::look(x.clone()),
HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))),
// This is the interesting case. We just drop the group information
// entirely and use the child HIR itself.
HirKind::Capture(hir::Capture { ref sub, .. }) => flatten(sub),
HirKind::Alternation(ref xs) => {
Hir::alternation(xs.iter().map(|x| flatten(x)).collect())
}
HirKind::Concat(ref xs) => {
Hir::concat(xs.iter().map(|x| flatten(x)).collect())
}
}
}
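
#[cfg(test)]
mod tests {
    use super::*;

    // A small sketch added for illustration (not part of the upstream test
    // suite): it checks that 'flatten' drops capture groups while leaving the
    // underlying expression intact. The pattern used is arbitrary.
    #[test]
    fn flatten_drops_capture_groups() {
        let with_group = regex_syntax::parse(r"a(b)c").unwrap();
        let without_group = regex_syntax::parse(r"abc").unwrap();
        // Flatten both sides so that the comparison does not depend on how
        // the parser happens to group adjacent literals.
        assert_eq!(flatten(&without_group), flatten(&with_group));
    }
}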

212
vendor/regex-automata/src/meta/stopat.rs vendored Normal file
View File

@@ -0,0 +1,212 @@
/*!
This module defines two bespoke forward DFA search routines. One for the lazy
DFA and one for the fully compiled DFA. These routines differ from the normal
ones by reporting the position at which the search terminates when a match
*isn't* found.
This position at which a search terminates is useful in contexts where the meta
regex engine runs optimizations that could go quadratic if we aren't careful.
Namely, a regex search *could* scan to the end of the haystack only to report a
non-match. If the caller doesn't know that the search scanned to the end of the
haystack, it might restart the search at the next literal candidate it finds
and repeat the process.
Providing the caller with the position at which the search stopped provides a
way for the caller to determine the point at which subsequent scans should not
pass. This is principally used in the "reverse inner" optimization, which works
like this:
1. Look for a match of an inner literal. Say, 'Z' in '\w+Z\d+'.
2. At the spot where 'Z' matches, do a reverse anchored search from there for
'\w+'.
3. If the reverse search matches, it corresponds to the start position of a
(possible) match. At this point, do a forward anchored search to find the end
position. If an end position is found, then we have a match and we know its
bounds.
If the forward anchored search in (3) searches the entire rest of the haystack
but reports a non-match, then a naive implementation of the above will continue
back at step 1 looking for more candidates. There might still be a match to be
found! It's possible. But we already scanned the whole haystack. So if we keep
repeating the process, then we might wind up taking quadratic time in the size
of the haystack, which is not great.
So if the forward anchored search in (3) reports the position at which it
stops, then we can detect whether quadratic behavior might be occurring in
steps (1) and (2). For (1), it occurs if the literal candidate found occurs
*before* the end of the previous search in (3), since that means we're now
going to look for another match in a place where the forward search has already
scanned. It is *correct* to do so, but our technique has become inefficient.
For (2), quadratic behavior occurs similarly when its reverse search extends
past the point where the previous forward search in (3) terminated. Indeed, to
implement (2), we use the sibling 'limited' module for ensuring our reverse
scan doesn't go further than we want.
See the 'opt/reverse-inner' benchmarks in rebar for a real demonstration of
how quadratic behavior is mitigated.
*/
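
// An illustrative sketch (added commentary, not from the upstream source) of
// how a caller might interpret the nested result returned by the routines in
// this module:
//
//     match dfa_try_search_half_fwd(&dfa, &input)? {
//         Ok(half) => {
//             // A match was found and 'half.offset()' is its end position.
//         }
//         Err(stopped_at) => {
//             // No match. The scan ended at 'stopped_at', so subsequent
//             // candidate scans should not revisit positions before it.
//         }
//     }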
use crate::{meta::error::RetryFailError, HalfMatch, Input, MatchError};
#[cfg(feature = "dfa-build")]
pub(crate) fn dfa_try_search_half_fwd(
dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
input: &Input<'_>,
) -> Result<Result<HalfMatch, usize>, RetryFailError> {
use crate::dfa::{accel, Automaton};
let mut mat = None;
let mut sid = dfa.start_state_forward(input)?;
let mut at = input.start();
while at < input.end() {
sid = dfa.next_state(sid, input.haystack()[at]);
if dfa.is_special_state(sid) {
if dfa.is_match_state(sid) {
let pattern = dfa.match_pattern(sid, 0);
mat = Some(HalfMatch::new(pattern, at));
if input.get_earliest() {
return Ok(mat.ok_or(at));
}
if dfa.is_accel_state(sid) {
let needs = dfa.accelerator(sid);
at = accel::find_fwd(needs, input.haystack(), at)
.unwrap_or(input.end());
continue;
}
} else if dfa.is_accel_state(sid) {
let needs = dfa.accelerator(sid);
at = accel::find_fwd(needs, input.haystack(), at)
.unwrap_or(input.end());
continue;
} else if dfa.is_dead_state(sid) {
return Ok(mat.ok_or(at));
} else if dfa.is_quit_state(sid) {
return Err(MatchError::quit(input.haystack()[at], at).into());
} else {
// Ideally we wouldn't use a DFA that specialized start states
// and thus 'is_start_state()' could never be true here, but in
// practice we reuse the DFA created for the full regex which
// will specialize start states whenever there is a prefilter.
debug_assert!(dfa.is_start_state(sid));
}
}
at += 1;
}
dfa_eoi_fwd(dfa, input, &mut sid, &mut mat)?;
Ok(mat.ok_or(at))
}
#[cfg(feature = "hybrid")]
pub(crate) fn hybrid_try_search_half_fwd(
dfa: &crate::hybrid::dfa::DFA,
cache: &mut crate::hybrid::dfa::Cache,
input: &Input<'_>,
) -> Result<Result<HalfMatch, usize>, RetryFailError> {
let mut mat = None;
let mut sid = dfa.start_state_forward(cache, input)?;
let mut at = input.start();
while at < input.end() {
sid = dfa
.next_state(cache, sid, input.haystack()[at])
.map_err(|_| MatchError::gave_up(at))?;
if sid.is_tagged() {
if sid.is_match() {
let pattern = dfa.match_pattern(cache, sid, 0);
mat = Some(HalfMatch::new(pattern, at));
if input.get_earliest() {
return Ok(mat.ok_or(at));
}
} else if sid.is_dead() {
return Ok(mat.ok_or(at));
} else if sid.is_quit() {
return Err(MatchError::quit(input.haystack()[at], at).into());
} else {
// We should NEVER get an unknown state ID back from
// dfa.next_state().
debug_assert!(!sid.is_unknown());
// Ideally we wouldn't use a lazy DFA that specialized start
// states and thus 'sid.is_start()' could never be true here,
// but in practice we reuse the lazy DFA created for the full
// regex which will specialize start states whenever there is
// a prefilter.
debug_assert!(sid.is_start());
}
}
at += 1;
}
hybrid_eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?;
Ok(mat.ok_or(at))
}
#[cfg(feature = "dfa-build")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn dfa_eoi_fwd(
dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
input: &Input<'_>,
sid: &mut crate::util::primitives::StateID,
mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
use crate::dfa::Automaton;
let sp = input.get_span();
match input.haystack().get(sp.end) {
Some(&b) => {
*sid = dfa.next_state(*sid, b);
if dfa.is_match_state(*sid) {
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.end));
} else if dfa.is_quit_state(*sid) {
return Err(MatchError::quit(b, sp.end));
}
}
None => {
*sid = dfa.next_eoi_state(*sid);
if dfa.is_match_state(*sid) {
let pattern = dfa.match_pattern(*sid, 0);
*mat = Some(HalfMatch::new(pattern, input.haystack().len()));
}
// N.B. We don't have to check 'is_quit' here because the EOI
// transition can never lead to a quit state.
debug_assert!(!dfa.is_quit_state(*sid));
}
}
Ok(())
}
#[cfg(feature = "hybrid")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn hybrid_eoi_fwd(
dfa: &crate::hybrid::dfa::DFA,
cache: &mut crate::hybrid::dfa::Cache,
input: &Input<'_>,
sid: &mut crate::hybrid::LazyStateID,
mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
let sp = input.get_span();
match input.haystack().get(sp.end) {
Some(&b) => {
*sid = dfa
.next_state(cache, *sid, b)
.map_err(|_| MatchError::gave_up(sp.end))?;
if sid.is_match() {
let pattern = dfa.match_pattern(cache, *sid, 0);
*mat = Some(HalfMatch::new(pattern, sp.end));
} else if sid.is_quit() {
return Err(MatchError::quit(b, sp.end));
}
}
None => {
*sid = dfa
.next_eoi_state(cache, *sid)
.map_err(|_| MatchError::gave_up(input.haystack().len()))?;
if sid.is_match() {
let pattern = dfa.match_pattern(cache, *sid, 0);
*mat = Some(HalfMatch::new(pattern, input.haystack().len()));
}
// N.B. We don't have to check 'is_quit' here because the EOI
// transition can never lead to a quit state.
debug_assert!(!sid.is_quit());
}
}
Ok(())
}

1905
vendor/regex-automata/src/meta/strategy.rs vendored Normal file

File diff suppressed because it is too large

1351
vendor/regex-automata/src/meta/wrappers.rs vendored Normal file

File diff suppressed because it is too large

55
vendor/regex-automata/src/nfa/mod.rs vendored Normal file
View File

@@ -0,0 +1,55 @@
/*!
Provides non-deterministic finite automata (NFA) and regex engines that use
them.
While NFAs and DFAs (deterministic finite automata) have equivalent *theoretical*
power, their usage in practice tends to result in different engineering trade
offs. While this isn't meant to be a comprehensive treatment of the topic, here
are a few key trade offs that are, at minimum, true for this crate:
* NFAs tend to be represented sparsely whereas DFAs are represented densely.
Sparse representations use less memory, but are slower to traverse. Conversely,
dense representations use more memory, but are faster to traverse. (Sometimes
these lines are blurred. For example, an `NFA` might choose to represent a
particular state in a dense fashion, and a DFA can be built using a sparse
representation via [`sparse::DFA`](crate::dfa::sparse::DFA).)
* NFAs have epsilon transitions and DFAs don't. In practice, this means that
handling a single byte in a haystack with an NFA at search time may require
visiting multiple NFA states. In a DFA, each byte only requires visiting
a single state. Stated differently, NFAs require a variable number of CPU
instructions to process one byte in a haystack whereas a DFA uses a constant
number of CPU instructions to process one byte.
* NFAs are generally easier to amend with secondary storage. For example, the
[`thompson::pikevm::PikeVM`] uses an NFA to match, but also uses additional
memory beyond the model of a finite state machine to track offsets for matching
capturing groups. Conversely, the most a DFA can do is report the offset (and
pattern ID) at which a match occurred. This is generally why we also compile
DFAs in reverse, so that we can run them after finding the end of a match to
also find the start of a match.
* NFAs take worst case linear time to build, but DFAs take worst case
exponential time to build. The [hybrid NFA/DFA](crate::hybrid) mitigates this
challenge for DFAs in many practical cases.
There are likely other differences, but the bottom line is that NFAs tend to be
more memory efficient and give easier opportunities for increasing expressive
power, whereas DFAs are faster to search with.
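
For example, here is a minimal sketch (assuming the default crate features,
which enable both the Thompson NFA compiler and the `PikeVM`) of searching
with an NFA-backed engine that can also report capturing group offsets:

```
use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};

let re = PikeVM::new(r"([a-z]+) ([0-9]+)")?;
let mut cache = re.create_cache();
let mut caps = re.create_captures();
re.captures(&mut cache, "abc 123", &mut caps);
// The overall match spans the entire haystack.
assert_eq!(Some(Match::must(0, 0..7)), caps.get_match());
# Ok::<(), Box<dyn std::error::Error>>(())
```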
# Why only a Thompson NFA?
Currently, the only kind of NFA we support in this crate is a [Thompson
NFA](https://en.wikipedia.org/wiki/Thompson%27s_construction). This refers
to a specific construction algorithm that takes the syntax of a regex
pattern and converts it to an NFA. Specifically, it makes gratuitous use of
epsilon transitions in order to keep its structure simple. In exchange, its
construction time is linear in the size of the regex. A Thompson NFA also makes
the guarantee that given any state and a character in a haystack, there is at
most one transition defined for it. (Although there may be many epsilon
transitions.)
It's possible that other types of NFAs will be added in the future, such as a
[Glushkov NFA](https://en.wikipedia.org/wiki/Glushkov%27s_construction_algorithm).
But currently, this crate only provides a Thompson NFA.
*/
#[cfg(feature = "nfa-thompson")]
pub mod thompson;

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,182 @@
use crate::util::{
captures, look,
primitives::{PatternID, StateID},
};
/// An error that can occur during the construction of a Thompson NFA.
///
/// This error does not provide many introspection capabilities. There are
/// generally only two things you can do with it:
///
/// * Obtain a human readable message via its `std::fmt::Display` impl.
/// * Access an underlying [`regex_syntax::Error`] type from its `source`
/// method via the `std::error::Error` trait. This error only occurs when using
/// convenience routines for building an NFA directly from a pattern string.
///
/// Otherwise, errors typically occur when a limit has been breached. For
/// example, if the total heap usage of the compiled NFA exceeds the limit
/// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then
/// building the NFA will fail.
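///
/// For example, here is a minimal sketch (assuming the `syntax` feature and a
/// deliberately tiny, arbitrary size limit) of provoking and inspecting such
/// an error:
///
/// ```
/// use regex_automata::nfa::thompson::{Config, NFA};
///
/// let err = NFA::compiler()
///     .configure(Config::new().nfa_size_limit(Some(500)))
///     .build(r"\w{10}")
///     .unwrap_err();
/// assert_eq!(Some(500), err.size_limit());
/// ```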
#[derive(Clone, Debug)]
pub struct BuildError {
kind: BuildErrorKind,
}
/// The kind of error that occurred during the construction of a Thompson NFA.
#[derive(Clone, Debug)]
enum BuildErrorKind {
/// An error that occurred while parsing a regular expression. Note that
/// this error may be printed over multiple lines, and is generally
/// intended to be end user readable on its own.
#[cfg(feature = "syntax")]
Syntax(regex_syntax::Error),
/// An error that occurs if the capturing groups provided to an NFA builder
/// do not satisfy the documented invariants. For example, things like
/// too many groups, missing groups, having the first (zeroth) group be
/// named or duplicate group names within the same pattern.
Captures(captures::GroupInfoError),
/// An error that occurs when an NFA contains a Unicode word boundary, but
/// where the crate was compiled without the necessary data for dealing
/// with Unicode word boundaries.
Word(look::UnicodeWordBoundaryError),
/// An error that occurs if too many patterns were given to the NFA
/// compiler.
TooManyPatterns {
/// The number of patterns given, which exceeds the limit.
given: usize,
/// The limit on the number of patterns.
limit: usize,
},
    /// An error that occurs if too many states are produced while building an
    /// NFA.
TooManyStates {
/// The minimum number of states that are desired, which exceeds the
/// limit.
given: usize,
/// The limit on the number of states.
limit: usize,
},
/// An error that occurs when NFA compilation exceeds a configured heap
/// limit.
ExceededSizeLimit {
/// The configured limit, in bytes.
limit: usize,
},
/// An error that occurs when an invalid capture group index is added to
/// the NFA. An "invalid" index can be one that would otherwise overflow
/// a `usize` on the current target.
InvalidCaptureIndex {
/// The invalid index that was given.
index: u32,
},
/// An error that occurs when one tries to build a reverse NFA with
/// captures enabled. Currently, this isn't supported, but we probably
/// should support it at some point.
#[cfg(feature = "syntax")]
UnsupportedCaptures,
}
impl BuildError {
/// If this error occurred because the NFA exceeded the configured size
/// limit before being built, then this returns the configured size limit.
///
/// The limit returned is what was configured, and corresponds to the
/// maximum amount of heap usage in bytes.
pub fn size_limit(&self) -> Option<usize> {
match self.kind {
BuildErrorKind::ExceededSizeLimit { limit } => Some(limit),
_ => None,
}
}
fn kind(&self) -> &BuildErrorKind {
&self.kind
}
#[cfg(feature = "syntax")]
pub(crate) fn syntax(err: regex_syntax::Error) -> BuildError {
BuildError { kind: BuildErrorKind::Syntax(err) }
}
pub(crate) fn captures(err: captures::GroupInfoError) -> BuildError {
BuildError { kind: BuildErrorKind::Captures(err) }
}
pub(crate) fn word(err: look::UnicodeWordBoundaryError) -> BuildError {
BuildError { kind: BuildErrorKind::Word(err) }
}
pub(crate) fn too_many_patterns(given: usize) -> BuildError {
let limit = PatternID::LIMIT;
BuildError { kind: BuildErrorKind::TooManyPatterns { given, limit } }
}
pub(crate) fn too_many_states(given: usize) -> BuildError {
let limit = StateID::LIMIT;
BuildError { kind: BuildErrorKind::TooManyStates { given, limit } }
}
pub(crate) fn exceeded_size_limit(limit: usize) -> BuildError {
BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } }
}
pub(crate) fn invalid_capture_index(index: u32) -> BuildError {
BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } }
}
#[cfg(feature = "syntax")]
pub(crate) fn unsupported_captures() -> BuildError {
BuildError { kind: BuildErrorKind::UnsupportedCaptures }
}
}
#[cfg(feature = "std")]
impl std::error::Error for BuildError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self.kind() {
#[cfg(feature = "syntax")]
BuildErrorKind::Syntax(ref err) => Some(err),
BuildErrorKind::Captures(ref err) => Some(err),
_ => None,
}
}
}
impl core::fmt::Display for BuildError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self.kind() {
#[cfg(feature = "syntax")]
BuildErrorKind::Syntax(_) => write!(f, "error parsing regex"),
BuildErrorKind::Captures(_) => {
write!(f, "error with capture groups")
}
BuildErrorKind::Word(_) => {
write!(f, "NFA contains Unicode word boundary")
}
BuildErrorKind::TooManyPatterns { given, limit } => write!(
f,
"attempted to compile {given} patterns, \
which exceeds the limit of {limit}",
),
BuildErrorKind::TooManyStates { given, limit } => write!(
f,
"attempted to compile {given} NFA states, \
which exceeds the limit of {limit}",
),
BuildErrorKind::ExceededSizeLimit { limit } => write!(
f,
"heap usage during NFA compilation exceeded limit of {limit}",
),
BuildErrorKind::InvalidCaptureIndex { index } => write!(
f,
"capture group index {index} is invalid \
(too big or discontinuous)",
),
#[cfg(feature = "syntax")]
BuildErrorKind::UnsupportedCaptures => write!(
f,
"currently captures must be disabled when compiling \
a reverse NFA",
),
}
}
}

View File

@@ -0,0 +1,528 @@
use core::mem;
use alloc::{vec, vec::Vec};
use crate::{
nfa::thompson::{self, compiler::ThompsonRef, BuildError, Builder},
util::primitives::{IteratorIndexExt, StateID},
};
/// A trie that preserves leftmost-first match semantics.
///
/// This is a purpose-built data structure for optimizing 'lit1|lit2|..|litN'
/// patterns. It can *only* handle alternations of literals, which makes it
/// somewhat restricted in its scope, but literal alternations are fairly
/// common.
///
/// At a 5,000 foot level, the main idea of this trie is to make an alternation
/// of
/// literals look more like a DFA than an NFA via epsilon removal.
///
/// More precisely, the main issue is in how alternations are compiled into
/// a Thompson NFA. Namely, each alternation gets a single NFA "union" state
/// with an epsilon transition for every branch of the alternation pointing to
/// an NFA state corresponding to the start of that branch. The main problem
/// with this representation is the cost of computing an epsilon closure. Once
/// you hit the alternation's start state, it acts as a sort of "clog" that
/// requires you to traverse all of the epsilon transitions to compute the full
/// closure.
///
/// While fixing such clogs in the general case is pretty tricky without going
/// to a DFA (or perhaps a Glushkov NFA, but that comes with other problems),
/// at least in the case of an alternation of literals, we can convert that to
/// a prefix trie without too much cost. In theory, that's all you
/// really need to do: build the trie and then compile it to a Thompson NFA.
/// For example, if you have the pattern 'bar|baz|foo', then using a trie, it
/// is transformed to something like 'b(a(r|z))|f'. This reduces the clog by
/// reducing the number of epsilon transitions out of the alternation's start
/// state from 3 to 2 (it actually gets down to 1 when you use a sparse state,
/// which we do below). It's a small effect here, but when your alternation is
/// huge, the savings is also huge.
///
/// And that is... essentially what a LiteralTrie does. But there is one
/// hiccup. Consider a regex like 'sam|samwise'. How does a prefix trie compile
/// that when leftmost-first semantics are used? If 'sam|samwise' was the
/// entire regex, then you could just drop the 'samwise' branch entirely since
/// it is impossible to match ('sam' will always take priority, and since it
/// is a prefix of 'samwise', 'samwise' will never match). But what about the
/// regex '\b(sam|samwise)\b'? In that case, you can't remove 'samwise' because
/// it might match when 'sam' doesn't fall on a word boundary.
///
/// The main idea is that 'sam|samwise' can be translated to 'sam(?:|wise)',
/// which is a precisely equivalent regex that also gets rid of the clog.
///
/// Another example is 'zapper|z|zap'. That gets translated to
/// 'z(?:apper||ap)'.
///
/// We accomplish this by giving each state in the trie multiple "chunks" of
/// transitions. Each chunk barrier represents a match. The idea is that once
/// you know a match occurs, none of the transitions after the match can be
/// re-ordered and mixed in with the transitions before the match. Otherwise,
/// the match semantics could be changed.
///
/// See the 'State' data type for a bit more detail.
///
/// Future work:
///
/// * In theory, it would be nice to generalize the idea of removing clogs and
/// apply it to the NFA graph itself. Then this could in theory work for
/// case insensitive alternations of literals, or even just alternations where
/// each branch starts with a non-epsilon transition.
/// * Could we instead use the Aho-Corasick algorithm here? The aho-corasick
/// crate deals with leftmost-first matches correctly, but I think this implies
/// encoding failure transitions into a Thompson NFA somehow. Which seems fine,
/// because failure transitions are just unconditional epsilon transitions?
/// * Or perhaps even better, could we use an aho_corasick::AhoCorasick
/// directly? At time of writing, 0.7 is the current version of the
/// aho-corasick crate, and that definitely cannot be used as-is. But if we
/// expose the underlying finite state machine API, then could we use it? That
/// would be super. If we could figure that out, it might also lend itself to
/// more general composition of finite state machines.
#[derive(Clone)]
pub(crate) struct LiteralTrie {
/// The set of trie states. Each state contains one or more chunks, where
/// each chunk is a sparse set of transitions to other states. A leaf state
/// is always a match state that contains only empty chunks (i.e., no
/// transitions).
states: Vec<State>,
/// Whether to add literals in reverse to the trie. Useful when building
/// a reverse NFA automaton.
rev: bool,
}
impl LiteralTrie {
/// Create a new literal trie that adds literals in the forward direction.
pub(crate) fn forward() -> LiteralTrie {
let root = State::default();
LiteralTrie { states: vec![root], rev: false }
}
/// Create a new literal trie that adds literals in reverse.
pub(crate) fn reverse() -> LiteralTrie {
let root = State::default();
LiteralTrie { states: vec![root], rev: true }
}
/// Add the given literal to this trie.
///
/// If the literal could not be added because the `StateID` space was
    /// exhausted, then an error is returned. If an error is returned, the trie
    /// is left in an unspecified state.
pub(crate) fn add(&mut self, bytes: &[u8]) -> Result<(), BuildError> {
let mut prev = StateID::ZERO;
let mut it = bytes.iter().copied();
while let Some(b) = if self.rev { it.next_back() } else { it.next() } {
prev = self.get_or_add_state(prev, b)?;
}
self.states[prev].add_match();
Ok(())
}
/// If the given transition is defined, then return the next state ID.
/// Otherwise, add the transition to `from` and point it to a new state.
///
/// If a new state ID could not be allocated, then an error is returned.
fn get_or_add_state(
&mut self,
from: StateID,
byte: u8,
) -> Result<StateID, BuildError> {
let active = self.states[from].active_chunk();
match active.binary_search_by_key(&byte, |t| t.byte) {
Ok(i) => Ok(active[i].next),
Err(i) => {
// Add a new state and get its ID.
let next = StateID::new(self.states.len()).map_err(|_| {
BuildError::too_many_states(self.states.len())
})?;
self.states.push(State::default());
// Offset our position to account for all transitions and not
// just the ones in the active chunk.
let i = self.states[from].active_chunk_start() + i;
let t = Transition { byte, next };
self.states[from].transitions.insert(i, t);
Ok(next)
}
}
}
/// Compile this literal trie to the NFA builder given.
///
/// This forwards any errors that may occur while using the given builder.
pub(crate) fn compile(
&self,
builder: &mut Builder,
) -> Result<ThompsonRef, BuildError> {
// Compilation proceeds via depth-first traversal of the trie.
//
// This is overall pretty brutal. The recursive version of this is
// deliciously simple. (See 'compile_to_hir' below for what it might
// look like.) But recursion on a trie means your call stack grows
// in accordance with the longest literal, which just does not seem
// appropriate. So we push the call stack to the heap. But as a result,
// the trie traversal becomes pretty brutal because we essentially
// have to encode the state of a double for-loop into an explicit call
// frame. If someone can simplify this without using recursion, that'd
// be great.
        // 'end' is our match state for this trie, but represented in the
// NFA. Any time we see a match in the trie, we insert a transition
// from the current state we're in to 'end'.
let end = builder.add_empty()?;
let mut stack = vec![];
let mut f = Frame::new(&self.states[StateID::ZERO]);
loop {
if let Some(t) = f.transitions.next() {
if self.states[t.next].is_leaf() {
f.sparse.push(thompson::Transition {
start: t.byte,
end: t.byte,
next: end,
});
} else {
f.sparse.push(thompson::Transition {
start: t.byte,
end: t.byte,
// This is a little funny, but when the frame we create
// below completes, it will pop this parent frame off
// and modify this transition to point to the correct
// state.
next: StateID::ZERO,
});
stack.push(f);
f = Frame::new(&self.states[t.next]);
}
continue;
}
// At this point, we have visited all transitions in f.chunk, so
// add it as a sparse NFA state. Unless the chunk was empty, in
// which case, we don't do anything.
if !f.sparse.is_empty() {
let chunk_id = if f.sparse.len() == 1 {
builder.add_range(f.sparse.pop().unwrap())?
} else {
let sparse = mem::replace(&mut f.sparse, vec![]);
builder.add_sparse(sparse)?
};
f.union.push(chunk_id);
}
// Now we need to look to see if there are other chunks to visit.
if let Some(chunk) = f.chunks.next() {
// If we're here, it means we're on the second (or greater)
// chunk, which implies there is a match at this point. So
// connect this state to the final end state.
f.union.push(end);
// Advance to the next chunk.
f.transitions = chunk.iter();
continue;
}
// Now that we are out of chunks, we have completely visited
// this state. So turn our union of chunks into an NFA union
// state, and add that union state to the parent state's current
// sparse state. (If there is no parent, we're done.)
let start = builder.add_union(f.union)?;
match stack.pop() {
None => {
return Ok(ThompsonRef { start, end });
}
Some(mut parent) => {
// OK because the only way a frame gets pushed on to the
// stack (aside from the root) is when a transition has
// been added to 'sparse'.
parent.sparse.last_mut().unwrap().next = start;
f = parent;
}
}
}
}
/// Converts this trie to an equivalent HIR expression.
///
/// We don't actually use this, but it's useful for tests. In particular,
/// it provides a (somewhat) human readable representation of the trie
/// itself.
#[cfg(test)]
fn compile_to_hir(&self) -> regex_syntax::hir::Hir {
self.compile_state_to_hir(StateID::ZERO)
}
/// The recursive implementation of 'to_hir'.
///
/// Notice how simple this is compared to 'compile' above. 'compile' could
/// be similarly simple, but we opt to not use recursion in order to avoid
/// overflowing the stack in the case of a longer literal.
#[cfg(test)]
fn compile_state_to_hir(&self, sid: StateID) -> regex_syntax::hir::Hir {
use regex_syntax::hir::Hir;
let mut alt = vec![];
for (i, chunk) in self.states[sid].chunks().enumerate() {
if i > 0 {
alt.push(Hir::empty());
}
if chunk.is_empty() {
continue;
}
let mut chunk_alt = vec![];
for t in chunk.iter() {
chunk_alt.push(Hir::concat(vec![
Hir::literal(vec![t.byte]),
self.compile_state_to_hir(t.next),
]));
}
alt.push(Hir::alternation(chunk_alt));
}
Hir::alternation(alt)
}
}
impl core::fmt::Debug for LiteralTrie {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
writeln!(f, "LiteralTrie(")?;
for (sid, state) in self.states.iter().with_state_ids() {
writeln!(f, "{:06?}: {:?}", sid.as_usize(), state)?;
}
writeln!(f, ")")?;
Ok(())
}
}
/// An explicit stack frame used for traversing the trie without using
/// recursion.
///
/// Each frame is tied to the traversal of a single trie state. The frame is
/// dropped once the entire state (and all of its children) have been visited.
/// The "output" of compiling a state is the 'union' vector, which is turn
/// converted to a NFA union state. Each branch of the union corresponds to a
/// chunk in the trie state.
///
/// 'sparse' corresponds to the set of transitions for a particular chunk in a
/// trie state. It is ultimately converted to an NFA sparse state. The 'sparse'
/// field, after being converted to a sparse NFA state, is reused for any
/// subsequent chunks in the trie state, if any exist.
#[derive(Debug)]
struct Frame<'a> {
/// The remaining chunks to visit for a trie state.
chunks: StateChunksIter<'a>,
/// The transitions of the current chunk that we're iterating over. Since
/// every trie state has at least one chunk, every frame is initialized
/// with the first chunk's transitions ready to be consumed.
transitions: core::slice::Iter<'a, Transition>,
/// The NFA state IDs pointing to the start of each chunk compiled by
/// this trie state. This ultimately gets converted to an NFA union once
/// the entire trie state (and all of its children) have been compiled.
/// The order of these matters for leftmost-first match semantics, since
/// earlier matches in the union are preferred over later ones.
union: Vec<StateID>,
/// The actual NFA transitions for a single chunk in a trie state. This
/// gets converted to an NFA sparse state, and its corresponding NFA state
/// ID should get added to 'union'.
sparse: Vec<thompson::Transition>,
}
impl<'a> Frame<'a> {
/// Create a new stack frame for trie traversal. This initializes the
/// 'transitions' iterator to the transitions for the first chunk, with the
/// 'chunks' iterator being every chunk after the first one.
fn new(state: &'a State) -> Frame<'a> {
let mut chunks = state.chunks();
// every state has at least 1 chunk
let chunk = chunks.next().unwrap();
let transitions = chunk.iter();
Frame { chunks, transitions, union: vec![], sparse: vec![] }
}
}
/// A state in a trie.
///
/// This uses a sparse representation. Since we don't use literal tries
/// for searching, and since compilation ultimately requires visiting every
/// transition anyway, we use a sparse representation for transitions. This
/// means we save on memory, at the expense of 'LiteralTrie::add' being perhaps
/// a bit slower.
///
/// While 'transitions' is pretty standard as far as tries goes, the 'chunks'
/// piece here is more unusual. In effect, 'chunks' defines a partitioning
/// of 'transitions', where each chunk corresponds to a distinct set of
/// transitions. The key invariant is that a transition in one chunk cannot
/// be moved to another chunk. This is the secret sauce that preserves
/// leftmost-first match semantics.
///
/// A new chunk is added whenever we mark a state as a match state. Once a
/// new chunk is added, the old active chunk is frozen and is never mutated
/// again. The new chunk becomes the active chunk, which is defined as
/// '&transitions[chunks.last().map_or(0, |c| c.1)..]'. Thus, a state where
/// 'chunks' is empty actually contains one chunk. Thus, every state contains
/// at least one (possibly empty) chunk.
///
/// A "leaf" state is a state that has no outgoing transitions (so
/// 'transitions' is empty). Note that there is no way for a leaf state to be a
/// non-matching state. (Although while building the trie, within 'add', a leaf
/// state may exist while not containing any matches. But this invariant is
/// only broken within 'add'. Once 'add' returns, the invariant is upheld.)
#[derive(Clone, Default)]
struct State {
transitions: Vec<Transition>,
chunks: Vec<(usize, usize)>,
}
impl State {
/// Mark this state as a match state and freeze the active chunk such that
/// it can not be further mutated.
fn add_match(&mut self) {
// This is not strictly necessary, but there's no point in recording
// another match by adding another chunk if the state has no
// transitions. Note though that we only skip this if we already know
// this is a match state, which is only true if 'chunks' is not empty.
// Basically, if we didn't do this, nothing semantically would change,
// but we'd end up pushing another chunk and potentially triggering an
// alloc.
if self.transitions.is_empty() && !self.chunks.is_empty() {
return;
}
let chunk_start = self.active_chunk_start();
let chunk_end = self.transitions.len();
self.chunks.push((chunk_start, chunk_end));
}
/// Returns true if and only if this state is a leaf state. That is, a
/// state that has no outgoing transitions.
fn is_leaf(&self) -> bool {
self.transitions.is_empty()
}
/// Returns an iterator over all of the chunks (including the currently
/// active chunk) in this state. Since the active chunk is included, the
/// iterator is guaranteed to always yield at least one chunk (although the
/// chunk may be empty).
fn chunks(&self) -> StateChunksIter<'_> {
StateChunksIter {
transitions: &*self.transitions,
chunks: self.chunks.iter(),
active: Some(self.active_chunk()),
}
}
/// Returns the active chunk as a slice of transitions.
fn active_chunk(&self) -> &[Transition] {
let start = self.active_chunk_start();
&self.transitions[start..]
}
/// Returns the index into 'transitions' where the active chunk starts.
fn active_chunk_start(&self) -> usize {
self.chunks.last().map_or(0, |&(_, end)| end)
}
}
impl core::fmt::Debug for State {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let mut spacing = " ";
for (i, chunk) in self.chunks().enumerate() {
if i > 0 {
write!(f, "{spacing}MATCH")?;
}
spacing = "";
for (j, t) in chunk.iter().enumerate() {
spacing = " ";
if j == 0 && i > 0 {
write!(f, " ")?;
} else if j > 0 {
write!(f, ", ")?;
}
write!(f, "{t:?}")?;
}
}
Ok(())
}
}
/// An iterator over all of the chunks in a state, including the active chunk.
///
/// This iterator is created by `State::chunks`. We name this iterator so that
/// we can include it in the `Frame` type for non-recursive trie traversal.
#[derive(Debug)]
struct StateChunksIter<'a> {
transitions: &'a [Transition],
chunks: core::slice::Iter<'a, (usize, usize)>,
active: Option<&'a [Transition]>,
}
impl<'a> Iterator for StateChunksIter<'a> {
type Item = &'a [Transition];
fn next(&mut self) -> Option<&'a [Transition]> {
if let Some(&(start, end)) = self.chunks.next() {
return Some(&self.transitions[start..end]);
}
if let Some(chunk) = self.active.take() {
return Some(chunk);
}
None
}
}
/// A single transition in a trie to another state.
#[derive(Clone, Copy)]
struct Transition {
byte: u8,
next: StateID,
}
impl core::fmt::Debug for Transition {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(
f,
"{:?} => {}",
crate::util::escape::DebugByte(self.byte),
self.next.as_usize()
)
}
}
#[cfg(test)]
mod tests {
use bstr::B;
use regex_syntax::hir::Hir;
use super::*;
#[test]
fn zap() {
let mut trie = LiteralTrie::forward();
trie.add(b"zapper").unwrap();
trie.add(b"z").unwrap();
trie.add(b"zap").unwrap();
let got = trie.compile_to_hir();
let expected = Hir::concat(vec![
Hir::literal(B("z")),
Hir::alternation(vec![
Hir::literal(B("apper")),
Hir::empty(),
Hir::literal(B("ap")),
]),
]);
assert_eq!(expected, got);
}
#[test]
fn maker() {
let mut trie = LiteralTrie::forward();
trie.add(b"make").unwrap();
trie.add(b"maple").unwrap();
trie.add(b"maker").unwrap();
let got = trie.compile_to_hir();
let expected = Hir::concat(vec![
Hir::literal(B("ma")),
Hir::alternation(vec![
Hir::concat(vec![
Hir::literal(B("ke")),
Hir::alternation(vec![Hir::empty(), Hir::literal(B("r"))]),
]),
Hir::literal(B("ple")),
]),
]);
assert_eq!(expected, got);
}
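
    // A small sketch added for illustration (not part of the upstream test
    // suite): the 'sam|samwise' example from the comments above. The 'wise'
    // continuation lives in a chunk *after* the match, which is what
    // preserves leftmost-first semantics.
    #[test]
    fn samwise() {
        let mut trie = LiteralTrie::forward();
        trie.add(b"sam").unwrap();
        trie.add(b"samwise").unwrap();
        let got = trie.compile_to_hir();
        let expected = Hir::concat(vec![
            Hir::literal(B("sam")),
            Hir::alternation(vec![Hir::empty(), Hir::literal(B("wise"))]),
        ]);
        assert_eq!(expected, got);
    }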
}

View File

@@ -0,0 +1,296 @@
// This module contains a couple simple and purpose built hash maps. The key
// trade off they make is that they serve as caches rather than true maps. That
// is, inserting a new entry may cause eviction of another entry. This gives
// us two things. First, there's less overhead associated with inserts and
// lookups. Secondly, it lets us control our memory usage.
//
// These maps are used in some fairly hot code when generating NFA states for
// large Unicode character classes.
//
// Instead of exposing a rich hashmap entry API, we just permit the caller to
// produce a hash of the key directly. The hash can then be reused for both
// lookups and insertions at the cost of leaking abstraction a bit. But these
// are for internal use only, so it's fine.
//
// The Utf8BoundedMap is used for Daciuk's algorithm for constructing an
// (almost) minimal DFA for large Unicode character classes in linear time.
// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
// since there's a bit more expense in the reverse direction.)
//
// The Utf8SuffixMap is used when compiling large Unicode character classes for
// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
// construction of UTF-8 automata by caching common suffixes. This doesn't
// get the same space savings as Daciuk's algorithm, but it's basically as
// fast as the naive approach and typically winds up using less memory (since
// it generates smaller NFAs) despite the presence of the cache.
//
// These maps effectively represent caching mechanisms for sparse and
// byte-range NFA states, respectively. The former represents a single NFA
// state with many transitions of equivalent priority while the latter
// represents a single NFA state with a single transition. (Neither state ever
// has or is an epsilon transition.) Thus, they have different key types. It's
// likely we could make one generic map, but the machinery didn't seem worth
// it. They are simple enough.
use alloc::{vec, vec::Vec};
use crate::{
nfa::thompson::Transition,
util::{
int::{Usize, U64},
primitives::StateID,
},
};
// Basic FNV-1a hash constants as described in:
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
const PRIME: u64 = 1099511628211;
const INIT: u64 = 14695981039346656037;
/// A bounded hash map where the key is a sequence of NFA transitions and the
/// value is a pre-existing NFA state ID.
///
/// std's hashmap can be used for this; however, this map has two important
/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
/// control our memory usage by limiting the number of slots. In general, the
/// cost here is that this map acts as a cache. That is, inserting a new entry
/// may remove an old entry. We are okay with this, since it does not impact
/// correctness in the cases where it is used. The only effect that dropping
/// states from the cache has is that the resulting NFA generated may be bigger
/// than it otherwise would be.
///
/// This improves benchmarks that compile large Unicode character classes,
/// since it makes the generation of (almost) minimal UTF-8 automata faster.
/// Specifically, one could observe the difference with std's hashmap via
/// something like the following benchmark:
///
/// hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'"
///
/// But to observe that difference, you'd have to modify the code to use
/// std's hashmap.
///
/// It is quite possible that there is a better way to approach this problem.
/// For example, if there happens to be a very common state that collides with
/// a lot of less frequent states, then we could wind up with very poor caching
/// behavior. Alas, the effectiveness of this cache has not been measured.
/// Instead, ad hoc experiments suggest that it is "good enough." Additional
/// smarts (such as an LRU eviction policy) have to be weighed against the
/// amount of extra time they cost.
#[derive(Clone, Debug)]
pub struct Utf8BoundedMap {
/// The current version of this map. Only entries with matching versions
/// are considered during lookups. If an entry is found with a mismatched
/// version, then the map behaves as if the entry does not exist.
///
/// This makes it possible to clear the map by simply incrementing the
/// version number instead of actually deallocating any storage.
version: u16,
/// The total number of entries this map can store.
capacity: usize,
/// The actual entries, keyed by hash. Collisions between different states
/// result in the old state being dropped.
map: Vec<Utf8BoundedEntry>,
}
/// An entry in this map.
#[derive(Clone, Debug, Default)]
struct Utf8BoundedEntry {
/// The version of the map used to produce this entry. If this entry's
/// version does not match the current version of the map, then the map
/// should behave as if this entry does not exist.
version: u16,
/// The key, which is a sorted sequence of non-overlapping NFA transitions.
key: Vec<Transition>,
/// The state ID corresponding to the state containing the transitions in
/// this entry.
val: StateID,
}
impl Utf8BoundedMap {
/// Create a new bounded map with the given capacity. The map will never
/// grow beyond the given size.
///
/// Note that this does not allocate. Instead, callers must call `clear`
/// before using this map. `clear` will allocate space if necessary.
///
/// This avoids the need to pay for the allocation of this map when
/// compiling regexes that lack large Unicode character classes.
pub fn new(capacity: usize) -> Utf8BoundedMap {
assert!(capacity > 0);
Utf8BoundedMap { version: 0, capacity, map: vec![] }
}
/// Clear this map of all entries, but permit the reuse of allocation
/// if possible.
///
/// This must be called before the map can be used.
pub fn clear(&mut self) {
if self.map.is_empty() {
self.map = vec![Utf8BoundedEntry::default(); self.capacity];
} else {
self.version = self.version.wrapping_add(1);
// If we loop back to version 0, then we forcefully clear the
// entire map. Otherwise, it might be possible to incorrectly
// match entries used to generate other NFAs.
if self.version == 0 {
self.map = vec![Utf8BoundedEntry::default(); self.capacity];
}
}
}
/// Return a hash of the given transitions.
pub fn hash(&self, key: &[Transition]) -> usize {
let mut h = INIT;
for t in key {
h = (h ^ u64::from(t.start)).wrapping_mul(PRIME);
h = (h ^ u64::from(t.end)).wrapping_mul(PRIME);
h = (h ^ t.next.as_u64()).wrapping_mul(PRIME);
}
(h % self.map.len().as_u64()).as_usize()
}
/// Retrieve the cached state ID corresponding to the given key. The hash
/// given must have been computed with `hash` using the same key value.
///
/// If there is no cached state with the given transitions, then None is
/// returned.
pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
let entry = &self.map[hash];
if entry.version != self.version {
return None;
}
// There may be a hash collision, so we need to confirm real equality.
if entry.key != key {
return None;
}
Some(entry.val)
}
/// Add a cached state to this map with the given key. Callers should
/// ensure that `state_id` points to a state that contains precisely the
/// NFA transitions given.
///
/// `hash` must have been computed using the `hash` method with the same
/// key.
pub fn set(
&mut self,
key: Vec<Transition>,
hash: usize,
state_id: StateID,
) {
self.map[hash] =
Utf8BoundedEntry { version: self.version, key, val: state_id };
}
}
/// A cache of suffixes used to modestly compress UTF-8 automata for large
/// Unicode character classes.
#[derive(Clone, Debug)]
pub struct Utf8SuffixMap {
/// The current version of this map. Only entries with matching versions
/// are considered during lookups. If an entry is found with a mismatched
/// version, then the map behaves as if the entry does not exist.
version: u16,
/// The total number of entries this map can store.
capacity: usize,
/// The actual entries, keyed by hash. Collisions between different states
/// result in the old state being dropped.
map: Vec<Utf8SuffixEntry>,
}
/// A key that uniquely identifies an NFA state. It is a triple that represents
/// a transition from one state for a particular byte range.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct Utf8SuffixKey {
pub from: StateID,
pub start: u8,
pub end: u8,
}
/// An entry in this map.
#[derive(Clone, Debug, Default)]
struct Utf8SuffixEntry {
/// The version of the map used to produce this entry. If this entry's
/// version does not match the current version of the map, then the map
/// should behave as if this entry does not exist.
version: u16,
/// The key, which consists of a transition in a particular state.
key: Utf8SuffixKey,
/// The identifier that the transition in the key maps to.
val: StateID,
}
impl Utf8SuffixMap {
/// Create a new bounded map with the given capacity. The map will never
/// grow beyond the given size.
///
/// Note that this does not allocate. Instead, callers must call `clear`
/// before using this map. `clear` will allocate space if necessary.
///
/// This avoids the need to pay for the allocation of this map when
/// compiling regexes that lack large Unicode character classes.
pub fn new(capacity: usize) -> Utf8SuffixMap {
assert!(capacity > 0);
Utf8SuffixMap { version: 0, capacity, map: vec![] }
}
/// Clear this map of all entries, but permit the reuse of allocation
/// if possible.
///
/// This must be called before the map can be used.
pub fn clear(&mut self) {
if self.map.is_empty() {
self.map = vec![Utf8SuffixEntry::default(); self.capacity];
} else {
self.version = self.version.wrapping_add(1);
if self.version == 0 {
self.map = vec![Utf8SuffixEntry::default(); self.capacity];
}
}
}
/// Return a hash of the given transition.
pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
// Basic FNV-1a hash as described:
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
const PRIME: u64 = 1099511628211;
const INIT: u64 = 14695981039346656037;
let mut h = INIT;
h = (h ^ key.from.as_u64()).wrapping_mul(PRIME);
h = (h ^ u64::from(key.start)).wrapping_mul(PRIME);
h = (h ^ u64::from(key.end)).wrapping_mul(PRIME);
(h % self.map.len().as_u64()).as_usize()
}
/// Retrieve the cached state ID corresponding to the given key. The hash
/// given must have been computed with `hash` using the same key value.
///
/// If there is no cached state with the given key, then None is returned.
pub fn get(
&mut self,
key: &Utf8SuffixKey,
hash: usize,
) -> Option<StateID> {
let entry = &self.map[hash];
if entry.version != self.version {
return None;
}
if key != &entry.key {
return None;
}
Some(entry.val)
}
/// Add a cached state to this map with the given key. Callers should
/// ensure that `state_id` points to a state that contains precisely the
/// NFA transition given.
///
/// `hash` must have been computed using the `hash` method with the same
/// key.
pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
self.map[hash] =
Utf8SuffixEntry { version: self.version, key, val: state_id };
}
}
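
#[cfg(test)]
mod tests {
    use super::*;

    // A small sketch added for illustration (not part of the upstream test
    // suite): demonstrates the expected clear/hash/set/get round trip for
    // Utf8BoundedMap. The capacity and the transition used are arbitrary.
    #[test]
    fn bounded_map_round_trip() {
        let mut map = Utf8BoundedMap::new(10);
        // 'clear' must be called before first use; it allocates the slots.
        map.clear();
        let key =
            vec![Transition { start: b'a', end: b'z', next: StateID::ZERO }];
        let hash = map.hash(&key);
        assert_eq!(None, map.get(&key, hash));
        map.set(key.clone(), hash, StateID::ZERO);
        assert_eq!(Some(StateID::ZERO), map.get(&key, hash));
    }
}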

View File

@@ -0,0 +1,81 @@
/*!
Defines a Thompson NFA and provides the [`PikeVM`](pikevm::PikeVM) and
[`BoundedBacktracker`](backtrack::BoundedBacktracker) regex engines.
A Thompson NFA (non-deterministic finite automaton) is arguably _the_ central
data type in this library. It is the result of what is commonly referred to as
"regex compilation." That is, turning a regex pattern from its concrete syntax
string into something that can run a search looks roughly like this:
* A `&str` is parsed into a [`regex-syntax::ast::Ast`](regex_syntax::ast::Ast).
* An `Ast` is translated into a [`regex-syntax::hir::Hir`](regex_syntax::hir::Hir).
* An `Hir` is compiled into an [`NFA`].
* The `NFA` is then used to build one of a few different regex engines:
* An `NFA` is used directly in the `PikeVM` and `BoundedBacktracker` engines.
* An `NFA` is used by a [hybrid NFA/DFA](crate::hybrid) to build out a DFA's
transition table at search time.
* An `NFA`, assuming it is one-pass, is used to build a full
[one-pass DFA](crate::dfa::onepass) ahead of time.
* An `NFA` is used to build a [full DFA](crate::dfa) ahead of time.
The [`meta`](crate::meta) regex engine makes all of these choices for you based
on various criteria. However, if you have a lower level use case, _you_ can
build any of the above regex engines and use them directly. But you must start
here by building an `NFA`.
# Details
It is perhaps worth expanding a bit more on what it means to go through the
`&str`->`Ast`->`Hir`->`NFA` process.
* Parsing a string into an `Ast` gives it a structured representation.
Crucially, the size and amount of work done in this step is proportional to the
size of the original string. No optimization or Unicode handling is done at
this point. This means that parsing into an `Ast` has very predictable costs.
Moreover, an `Ast` can be round-tripped back to its original pattern string as
written.
* Translating an `Ast` into an `Hir` is a process by which the structured
representation is simplified down to its most fundamental components.
Translation deals with flags such as case insensitivity by converting things
like `(?i:a)` to `[Aa]`. Translation is also where Unicode tables are consulted
to resolve things like `\p{Emoji}` and `\p{Greek}`. It also flattens each
character class, regardless of how deeply nested it is, into a single sequence
of non-overlapping ranges. All the various literal forms are thrown out in
favor of one common representation. Overall, the `Hir` is small enough to fit
into your head and makes analysis and other tasks much simpler.
* Compiling an `Hir` into an `NFA` formulates the regex into a finite state
machine whose transitions are defined over bytes. For example, an `Hir` might
have a Unicode character class corresponding to a sequence of ranges defined
in terms of `char`. Compilation is then responsible for turning those ranges
into a UTF-8 automaton. That is, an automaton that matches the UTF-8 encoding
of just the codepoints specified by those ranges. Otherwise, the main job of
an `NFA` is to serve as a byte-code of sorts for a virtual machine. It can be
seen as a sequence of instructions for how to match a regex.
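
As a minimal sketch (assuming the `syntax` feature is enabled), running a
pattern through that entire pipeline is a single call:

```
use regex_automata::nfa::thompson::NFA;

// Parses the pattern, translates it to an `Hir` and compiles it to an NFA.
let nfa = NFA::new(r"(?m)^[a-z]+$")?;
assert_eq!(1, nfa.pattern_len());
# Ok::<(), Box<dyn std::error::Error>>(())
```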
*/
#[cfg(feature = "nfa-backtrack")]
pub mod backtrack;
mod builder;
#[cfg(feature = "syntax")]
mod compiler;
mod error;
#[cfg(feature = "syntax")]
mod literal_trie;
#[cfg(feature = "syntax")]
mod map;
mod nfa;
#[cfg(feature = "nfa-pikevm")]
pub mod pikevm;
#[cfg(feature = "syntax")]
mod range_trie;
pub use self::{
builder::Builder,
error::BuildError,
nfa::{
DenseTransitions, PatternIter, SparseTransitions, State, Transition,
NFA,
},
};
#[cfg(feature = "syntax")]
pub use compiler::{Compiler, Config, WhichCaptures};

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

1139
vendor/regex-automata/src/util/alphabet.rs vendored Normal file

File diff suppressed because it is too large

2551
vendor/regex-automata/src/util/captures.rs vendored Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,682 @@
/*!
This module contains types and routines for implementing determinization.
In this crate, there are at least two places where we implement
determinization: fully ahead-of-time compiled DFAs in the `dfa` module and
lazily compiled DFAs in the `hybrid` module. The stuff in this module
corresponds to the things that are in common between these implementations.
There are three broad things that our implementations of determinization have
in common, as defined by this module:
* The classification of start states. That is, whether we're dealing with
word boundaries, line boundaries, etc., is all the same. This also includes
the look-behind assertions that are satisfied by each starting state
classification.
* The representation of DFA states as sets of NFA states, including
convenience types for building these DFA states that are amenable to reusing
allocations.
* Routines for the "classical" parts of determinization: computing the
epsilon closure, tracking match states (with corresponding pattern IDs, since
we support multi-pattern finite automata) and, of course, computing the
transition function between states for units of input.
I did consider a couple of alternatives to this particular form of code reuse:
1. Don't do any code reuse. The problem here is that we *really* want both
forms of determinization to do exactly identical things when it comes to
their handling of NFA states. While our tests generally ensure this, the code
is tricky and large enough that not reusing it would be a pretty big bummer.
2. Implement all of determinization once and make it generic over fully
compiled DFAs and lazily compiled DFAs. While I didn't actually try this
approach, my instinct is that it would be more complex than is needed here.
And the interface required would be pretty hairy. Instead, I think splitting
it into logical sub-components works better.
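As a deliberately simplified, self-contained sketch of the classical worklist
loop that powerset construction follows (toy types only; the real
implementations use `State`, the `StateBuilder*` types and a byte-keyed cache,
and also track match pattern IDs and look-around assertions as described
above):

```rust
use std::collections::{HashMap, VecDeque};

// `next_set` stands in for what `next` below does for real DFA states: given
// an ordered set of NFA state IDs and one input byte, produce the epsilon
// closed set of NFA states reachable over that byte.
fn determinize_toy(
    start: Vec<usize>,
    next_set: impl Fn(&[usize], u8) -> Vec<usize>,
) -> Vec<[usize; 256]> {
    let mut ids: HashMap<Vec<usize>, usize> = HashMap::new();
    let mut table: Vec<[usize; 256]> = Vec::new();
    let mut queue: VecDeque<Vec<usize>> = VecDeque::new();
    ids.insert(start.clone(), 0);
    table.push([0; 256]);
    queue.push_back(start);
    while let Some(set) = queue.pop_front() {
        let from = ids[&set];
        for byte in 0..=255u8 {
            let next = next_set(set.as_slice(), byte);
            // Reuse the ID of an equivalent, previously created DFA state if
            // one exists; otherwise allocate a new state and enqueue it.
            let to = *ids.entry(next.clone()).or_insert_with(|| {
                table.push([0; 256]);
                queue.push_back(next);
                table.len() - 1
            });
            table[from][usize::from(byte)] = to;
        }
    }
    table
}
```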
*/
use alloc::vec::Vec;
pub(crate) use self::state::{
State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA,
};
use crate::{
nfa::thompson,
util::{
alphabet,
look::{Look, LookSet},
primitives::StateID,
search::MatchKind,
sparse_set::{SparseSet, SparseSets},
start::Start,
utf8,
},
};
mod state;
/// Compute the set of all reachable NFA states, including the full epsilon
/// closure, from a DFA state for a single unit of input. The set of reachable
/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
/// also includes any look-behind assertions satisfied by `unit`, in addition
/// to whether it is a match state. For multi-pattern DFAs, the builder will
/// also include the pattern IDs that match (in the order seen).
///
/// `nfa` must be able to resolve any NFA state in `state` and any NFA state
/// reachable via the epsilon closure of any NFA state in `state`. `sparses`
/// must have capacity equivalent to `nfa.len()`.
///
/// `match_kind` should correspond to the match semantics implemented by the
/// DFA being built. Generally speaking, for leftmost-first match semantics,
/// states that appear after the first NFA match state will not be included in
/// the `StateBuilderNFA` returned since they are impossible to visit.
///
/// `sparses` is used as scratch space for NFA traversal. Other than their
/// capacity requirements (detailed above), there are no requirements on what's
/// contained within them (if anything). Similarly, what's inside of them once
/// this routine returns is unspecified.
///
/// `stack` must have length 0. It is used as scratch space for depth first
/// traversal. After returning, it is guaranteed that `stack` will have length
/// 0.
///
/// `state` corresponds to the current DFA state on which one wants to compute
/// the transition for the input `unit`.
///
/// `empty_builder` corresponds to the builder allocation to use to produce a
/// complete `StateBuilderNFA` state. If the state is not needed (or is already
/// cached), then it can be cleared and reused without needing to create a new
/// `State`. The `StateBuilderNFA` state returned is final and ready to be
/// turned into a `State` if necessary.
pub(crate) fn next(
nfa: &thompson::NFA,
match_kind: MatchKind,
sparses: &mut SparseSets,
stack: &mut Vec<StateID>,
state: &State,
unit: alphabet::Unit,
empty_builder: StateBuilderEmpty,
) -> StateBuilderNFA {
sparses.clear();
// Whether the NFA is matched in reverse or not. We use this in some
// conditional logic for dealing with the exceptionally annoying CRLF-aware
// line anchors.
let rev = nfa.is_reverse();
// The look-around matcher that our NFA is configured with. We don't
// actually use it to match look-around assertions, but we do need its
// configuration for constructing states consistent with how it matches.
let lookm = nfa.look_matcher();
// Put the NFA state IDs into a sparse set in case we need to
// re-compute their epsilon closure.
//
// Doing this state shuffling is technically not necessary unless some
// kind of look-around is used in the DFA. Some ad hoc experiments
// suggested that avoiding this didn't lead to much of an improvement,
// but perhaps more rigorous experimentation should be done. And in
// particular, avoiding this check requires some light refactoring of
// the code below.
state.iter_nfa_state_ids(|nfa_id| {
sparses.set1.insert(nfa_id);
});
// Compute look-ahead assertions originating from the current state. Based
// on the input unit we're transitioning over, some additional set of
// assertions may be true. Thus, we re-compute this state's epsilon closure
// (but only if necessary). Notably, when we build a DFA state initially,
// we don't enable any look-ahead assertions because we don't know whether
// they're true or not at that point.
if !state.look_need().is_empty() {
// Add look-ahead assertions that are now true based on the current
// input unit.
let mut look_have = state.look_have();
match unit.as_u8() {
Some(b'\r') => {
if !rev || !state.is_half_crlf() {
look_have = look_have.insert(Look::EndCRLF);
}
}
Some(b'\n') => {
if rev || !state.is_half_crlf() {
look_have = look_have.insert(Look::EndCRLF);
}
}
Some(_) => {}
None => {
look_have = look_have
.insert(Look::End)
.insert(Look::EndLF)
.insert(Look::EndCRLF);
}
}
if unit.is_byte(lookm.get_line_terminator()) {
look_have = look_have.insert(Look::EndLF);
}
if state.is_half_crlf()
&& ((rev && !unit.is_byte(b'\r'))
|| (!rev && !unit.is_byte(b'\n')))
{
look_have = look_have.insert(Look::StartCRLF);
}
if state.is_from_word() == unit.is_word_byte() {
look_have = look_have
.insert(Look::WordAsciiNegate)
.insert(Look::WordUnicodeNegate);
} else {
look_have =
look_have.insert(Look::WordAscii).insert(Look::WordUnicode);
}
if !unit.is_word_byte() {
look_have = look_have
.insert(Look::WordEndHalfAscii)
.insert(Look::WordEndHalfUnicode);
}
if state.is_from_word() && !unit.is_word_byte() {
look_have = look_have
.insert(Look::WordEndAscii)
.insert(Look::WordEndUnicode);
} else if !state.is_from_word() && unit.is_word_byte() {
look_have = look_have
.insert(Look::WordStartAscii)
.insert(Look::WordStartUnicode);
}
// If we have new assertions satisfied that are among the set of
// assertions that exist in this state (that is, just because we added
// an EndLF assertion above doesn't mean there is an EndLF conditional
// epsilon transition in this state), then we re-compute this state's
// epsilon closure using the updated set of assertions.
//
// Note that since our DFA states omit unconditional epsilon
// transitions, this check is necessary for correctness. If we re-did
// the epsilon closure below needlessly, it could change based on the
// fact that we omitted epsilon states originally.
if !look_have
.subtract(state.look_have())
.intersect(state.look_need())
.is_empty()
{
for nfa_id in sparses.set1.iter() {
epsilon_closure(
nfa,
nfa_id,
look_have,
stack,
&mut sparses.set2,
);
}
sparses.swap();
sparses.set2.clear();
}
}
// Convert our empty builder into one that can record assertions and match
// pattern IDs.
let mut builder = empty_builder.into_matches();
// Set whether the StartLF look-behind assertion is true for this
// transition or not. The look-behind assertion for ASCII word boundaries
// is handled below.
if nfa.look_set_any().contains_anchor_line()
&& unit.is_byte(lookm.get_line_terminator())
{
// Why only handle StartLF here and not Start? That's because Start
// can only impact the starting state, which is special cased in
// start state handling.
builder.set_look_have(|have| have.insert(Look::StartLF));
}
// We also need to add StartCRLF to our assertions too, if we can. This
// is unfortunately a bit more complicated, because it depends on the
// direction of the search. In the forward direction, ^ matches after a
// \n, but in the reverse direction, ^ only matches after a \r. (This is
    // further complicated by the fact that reversing a regex means changing a ^
// to a $ and vice versa.)
if nfa.look_set_any().contains_anchor_crlf()
&& ((rev && unit.is_byte(b'\r')) || (!rev && unit.is_byte(b'\n')))
{
builder.set_look_have(|have| have.insert(Look::StartCRLF));
}
// And also for the start-half word boundary assertions. As long as the
// look-behind byte is not a word char, then the assertions are satisfied.
if nfa.look_set_any().contains_word() && !unit.is_word_byte() {
builder.set_look_have(|have| {
have.insert(Look::WordStartHalfAscii)
.insert(Look::WordStartHalfUnicode)
});
}
for nfa_id in sparses.set1.iter() {
match *nfa.state(nfa_id) {
thompson::State::Union { .. }
| thompson::State::BinaryUnion { .. }
| thompson::State::Fail
| thompson::State::Look { .. }
| thompson::State::Capture { .. } => {}
thompson::State::Match { pattern_id } => {
// Notice here that we are calling the NEW state a match
// state if the OLD state we are transitioning from
// contains an NFA match state. This is precisely how we
// delay all matches by one byte and also what therefore
// guarantees that starting states cannot be match states.
//
// If we didn't delay matches by one byte, then whether
// a DFA is a matching state or not would be determined
// by whether one of its own constituent NFA states
// was a match state. (And that would be done in
// 'add_nfa_states'.)
//
// Also, 'add_match_pattern_id' requires that callers never
// pass duplicative pattern IDs. We do in fact uphold that
// guarantee here, but it's subtle. In particular, a Thompson
// NFA guarantees that each pattern has exactly one match
// state. Moreover, since we're iterating over the NFA state
// IDs in a set, we are guaranteed not to have any duplicative
// match states. Thus, it is impossible to add the same pattern
// ID more than once.
//
// N.B. We delay matches by 1 byte as a way to hack 1-byte
// look-around into DFA searches. This lets us support ^, $
// and ASCII-only \b. The delay is also why we need a special
// "end-of-input" (EOI) sentinel and why we need to follow the
// EOI sentinel at the end of every search. This final EOI
// transition is necessary to report matches found at the end
// of a haystack.
builder.add_match_pattern_id(pattern_id);
if !match_kind.continue_past_first_match() {
break;
}
}
thompson::State::ByteRange { ref trans } => {
if trans.matches_unit(unit) {
epsilon_closure(
nfa,
trans.next,
builder.look_have(),
stack,
&mut sparses.set2,
);
}
}
thompson::State::Sparse(ref sparse) => {
if let Some(next) = sparse.matches_unit(unit) {
epsilon_closure(
nfa,
next,
builder.look_have(),
stack,
&mut sparses.set2,
);
}
}
thompson::State::Dense(ref dense) => {
if let Some(next) = dense.matches_unit(unit) {
epsilon_closure(
nfa,
next,
builder.look_have(),
stack,
&mut sparses.set2,
);
}
}
}
}
// We only set the word byte if there's a word boundary look-around
// anywhere in this regex. Otherwise, there's no point in bloating the
// number of states if we don't have one.
//
// We also only set it when the state has a non-zero number of NFA states.
// Otherwise, we could wind up with states that *should* be DEAD states
// but are otherwise distinct from DEAD states because of this look-behind
// assertion being set. While this can't technically impact correctness *in
// theory*, it can create pathological DFAs that consume input until EOI or
// a quit byte is seen. Consuming until EOI isn't a correctness problem,
// but a (serious) perf problem. Hitting a quit byte, however, could be a
// correctness problem since it could cause search routines to report an
// error instead of a detected match once the quit state is entered. (The
// search routine could be made to be a bit smarter by reporting a match
// if one was detected once it enters a quit state (and indeed, the search
// routines in this crate do just that), but it seems better to prevent
// these things by construction if possible.)
if !sparses.set2.is_empty() {
if nfa.look_set_any().contains_word() && unit.is_word_byte() {
builder.set_is_from_word();
}
if nfa.look_set_any().contains_anchor_crlf()
&& ((rev && unit.is_byte(b'\n')) || (!rev && unit.is_byte(b'\r')))
{
builder.set_is_half_crlf();
}
}
let mut builder_nfa = builder.into_nfa();
add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
builder_nfa
}
/// Compute the epsilon closure for the given NFA state. The epsilon closure
/// consists of all NFA state IDs, including `start_nfa_id`, that can be
/// reached from `start_nfa_id` without consuming any input. These state IDs
/// are written to `set` in the order they are visited, but only if they are
/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA
/// given.
///
/// `look_have` consists of the satisfied assertions at the current
/// position. For conditional look-around epsilon transitions, these are
/// only followed if they are satisfied by `look_have`.
///
/// `stack` must have length 0. It is used as scratch space for depth first
/// traversal. After returning, it is guaranteed that `stack` will have length
/// 0.
pub(crate) fn epsilon_closure(
nfa: &thompson::NFA,
start_nfa_id: StateID,
look_have: LookSet,
stack: &mut Vec<StateID>,
set: &mut SparseSet,
) {
assert!(stack.is_empty());
// If this isn't an epsilon state, then the epsilon closure is always just
// itself, so there's no need to spin up the machinery below to handle it.
if !nfa.state(start_nfa_id).is_epsilon() {
set.insert(start_nfa_id);
return;
}
stack.push(start_nfa_id);
while let Some(mut id) = stack.pop() {
// In many cases, we can avoid stack operations when an NFA state only
// adds one new state to visit. In that case, we just set our ID to
// that state and mush on. We only use the stack when an NFA state
// introduces multiple new states to visit.
loop {
// Insert this NFA state, and if it's already in the set and thus
// already visited, then we can move on to the next one.
if !set.insert(id) {
break;
}
match *nfa.state(id) {
thompson::State::ByteRange { .. }
| thompson::State::Sparse { .. }
| thompson::State::Dense { .. }
| thompson::State::Fail
| thompson::State::Match { .. } => break,
thompson::State::Look { look, next } => {
if !look_have.contains(look) {
break;
}
id = next;
}
thompson::State::Union { ref alternates } => {
id = match alternates.get(0) {
None => break,
Some(&id) => id,
};
// We need to process our alternates in order to preserve
// match preferences, so put the earliest alternates closer
// to the top of the stack.
stack.extend(alternates[1..].iter().rev());
}
thompson::State::BinaryUnion { alt1, alt2 } => {
id = alt1;
stack.push(alt2);
}
thompson::State::Capture { next, .. } => {
id = next;
}
}
}
}
}
/// Add the NFA state IDs in the given `set` to the given DFA builder state.
/// The order in which states are added corresponds to the order in which they
/// were added to `set`.
///
/// The DFA builder state given should already have its complete set of match
/// pattern IDs added (if any) and any look-behind assertions (StartLF, Start
/// and whether this state is being generated for a transition over a word byte
/// when applicable) that are true immediately prior to transitioning into this
/// state (via `builder.look_have()`). The match pattern IDs should correspond
/// to matches that occurred on the previous transition, since all matches are
/// delayed by one byte. The things that should _not_ be set are look-ahead
/// assertions (EndLF, End and whether the next byte is a word byte or not).
/// The builder state should also not have anything in `look_need` set, as this
/// routine will compute that for you.
///
/// The given NFA should be able to resolve all identifiers in `set` to a
/// particular NFA state. Additionally, `set` must have capacity equivalent
/// to `nfa.len()`.
pub(crate) fn add_nfa_states(
nfa: &thompson::NFA,
set: &SparseSet,
builder: &mut StateBuilderNFA,
) {
for nfa_id in set.iter() {
match *nfa.state(nfa_id) {
thompson::State::ByteRange { .. } => {
builder.add_nfa_state_id(nfa_id);
}
thompson::State::Sparse { .. } => {
builder.add_nfa_state_id(nfa_id);
}
thompson::State::Dense { .. } => {
builder.add_nfa_state_id(nfa_id);
}
thompson::State::Look { look, .. } => {
builder.add_nfa_state_id(nfa_id);
builder.set_look_need(|need| need.insert(look));
}
thompson::State::Union { .. }
| thompson::State::BinaryUnion { .. } => {
// Pure epsilon transitions don't need to be tracked as part
// of the DFA state. Tracking them is actually superfluous;
// they won't cause any harm other than making determinization
// slower.
//
// Why aren't these needed? Well, in an NFA, epsilon
// transitions are really just jumping points to other states.
// So once you hit an epsilon transition, the same set of
// resulting states always appears. Therefore, putting them in
// a DFA's set of ordered NFA states is strictly redundant.
//
// Look-around states are also epsilon transitions, but
// they are *conditional*. So their presence could be
// discriminatory, and thus, they are tracked above.
//
// But wait... why are epsilon states in our `set` in the first
// place? Why not just leave them out? They're in our `set`
// because it was generated by computing an epsilon closure,
// and we want to keep track of all states we visited to avoid
// re-visiting them. In exchange, we have to do this second
// iteration over our collected states to finalize our DFA
// state. In theory, we could avoid this second iteration if
// we maintained two sets during epsilon closure: the set of
// visited states (to avoid cycles) and the set of states that
// will actually be used to construct the next DFA state.
//
// Note that this optimization requires that we re-compute the
// epsilon closure to account for look-ahead in 'next' *only
// when necessary*. Namely, only when the set of look-around
// assertions changes and only when those changes are within
// the set of assertions that are needed in order to step
// through the closure correctly. Otherwise, if we re-do the
// epsilon closure needlessly, it could change based on the
// fact that we are omitting epsilon states here.
//
// -----
//
// Welp, scratch the above. It turns out that recording these
// is in fact necessary to seemingly handle one particularly
// annoying case: when a conditional epsilon transition is
// put inside of a repetition operator. One specific case I
// ran into was the regex `(?:\b|%)+` on the haystack `z%`.
// The correct leftmost first matches are: [0, 0] and [1, 1].
// But the DFA was reporting [0, 0] and [1, 2]. To understand
// why this happens, consider the NFA for the aforementioned
// regex:
//
// >000000: binary-union(4, 1)
// 000001: \x00-\xFF => 0
// 000002: WordAscii => 5
// 000003: % => 5
// ^000004: binary-union(2, 3)
// 000005: binary-union(4, 6)
// 000006: MATCH(0)
//
// The problem here is that one of the DFA start states is
// going to consist of the NFA states [2, 3] by computing the
// epsilon closure of state 4. State 4 isn't included because
// we previously were not keeping track of union states. But
// only a subset of transitions out of this state will be able
// to follow WordAscii, and in those cases, the epsilon closure
// is redone. The only problem is that computing the epsilon
// closure from [2, 3] is different than computing the epsilon
// closure from [4]. In the former case, assuming the WordAscii
// assertion is satisfied, you get: [2, 3, 6]. In the latter
// case, you get: [2, 6, 3]. Notice that '6' is the match state
// and appears AFTER '3' in the former case. This leads to a
// preferential but incorrect match of '%' before returning
// a match. In the latter case, the match is preferred over
// continuing to accept the '%'.
//
// It almost feels like we might be able to fix the NFA states
// to avoid this, or to at least only keep track of union
// states where this actually matters, since in the vast
// majority of cases, this doesn't matter.
//
// Another alternative would be to define a new HIR property
// called "assertion is repeated anywhere" and compute it
// inductively over the entire pattern. If it happens anywhere,
// which is probably pretty rare, then we record union states.
// Otherwise we don't.
builder.add_nfa_state_id(nfa_id);
}
// Capture states we definitely do not need to record, since they
// are unconditional epsilon transitions with no branching.
thompson::State::Capture { .. } => {}
// It's not totally clear whether we need to record fail states or
// not, but we do so out of an abundance of caution. Since they are
// quite rare in practice, there isn't much cost to recording them.
thompson::State::Fail => {
builder.add_nfa_state_id(nfa_id);
}
thompson::State::Match { .. } => {
// Normally, the NFA match state doesn't actually need to
// be inside the DFA state. But since we delay matches by
// one byte, the matching DFA state corresponds to states
// that transition from the one we're building here. And
// the way we detect those cases is by looking for an NFA
// match state. See 'next' for how this is handled.
builder.add_nfa_state_id(nfa_id);
}
}
}
// If we know this state contains no look-around assertions, then
// there's no reason to track which look-around assertions were
// satisfied when this state was created.
if builder.look_need().is_empty() {
builder.set_look_have(|_| LookSet::empty());
}
}
/// Sets the appropriate look-behind assertions on the given state based on
/// this starting configuration.
pub(crate) fn set_lookbehind_from_start(
nfa: &thompson::NFA,
start: &Start,
builder: &mut StateBuilderMatches,
) {
let rev = nfa.is_reverse();
let lineterm = nfa.look_matcher().get_line_terminator();
let lookset = nfa.look_set_any();
match *start {
Start::NonWordByte => {
if lookset.contains_word() {
builder.set_look_have(|have| {
have.insert(Look::WordStartHalfAscii)
.insert(Look::WordStartHalfUnicode)
});
}
}
Start::WordByte => {
if lookset.contains_word() {
builder.set_is_from_word();
}
}
Start::Text => {
if lookset.contains_anchor_haystack() {
builder.set_look_have(|have| have.insert(Look::Start));
}
if lookset.contains_anchor_line() {
builder.set_look_have(|have| {
have.insert(Look::StartLF).insert(Look::StartCRLF)
});
}
if lookset.contains_word() {
builder.set_look_have(|have| {
have.insert(Look::WordStartHalfAscii)
.insert(Look::WordStartHalfUnicode)
});
}
}
Start::LineLF => {
if rev {
if lookset.contains_anchor_crlf() {
builder.set_is_half_crlf();
}
if lookset.contains_anchor_line() {
builder.set_look_have(|have| have.insert(Look::StartLF));
}
} else {
if lookset.contains_anchor_line() {
builder.set_look_have(|have| have.insert(Look::StartCRLF));
}
}
if lookset.contains_anchor_line() && lineterm == b'\n' {
builder.set_look_have(|have| have.insert(Look::StartLF));
}
if lookset.contains_word() {
builder.set_look_have(|have| {
have.insert(Look::WordStartHalfAscii)
.insert(Look::WordStartHalfUnicode)
});
}
}
Start::LineCR => {
if lookset.contains_anchor_crlf() {
if rev {
builder.set_look_have(|have| have.insert(Look::StartCRLF));
} else {
builder.set_is_half_crlf();
}
}
if lookset.contains_anchor_line() && lineterm == b'\r' {
builder.set_look_have(|have| have.insert(Look::StartLF));
}
if lookset.contains_word() {
builder.set_look_have(|have| {
have.insert(Look::WordStartHalfAscii)
.insert(Look::WordStartHalfUnicode)
});
}
}
Start::CustomLineTerminator => {
if lookset.contains_anchor_line() {
builder.set_look_have(|have| have.insert(Look::StartLF));
}
// This is a bit of a tricky case, but if the line terminator was
// set to a word byte, then we also need to behave as if the start
// configuration is Start::WordByte. That is, we need to mark our
// state as having come from a word byte.
if lookset.contains_word() {
if utf8::is_word_byte(lineterm) {
builder.set_is_from_word();
} else {
builder.set_look_have(|have| {
have.insert(Look::WordStartHalfAscii)
.insert(Look::WordStartHalfUnicode)
});
}
}
}
}
}

907
vendor/regex-automata/src/util/determinize/state.rs vendored Normal file
View File

@@ -0,0 +1,907 @@
/*!
This module defines a DFA state representation and builders for constructing
DFA states.
This representation is specifically for use in implementations of NFA-to-DFA
conversion via powerset construction. (Also called "determinization" in this
crate.)
The term "DFA state" is somewhat overloaded in this crate. In some cases, it
refers to the set of transitions over an alphabet for a particular state. In
other cases, it refers to a set of NFA states. The former is really about the
final representation of a state in a DFA's transition table, whereas the
latter---what this module is focused on---is closer to an intermediate form
that is used to help eventually build the transition table.
This module exports four types. All four types represent the same idea: an
ordered set of NFA states. This ordered set represents the epsilon closure of a
particular NFA state, where the "epsilon closure" is the set of NFA states that
can be transitioned to without consuming any input, i.e., follow all of the NFA
state's epsilon transitions. In addition, this implementation of DFA states
cares about two other things: the ordered set of pattern IDs corresponding
to the patterns that match if the state is a match state, and the set of
look-behind assertions that were true when the state was created.
The first, `State`, is a frozen representation of a state that cannot be
modified. It may be cheaply cloned without copying the state itself and can be
accessed safely from multiple threads simultaneously. This type is useful for
when one knows that the DFA state being constructed is distinct from any other
previously constructed states. Namely, powerset construction, in practice,
requires one to keep a cache of previously created DFA states. Otherwise,
the number of DFA states created in memory balloons to an impractically
large number. For this reason, equivalent states should endeavor to have an
equivalent byte-level representation. (In general, "equivalency" here means,
"equivalent assertions, pattern IDs and NFA state IDs." We do not require that
full DFA minimization be implemented here. This form of equivalency is only
surface deep and is more-or-less a practical necessity.)
The other three types represent different phases in the construction of a
DFA state. Internally, these three types (and `State`) all use the same
byte-oriented representation. That means one can use any of the builder types
to check whether the state it represents already exists or not. If it does,
then there is no need to freeze it into a `State` (which requires an alloc and
a copy). Here are the three types described succinctly:
* `StateBuilderEmpty` represents a state with no pattern IDs, no assertions
and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A
`StateBuilderEmpty` can only be used to query its underlying memory capacity,
or to convert into a builder for recording pattern IDs and/or assertions.
* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero
or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches`
can only be used for adding pattern IDs and recording assertions.
* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or
more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA`
can only be used for adding NFA state IDs and recording some assertions.
The expected flow here is to use the above builders to construct a candidate
DFA state to check if it already exists. If it does, then there's no need to
freeze it into a `State`. If it doesn't exist, then `StateBuilderNFA::to_state`
can be called to freeze the builder into an immutable `State`. In either
case, `clear` should be called on the builder to turn it back into a
`StateBuilderEmpty` that reuses the underlying memory.
The main purpose for splitting the builder into these distinct types is to
make it impossible to do things like adding a pattern ID after adding an NFA
state ID. Namely, this makes it simpler to use a space-and-time efficient
binary representation for the state. (The format is documented on the `Repr`
type below.) If we just used one type for everything, it would be possible for
callers to use an incorrect interleaving of calls and thus result in a corrupt
representation. I chose to use more type machinery to make this impossible to
do because 1) determinization is itself pretty complex and it wouldn't be too
hard to foul this up and 2) there isn't too much machinery involved and it's
well contained.
As an optimization, sometimes states won't have certain things set. For
example, if the underlying NFA has no word boundary assertions, then there is
no reason to set a state's look-behind assertion as to whether it was generated
from a word byte or not. Similarly, if a state has no NFA states corresponding
to look-around assertions, then there is no reason to set `look_have` to a
non-empty set. Finally, callers usually omit unconditional epsilon transitions
when adding NFA state IDs since they aren't discriminatory.
Finally, the binary representation used by these states is, thankfully, not
serialized anywhere. So any kind of change can be made with reckless abandon,
as long as everything in this module agrees.
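As a rough sketch of the intended call sequence (internal API, shown for
orientation only and not as a compilable doc test):

```ignore
let empty = StateBuilderEmpty::new();
// Record look-behind assertions and, for match states, pattern IDs.
let mut matches = empty.into_matches();
matches.add_match_pattern_id(PatternID::ZERO);
// Then record the (epsilon closed) ordered set of NFA state IDs.
let mut nfa = matches.into_nfa();
nfa.add_nfa_state_id(StateID::ZERO);
// Probe the cache using `nfa.as_bytes()`; only freeze if the state is new.
let state: State = nfa.to_state();
// Either way, reuse the allocation for the next candidate state.
let empty = nfa.clear();
```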
*/
use core::mem;
use alloc::{sync::Arc, vec::Vec};
use crate::util::{
int::{I32, U32},
look::LookSet,
primitives::{PatternID, StateID},
wire::{self, Endian},
};
/// A DFA state that, at its core, is represented by an ordered set of NFA
/// states.
///
/// This type is intended to be used only in NFA-to-DFA conversion via powerset
/// construction.
///
/// It may be cheaply cloned and accessed safely from multiple threads
/// simultaneously.
#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
pub(crate) struct State(Arc<[u8]>);
/// This Borrow impl permits us to look up any state in a map by its byte
/// representation. This is particularly convenient when one has a StateBuilder
/// and we want to see if a correspondingly equivalent state already exists. If
/// one does exist, then we can reuse the allocation required by StateBuilder
/// without having to convert it into a State first.
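///
/// For example (a sketch): with a `HashMap<State, StateID>` cache, a lookup
/// can be written as `cache.get(builder.as_bytes())`, so no `State` needs to
/// be allocated unless the lookup misses.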
impl core::borrow::Borrow<[u8]> for State {
fn borrow(&self) -> &[u8] {
&self.0
}
}
impl core::fmt::Debug for State {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
f.debug_tuple("State").field(&self.repr()).finish()
}
}
/// For docs on these routines, see the internal Repr and ReprVec types below.
impl State {
pub(crate) fn dead() -> State {
StateBuilderEmpty::new().into_matches().into_nfa().to_state()
}
pub(crate) fn is_match(&self) -> bool {
self.repr().is_match()
}
pub(crate) fn is_from_word(&self) -> bool {
self.repr().is_from_word()
}
pub(crate) fn is_half_crlf(&self) -> bool {
self.repr().is_half_crlf()
}
pub(crate) fn look_have(&self) -> LookSet {
self.repr().look_have()
}
pub(crate) fn look_need(&self) -> LookSet {
self.repr().look_need()
}
pub(crate) fn match_len(&self) -> usize {
self.repr().match_len()
}
pub(crate) fn match_pattern(&self, index: usize) -> PatternID {
self.repr().match_pattern(index)
}
pub(crate) fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
self.repr().match_pattern_ids()
}
#[cfg(all(test, not(miri)))]
pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) {
self.repr().iter_match_pattern_ids(f)
}
pub(crate) fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, f: F) {
self.repr().iter_nfa_state_ids(f)
}
pub(crate) fn memory_usage(&self) -> usize {
self.0.len()
}
fn repr(&self) -> Repr<'_> {
Repr(&self.0)
}
}
/// A state builder that represents an empty state.
///
/// This is a useful "initial condition" for state construction. It has no
/// NFA state IDs, no assertions set and no pattern IDs. No allocations are
/// made when new() is called. Its main use is for being converted into a
/// builder that can capture assertions and pattern IDs.
#[derive(Clone, Debug)]
pub(crate) struct StateBuilderEmpty(Vec<u8>);
/// For docs on these routines, see the internal Repr and ReprVec types below.
impl StateBuilderEmpty {
pub(crate) fn new() -> StateBuilderEmpty {
StateBuilderEmpty(alloc::vec![])
}
pub(crate) fn into_matches(mut self) -> StateBuilderMatches {
self.0.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0]);
StateBuilderMatches(self.0)
}
fn clear(&mut self) {
self.0.clear();
}
pub(crate) fn capacity(&self) -> usize {
self.0.capacity()
}
}
/// A state builder that collects assertions and pattern IDs.
///
/// When collecting pattern IDs is finished, this can be converted into a
/// builder that collects NFA state IDs.
#[derive(Clone)]
pub(crate) struct StateBuilderMatches(Vec<u8>);
impl core::fmt::Debug for StateBuilderMatches {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
f.debug_tuple("StateBuilderMatches").field(&self.repr()).finish()
}
}
/// For docs on these routines, see the internal Repr and ReprVec types below.
impl StateBuilderMatches {
pub(crate) fn into_nfa(mut self) -> StateBuilderNFA {
self.repr_vec().close_match_pattern_ids();
StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO }
}
pub(crate) fn set_is_from_word(&mut self) {
self.repr_vec().set_is_from_word()
}
pub(crate) fn set_is_half_crlf(&mut self) {
self.repr_vec().set_is_half_crlf()
}
pub(crate) fn look_have(&self) -> LookSet {
LookSet::read_repr(&self.0[1..])
}
pub(crate) fn set_look_have(
&mut self,
set: impl FnMut(LookSet) -> LookSet,
) {
self.repr_vec().set_look_have(set)
}
pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) {
self.repr_vec().add_match_pattern_id(pid)
}
fn repr(&self) -> Repr<'_> {
Repr(&self.0)
}
fn repr_vec(&mut self) -> ReprVec<'_> {
ReprVec(&mut self.0)
}
}
/// A state builder that collects some assertions and NFA state IDs.
///
/// When collecting NFA state IDs is finished, this can be used to build a
/// `State` if necessary.
///
/// When done building a state (regardless of whether it got kept or not),
/// it's usually a good idea to call `clear` to get an empty builder back so
/// that it can be reused to build the next state.
#[derive(Clone)]
pub(crate) struct StateBuilderNFA {
repr: Vec<u8>,
prev_nfa_state_id: StateID,
}
impl core::fmt::Debug for StateBuilderNFA {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
f.debug_tuple("StateBuilderNFA").field(&self.repr()).finish()
}
}
/// For docs on these routines, see the internal Repr and ReprVec types below.
impl StateBuilderNFA {
pub(crate) fn to_state(&self) -> State {
State(Arc::from(&*self.repr))
}
pub(crate) fn clear(self) -> StateBuilderEmpty {
let mut builder = StateBuilderEmpty(self.repr);
builder.clear();
builder
}
pub(crate) fn look_need(&self) -> LookSet {
self.repr().look_need()
}
pub(crate) fn set_look_have(
&mut self,
set: impl FnMut(LookSet) -> LookSet,
) {
self.repr_vec().set_look_have(set)
}
pub(crate) fn set_look_need(
&mut self,
set: impl FnMut(LookSet) -> LookSet,
) {
self.repr_vec().set_look_need(set)
}
pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) {
ReprVec(&mut self.repr)
.add_nfa_state_id(&mut self.prev_nfa_state_id, sid)
}
pub(crate) fn as_bytes(&self) -> &[u8] {
&self.repr
}
fn repr(&self) -> Repr<'_> {
Repr(&self.repr)
}
fn repr_vec(&mut self) -> ReprVec<'_> {
ReprVec(&mut self.repr)
}
}
/// Repr is a read-only view into the representation of a DFA state.
///
/// Primarily, a Repr is how we achieve DRY: we implement decoding the format
/// in one place, and then use a Repr to implement the various methods on the
/// public state types.
///
/// The format is as follows:
///
/// The first nine bytes correspond to bitsets.
///
/// Byte 0 is a bitset corresponding to miscellaneous flags associated with the
/// state. Bit 0 is set to 1 if the state is a match state. Bit 1 is set to 1
/// if the state has pattern IDs explicitly written to it. (This is a flag that
/// is not meant to be set by determinization, but rather, is used as part of
/// an internal space-saving optimization.) Bit 2 is set to 1 if the state was
/// generated by a transition over a "word" byte. (Callers may not always set
/// this. For example, if the NFA has no word boundary assertion, then needing
/// to track whether a state came from a word byte or not is superfluous and
/// wasteful.) Bit 3 is set to 1 if the state was generated by a transition
/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is
/// enabled.
///
/// Bytes 1..5 correspond to the look-behind assertions that were satisfied
/// by the transition that created this state. (Look-ahead assertions are not
/// tracked as part of states. Instead, these are applied by re-computing the
/// epsilon closure of a state when computing the transition function. See
/// `next` in the parent module.)
///
/// Bytes 5..9 correspond to the set of look-around assertions (including both
/// look-behind and look-ahead) that appear somewhere in this state's set of
/// NFA state IDs. This is used to determine whether this state's epsilon
/// closure should be re-computed when computing the transition function.
/// Namely, look-around assertions are "just" conditional epsilon transitions,
/// so if there are new assertions available when computing the transition
/// function, we should only re-compute the epsilon closure if those new
/// assertions are relevant to this particular state.
///
/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer
/// corresponding to the number of patterns encoded in this state. If the state
/// is not a match state (byte 0 bit 0 is 0) or if its only pattern ID is
/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte
/// offset 9 is the position at which the first NFA state ID is encoded.
///
/// For a match state with at least one non-ZERO pattern ID, the next bytes
/// correspond to a sequence of 32-bit native endian encoded integers that
/// represent each pattern ID, in order, that this match state represents.
///
/// After the pattern IDs (if any), NFA state IDs are delta encoded as
/// varints.[1] The first NFA state ID is encoded as itself, and each
/// subsequent NFA state ID is encoded as the difference between itself and the
/// previous NFA state ID.
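///
/// As a worked example (illustrative; both look sets assumed empty), a match
/// state with the single pattern ID `5` and the ordered NFA state IDs
/// `2, 7, 9` occupies 20 bytes:
///
/// ```text
/// [0b0000_0011]             flags: is_match | has_pattern_ids
/// [0, 0, 0, 0]              look_have (empty)
/// [0, 0, 0, 0]              look_need (empty)
/// [1 as native endian u32]  number of pattern IDs
/// [5 as native endian u32]  the pattern ID
/// [0x04, 0x0A, 0x04]        deltas 2, 5, 2 encoded as zig-zag varints
/// ```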
///
/// [1] - https://developers.google.com/protocol-buffers/docs/encoding#varints
struct Repr<'a>(&'a [u8]);
impl<'a> Repr<'a> {
/// Returns true if and only if this is a match state.
///
/// If callers have added pattern IDs to this state, then callers MUST set
/// this state as a match state explicitly. However, as a special case, if a
/// state is marked as a match state but has no pattern IDs, then it is
/// treated as if it had a single pattern ID equivalent to PatternID::ZERO.
fn is_match(&self) -> bool {
self.0[0] & (1 << 0) > 0
}
/// Returns true if and only if this state has had at least one pattern
/// ID added to it.
///
/// This is an internal-only flag that permits the representation to save
/// space in the common case of an NFA with one pattern in it. In that
/// case, a match state can only ever have exactly one pattern ID:
/// PatternID::ZERO. So there's no need to represent it.
fn has_pattern_ids(&self) -> bool {
self.0[0] & (1 << 1) > 0
}
/// Returns true if and only if this state is marked as having been created
/// from a transition over a word byte. This is useful for checking whether
/// a word boundary assertion is true or not, which requires look-behind
/// (whether the current state came from a word byte or not) and look-ahead
/// (whether the transition byte is a word byte or not).
///
/// Since states with this set are distinct from states that don't have
/// this set (even if they are otherwise equivalent), callers should not
/// set this assertion unless the underlying NFA has at least one word
/// boundary assertion somewhere. Otherwise, a superfluous number of states
/// may be created.
fn is_from_word(&self) -> bool {
self.0[0] & (1 << 2) > 0
}
/// Returns true if and only if this state is marked as being inside of a
/// CRLF terminator. In the forward direction, this means the state was
/// created after seeing a `\r`. In the reverse direction, this means the
/// state was created after seeing a `\n`.
fn is_half_crlf(&self) -> bool {
self.0[0] & (1 << 3) > 0
}
/// The set of look-behind assertions that were true in the transition that
/// created this state.
///
/// Generally, this should be empty if 'look_need' is empty, since there is
/// no reason to track which look-behind assertions are true if the state
/// has no conditional epsilon transitions.
///
/// Satisfied look-ahead assertions are not tracked in states. Instead,
/// these are re-computed on demand via epsilon closure when computing the
/// transition function.
fn look_have(&self) -> LookSet {
LookSet::read_repr(&self.0[1..])
}
/// The set of look-around (both behind and ahead) assertions that appear
/// at least once in this state's set of NFA states.
///
/// This is used to determine whether the epsilon closure needs to be
/// re-computed when computing the transition function. Namely, if the
/// state has no conditional epsilon transitions, then there is no need
/// to re-compute the epsilon closure.
fn look_need(&self) -> LookSet {
LookSet::read_repr(&self.0[5..])
}
/// Returns the total number of match pattern IDs in this state.
///
/// If this state is not a match state, then this always returns 0.
fn match_len(&self) -> usize {
if !self.is_match() {
0
} else if !self.has_pattern_ids() {
1
} else {
self.encoded_pattern_len()
}
}
/// Returns the pattern ID for this match state at the given index.
///
/// If the given index is greater than or equal to `match_len()` for this
/// state, then this could panic or return incorrect results.
fn match_pattern(&self, index: usize) -> PatternID {
if !self.has_pattern_ids() {
PatternID::ZERO
} else {
let offset = 13 + index * PatternID::SIZE;
// This is OK since we only ever serialize valid PatternIDs to
// states.
wire::read_pattern_id_unchecked(&self.0[offset..]).0
}
}
/// Returns a copy of all match pattern IDs in this state. If this state
/// is not a match state, then this returns None.
fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
if !self.is_match() {
return None;
}
let mut pids = alloc::vec![];
self.iter_match_pattern_ids(|pid| pids.push(pid));
Some(pids)
}
/// Calls the given function on every pattern ID in this state.
fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, mut f: F) {
if !self.is_match() {
return;
}
// As an optimization for a very common case, when this is a match
// state for an NFA with only one pattern, we don't actually write the
// pattern ID to the state representation. Instead, we know it must
// be there since it is the only possible choice.
if !self.has_pattern_ids() {
f(PatternID::ZERO);
return;
}
let mut pids = &self.0[13..self.pattern_offset_end()];
while !pids.is_empty() {
let pid = wire::read_u32(pids);
pids = &pids[PatternID::SIZE..];
// This is OK since we only ever serialize valid PatternIDs to
// states. And since pattern IDs can never exceed a usize, the
// unwrap is OK.
f(PatternID::new_unchecked(usize::try_from(pid).unwrap()));
}
}
/// Calls the given function on every NFA state ID in this state.
fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, mut f: F) {
let mut sids = &self.0[self.pattern_offset_end()..];
let mut prev = 0i32;
while !sids.is_empty() {
let (delta, nr) = read_vari32(sids);
sids = &sids[nr..];
let sid = prev + delta;
prev = sid;
// This is OK since we only ever serialize valid StateIDs to
// states. And since state IDs can never exceed an isize, they must
// always be able to fit into a usize, and thus cast is OK.
f(StateID::new_unchecked(sid.as_usize()))
}
}
/// Returns the offset into this state's representation where the pattern
/// IDs end and the NFA state IDs begin.
fn pattern_offset_end(&self) -> usize {
let encoded = self.encoded_pattern_len();
if encoded == 0 {
return 9;
}
// This arithmetic is OK since we were able to address this many bytes
// when writing to the state, thus, it must fit into a usize.
encoded.checked_mul(4).unwrap().checked_add(13).unwrap()
}
/// Returns the total number of *encoded* pattern IDs in this state.
///
/// This may return 0 even when this is a match state, since the pattern
/// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in
/// the match state (the overwhelming common case).
fn encoded_pattern_len(&self) -> usize {
if !self.has_pattern_ids() {
return 0;
}
// This unwrap is OK since the total number of patterns is always
// guaranteed to fit into a usize.
usize::try_from(wire::read_u32(&self.0[9..13])).unwrap()
}
}
impl<'a> core::fmt::Debug for Repr<'a> {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let mut nfa_ids = alloc::vec![];
self.iter_nfa_state_ids(|sid| nfa_ids.push(sid));
f.debug_struct("Repr")
.field("is_match", &self.is_match())
.field("is_from_word", &self.is_from_word())
.field("is_half_crlf", &self.is_half_crlf())
.field("look_have", &self.look_have())
.field("look_need", &self.look_need())
.field("match_pattern_ids", &self.match_pattern_ids())
.field("nfa_state_ids", &nfa_ids)
.finish()
}
}
/// ReprVec is a write-only view into the representation of a DFA state.
///
/// See Repr for more details on the purpose of this type and also the format.
///
/// Note that not all possible combinations of methods may be called. This is
/// precisely what the various StateBuilder types encapsulate: they only
/// permit valid combinations via Rust's linear typing.
struct ReprVec<'a>(&'a mut Vec<u8>);
impl<'a> ReprVec<'a> {
/// Set this state as a match state.
///
/// This should not be exposed explicitly outside of this module. It is
/// set automatically when a pattern ID is added.
fn set_is_match(&mut self) {
self.0[0] |= 1 << 0;
}
/// Set that this state has pattern IDs explicitly written to it.
///
/// This should not be exposed explicitly outside of this module. This is
/// used internally as a space saving optimization. Namely, if the state
/// is a match state but does not have any pattern IDs written to it,
/// then it is automatically inferred to have a pattern ID of ZERO.
fn set_has_pattern_ids(&mut self) {
self.0[0] |= 1 << 1;
}
/// Set this state as being built from a transition over a word byte.
///
/// Setting this is only necessary when one needs to deal with word
/// boundary assertions. Therefore, if the underlying NFA has no word
/// boundary assertions, callers should not set this.
fn set_is_from_word(&mut self) {
self.0[0] |= 1 << 2;
}
/// Set this state as having seen half of a CRLF terminator.
///
/// In the forward direction, this should be set when a `\r` has been seen.
/// In the reverse direction, this should be set when a `\n` has been seen.
fn set_is_half_crlf(&mut self) {
self.0[0] |= 1 << 3;
}
/// The set of look-behind assertions that were true in the transition that
/// created this state.
fn look_have(&self) -> LookSet {
self.repr().look_have()
}
/// The set of look-around (both behind and ahead) assertions that appear
/// at least once in this state's set of NFA states.
fn look_need(&self) -> LookSet {
self.repr().look_need()
}
/// Mutate the set of look-behind assertions that were true in the
/// transition that created this state.
fn set_look_have(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
set(self.look_have()).write_repr(&mut self.0[1..]);
}
/// Mutate the set of look-around (both behind and ahead) assertions that
/// appear at least once in this state's set of NFA states.
fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
set(self.look_need()).write_repr(&mut self.0[5..]);
}
/// Add a pattern ID to this state. All match states must have at least
/// one pattern ID associated with them.
///
/// Callers must never add duplicative pattern IDs.
///
/// The order in which patterns are added must correspond to the order
/// in which patterns are reported as matches.
fn add_match_pattern_id(&mut self, pid: PatternID) {
// As a (somewhat small) space saving optimization, in the case where
// a matching state has exactly one pattern ID, PatternID::ZERO, we do
// not write either the pattern ID or the number of patterns encoded.
// Instead, all we do is set the 'is_match' bit on this state. Overall,
// this saves 8 bytes per match state for the overwhelming majority of
// match states.
//
// In order to know whether pattern IDs need to be explicitly read or
// not, we use another internal-only bit, 'has_pattern_ids', to
// indicate whether they have been explicitly written or not.
if !self.repr().has_pattern_ids() {
if pid == PatternID::ZERO {
self.set_is_match();
return;
}
// Make room for 'close_match_pattern_ids' to write the total
// number of pattern IDs written.
self.0.extend(core::iter::repeat(0).take(PatternID::SIZE));
self.set_has_pattern_ids();
// If this was already a match state, then the only way that's
// possible when the state doesn't have pattern IDs is if
// PatternID::ZERO was added by the caller previously. In this
// case, we are now adding a non-ZERO pattern ID after it, in
// which case, we want to make sure to represent ZERO explicitly
// now.
if self.repr().is_match() {
write_u32(self.0, 0)
} else {
// Otherwise, just make sure the 'is_match' bit is set.
self.set_is_match();
}
}
write_u32(self.0, pid.as_u32());
}
/// Indicate that no more pattern IDs will be added to this state.
///
/// Once this is called, callers must not call it or 'add_match_pattern_id'
/// again.
///
/// This should not be exposed explicitly outside of this module. It
/// should be called only when converting a StateBuilderMatches into a
/// StateBuilderNFA.
fn close_match_pattern_ids(&mut self) {
// If we never wrote any pattern IDs, then there's nothing to do here.
if !self.repr().has_pattern_ids() {
return;
}
let patsize = PatternID::SIZE;
let pattern_bytes = self.0.len() - 13;
// Every pattern ID uses 4 bytes, so number of bytes should be
// divisible by 4.
assert_eq!(pattern_bytes % patsize, 0);
// This unwrap is OK since we are guaranteed that the maximum number
// of possible patterns fits into a u32.
let count32 = u32::try_from(pattern_bytes / patsize).unwrap();
wire::NE::write_u32(count32, &mut self.0[9..13]);
}
/// Add an NFA state ID to this state. The order in which NFA states are
/// added matters. It is the caller's responsibility to ensure that
/// duplicate NFA state IDs are not added.
fn add_nfa_state_id(&mut self, prev: &mut StateID, sid: StateID) {
let delta = sid.as_i32() - prev.as_i32();
write_vari32(self.0, delta);
*prev = sid;
}
/// Return a read-only view of this state's representation.
fn repr(&self) -> Repr<'_> {
Repr(self.0.as_slice())
}
}
/// Write a signed 32-bit integer using zig-zag encoding.
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
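///
/// For example, `3` zig-zags to `6` and `-3` zig-zags to `5`, so each encodes
/// to a single varint byte (`0x06` and `0x05`, respectively).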
fn write_vari32(data: &mut Vec<u8>, n: i32) {
let mut un = n.to_bits() << 1;
if n < 0 {
un = !un;
}
write_varu32(data, un)
}
/// Read a signed 32-bit integer using zig-zag encoding. Also, return the
/// number of bytes read.
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn read_vari32(data: &[u8]) -> (i32, usize) {
let (un, i) = read_varu32(data);
let mut n = i32::from_bits(un >> 1);
if un & 1 != 0 {
n = !n;
}
(n, i)
}
/// Write an unsigned 32-bit integer as a varint. In essence, `n` is written
/// as a sequence of bytes where all bytes except for the last one have the
/// most significant bit set. The least significant 7 bits correspond to the
/// actual bits of `n`. So in the worst case, a varint uses 5 bytes, but in
/// very common cases, it uses fewer than 4.
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
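///
/// For example, `300` (`0b1_0010_1100`) is written as the two bytes
/// `[0xAC, 0x02]`: the low 7 bits with the continuation bit set, followed by
/// the remaining bits.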
fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
while n >= 0b1000_0000 {
data.push(n.low_u8() | 0b1000_0000);
n >>= 7;
}
data.push(n.low_u8());
}
/// Read an unsigned 32-bit varint. Also, return the number of bytes read.
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn read_varu32(data: &[u8]) -> (u32, usize) {
// N.B. We can assume correctness here since we know that all var-u32 are
// written with write_varu32. Hence, the 'as' uses and unchecked arithmetic
// is all okay.
let mut n: u32 = 0;
let mut shift: u32 = 0;
for (i, &b) in data.iter().enumerate() {
if b < 0b1000_0000 {
return (n | (u32::from(b) << shift), i + 1);
}
n |= (u32::from(b) & 0b0111_1111) << shift;
shift += 7;
}
(0, 0)
}
/// Push a native-endian encoded `n` on to `dst`.
fn write_u32(dst: &mut Vec<u8>, n: u32) {
use crate::util::wire::NE;
let start = dst.len();
dst.extend(core::iter::repeat(0).take(mem::size_of::<u32>()));
NE::write_u32(n, &mut dst[start..]);
}
#[cfg(test)]
mod tests {
use alloc::vec;
use quickcheck::quickcheck;
use super::*;
#[cfg(not(miri))]
quickcheck! {
fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool {
        // Builder states do not permit duplicate IDs.
let sids = dedup_state_ids(sids);
let mut b = StateBuilderEmpty::new().into_matches().into_nfa();
for &sid in &sids {
b.add_nfa_state_id(sid);
}
let s = b.to_state();
let mut got = vec![];
s.iter_nfa_state_ids(|sid| got.push(sid));
got == sids
}
fn prop_state_read_write_pattern_ids(pids: Vec<PatternID>) -> bool {
        // Builder states do not permit duplicate IDs.
let pids = dedup_pattern_ids(pids);
let mut b = StateBuilderEmpty::new().into_matches();
for &pid in &pids {
b.add_match_pattern_id(pid);
}
let s = b.into_nfa().to_state();
let mut got = vec![];
s.iter_match_pattern_ids(|pid| got.push(pid));
got == pids
}
fn prop_state_read_write_nfa_state_and_pattern_ids(
sids: Vec<StateID>,
pids: Vec<PatternID>
) -> bool {
        // Builder states do not permit duplicate IDs.
let sids = dedup_state_ids(sids);
let pids = dedup_pattern_ids(pids);
let mut b = StateBuilderEmpty::new().into_matches();
for &pid in &pids {
b.add_match_pattern_id(pid);
}
let mut b = b.into_nfa();
for &sid in &sids {
b.add_nfa_state_id(sid);
}
let s = b.to_state();
let mut got_pids = vec![];
s.iter_match_pattern_ids(|pid| got_pids.push(pid));
let mut got_sids = vec![];
s.iter_nfa_state_ids(|sid| got_sids.push(sid));
got_pids == pids && got_sids == sids
}
}
quickcheck! {
fn prop_read_write_varu32(n: u32) -> bool {
let mut buf = vec![];
write_varu32(&mut buf, n);
let (got, nread) = read_varu32(&buf);
nread == buf.len() && got == n
}
fn prop_read_write_vari32(n: i32) -> bool {
let mut buf = vec![];
write_vari32(&mut buf, n);
let (got, nread) = read_vari32(&buf);
nread == buf.len() && got == n
}
}
#[cfg(not(miri))]
fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> {
let mut set = alloc::collections::BTreeSet::new();
let mut deduped = vec![];
for sid in sids {
if set.contains(&sid) {
continue;
}
set.insert(sid);
deduped.push(sid);
}
deduped
}
#[cfg(not(miri))]
fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> {
let mut set = alloc::collections::BTreeSet::new();
let mut deduped = vec![];
for pid in pids {
if set.contains(&pid) {
continue;
}
set.insert(pid);
deduped.push(pid);
}
deduped
}
}

265
vendor/regex-automata/src/util/empty.rs vendored Normal file
View File

@@ -0,0 +1,265 @@
/*!
This module provides helper routines for dealing with zero-width matches.
The main problem being solved here is this:
1. The caller wants to search something that they know is valid UTF-8, such
as a Rust `&str`.
2. The regex used by the caller can match the empty string. For example, `a*`.
3. The caller should never get match offsets returned that occur within the
encoding of a UTF-8 codepoint. It is logically incorrect, and also means that,
e.g., slicing the `&str` at those offsets will lead to a panic.
So the question here is, how do we prevent the caller from getting match
offsets that split a codepoint? For example, strictly speaking, the regex `a*`
matches `☃` at the positions `[0, 0]`, `[1, 1]`, `[2, 2]` and `[3, 3]` since
the UTF-8 encoding of `☃` is `\xE2\x98\x83`. In particular, the `NFA` that
underlies all of the matching engines in this crate doesn't have anything in
its state graph that prevents matching between UTF-8 code units. Indeed, any
engine derived from the `NFA` will match at those positions by virtue of the
fact that the `NFA` is byte oriented. That is, its transitions are defined over
bytes and the matching engines work by proceeding one byte at a time.
(An alternative architecture would be to define the transitions in an `NFA`
over codepoints, or `char`. And then make the matching engines proceed by
decoding one codepoint at a time. This is a viable strategy, but it doesn't
work for DFA matching engines because designing a fast and memory efficient
transition table for an alphabet as large as Unicode is quite difficult. More
to the point, the top-level `regex` crate supports matching on arbitrary bytes
when Unicode mode is disabled and one is searching a `&[u8]`. So in that case,
you can't just limit yourself to decoding codepoints and matching those. You
really do need to be able to follow byte oriented transitions on the `NFA`.)
In an older version of the regex crate, we handled this case not in the regex
engine, but in the iterators over matches. Namely, since this case only arises
when the match is empty, we "just" incremented the next starting position
of the search by `N`, where `N` is the length of the codepoint encoded at
the current position. The alternative or more "natural" solution of just
incrementing by `1` would result in executing a search of `a*` on `☃` like
this:
* Start search at `0`.
* Found match at `[0, 0]`.
* Next start position is `0`.
* To avoid an infinite loop, since it's an empty match, increment by `1`.
* Start search at `1`.
* Found match at `[1, 1]`. Oops.
But if we instead incremented by `3` (the length in bytes of `☃`), then we get
the following:
* Start search at `0`.
* Found match at `[0, 0]`.
* Next start position is `0`.
* To avoid an infinite loop, since it's an empty match, increment by `3`.
* Start search at `3`.
* Found match at `[3, 3]`.
And we get the correct result. But does this technique work in all cases?
Crucially, it requires that a zero-width match that splits a codepoint never
occurs beyond the starting position of the search. Because if it did, merely
incrementing the start position by the number of bytes in the codepoint at
the current position wouldn't be enough. A zero-width match could just occur
anywhere. It turns out that it is _almost_ true. We can convince ourselves by
looking at all possible patterns that can match the empty string:
* Patterns like `a*`, `a{0}`, `(?:)`, `a|` and `|a` all unconditionally match
the empty string. That is, assuming there isn't an `a` at the current position,
they will all match the empty string at the start of a search. There is no way
to move past it because any other match would not be "leftmost."
* `^` only matches at the beginning of the haystack, where the start position
is `0`. Since we know we're searching valid UTF-8 (if it isn't valid UTF-8,
then this entire problem goes away because it implies your string type supports
invalid UTF-8 and thus must deal with offsets that not only split a codepoint
but occur in entirely invalid UTF-8 somehow), it follows that `^` never matches
between the code units of a codepoint because the start of a valid UTF-8 string
is never within the encoding of a codepoint.
* `$` uses basically the same logic as `^`, but for the end of a string. A valid
UTF-8 string can't have an incomplete codepoint at the end of it.
* `(?m:^)` follows similarly to `^`, but it can match immediately following
a `\n`. However, since a `\n` is always a codepoint itself and can never
appear within a codepoint, it follows that the position immediately following
a `\n` in a string that is valid UTF-8 is guaranteed to not be between the
code units of another codepoint. (One caveat here is that the line terminator
for multi-line anchors can now be changed to any arbitrary byte, including
things like `\x98` which might occur within a codepoint. However, this wasn't
supported by the old regex crate. If it were, it would pose the same problems as
`(?-u:\B)`, as we'll discuss below.)
* `(?m:$)` follows a similar argument as for `(?m:^)`. The only difference is that a
`(?m:$)` matches just before a `\n`. But the same argument applies.
* `(?Rm:^)` and `(?Rm:$)` weren't supported by the old regex crate, but the
CRLF aware line anchors follow a similar argument as for `(?m:^)` and `(?m:$)`.
Namely, since they only ever match at a boundary where one side is either a
`\r` or a `\n`, neither of which can occur within a codepoint.
* `\b` only matches at positions where both sides are valid codepoints, so
this cannot split a codepoint.
* `\B`, like `\b`, also only matches at positions where both sides are valid
codepoints. So this cannot split a codepoint either.
* `(?-u:\b)` matches only at positions where at least one side of it is an ASCII
word byte. Since ASCII bytes cannot appear as code units in non-ASCII codepoints
(one of the many amazing qualities of UTF-8), it follows that this too cannot
split a codepoint.
* `(?-u:\B)` finally represents a problem. It can match between *any* two
bytes that are either both word bytes or non-word bytes. Since code units like
`\xE2` and `\x98` (from the UTF-8 encoding of `☃`) are both non-word bytes,
`(?-u:\B)` will match at the position between them.
Thus, our approach of incrementing one codepoint at a time after seeing an
empty match is flawed because `(?-u:\B)` can result in an empty match that
splits a codepoint at a position past the starting point of a search. For
example, searching `(?-u:\B)` on `a☃` would produce the following matches: `[2,
2]`, `[3, 3]` and `[4, 4]`. The positions at `0` and `1` don't match because
they correspond to word boundaries since `a` is an ASCII word byte.
So what did the old regex crate do to avoid this? It banned `(?-u:\B)` from
regexes that could match `&str`. That might sound extreme, but a lot of other
things were banned too. For example, all of `(?-u:.)`, `(?-u:[^a])` and
`(?-u:\W)` can match invalid UTF-8 too, including individual code units within a
codepoint. The key difference is that those expressions could never produce an
empty match. That ban happens when translating an `Ast` to an `Hir`, because
that process can reason about whether an `Hir` can produce *non-empty* matches
at invalid UTF-8 boundaries. Bottom line though is that we side-stepped the
`(?-u:\B)` issue by banning it.
If banning `(?-u:\B)` were the only issue with the old regex crate's approach,
then I probably would have kept it. `\B` is rarely used, so it's not such a big
deal to have to work-around it. However, the problem with the above approach
is that it doesn't compose. The logic for avoiding splitting a codepoint only
lived in the iterator, which means if anyone wants to implement their own
iterator over regex matches, they have to deal with this extremely subtle edge
case to get full correctness.
Instead, in this crate, we take the approach of pushing this complexity down
to the lowest layers of each regex engine. The approach is pretty simple:
* If this corner case doesn't apply, don't do anything. (For example, if UTF-8
mode isn't enabled or if the regex cannot match the empty string.)
* If an empty match is reported, explicitly check if it splits a codepoint.
* If it doesn't, we're done, return the match.
* If it does, then ignore the match and re-run the search.
* Repeat the above process until the end of the haystack is reached or a match
is found that doesn't split a codepoint or isn't zero width.
And that's pretty much what this module provides. Every regex engine uses these
methods in their lowest level public APIs, but just above the layer where
their internal engine is used. That way, all regex engines can be arbitrarily
composed without worrying about handling this case, and iterators don't need to
handle it explicitly.
(It turns out that a new feature I added, support for changing the line
terminator in a regex to any arbitrary byte, also provokes the above problem.
Namely, the byte could be invalid UTF-8 or a UTF-8 continuation byte. So that
support would need to be limited or banned when UTF-8 mode is enabled, just
like we did for `(?-u:\B)`. But thankfully our more robust approach in this
crate handles that case just fine too.)
*/
use crate::util::search::{Input, MatchError};
#[cold]
#[inline(never)]
pub(crate) fn skip_splits_fwd<T, F>(
input: &Input<'_>,
init_value: T,
match_offset: usize,
find: F,
) -> Result<Option<T>, MatchError>
where
F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>,
{
skip_splits(true, input, init_value, match_offset, find)
}
#[cold]
#[inline(never)]
pub(crate) fn skip_splits_rev<T, F>(
input: &Input<'_>,
init_value: T,
match_offset: usize,
find: F,
) -> Result<Option<T>, MatchError>
where
F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>,
{
skip_splits(false, input, init_value, match_offset, find)
}
fn skip_splits<T, F>(
forward: bool,
input: &Input<'_>,
init_value: T,
mut match_offset: usize,
mut find: F,
) -> Result<Option<T>, MatchError>
where
F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>,
{
// If our config says to do an anchored search, then we're definitely
// done. We just need to determine whether we have a valid match or
// not. If we don't, then we're not allowed to continue, so we report
// no match.
//
// This is actually quite a subtle correctness thing. The key here is
// that if we got an empty match that splits a codepoint after doing an
// anchored search in UTF-8 mode, then that implies that we must have
// *started* the search at a location that splits a codepoint. This
// follows from the fact that if a match is reported from an anchored
// search, then the start offset of the match *must* match the start
// offset of the search.
//
// It also follows that no other non-empty match is possible. For
// example, you might write a regex like '(?:)|SOMETHING' and start its
// search in the middle of a codepoint. The first branch is an empty
// regex that will bubble up a match at the first position, and then
// get rejected here and report no match. But what if 'SOMETHING' could
// have matched? We reason that such a thing is impossible, because
// if it does, it must report a match that starts in the middle of a
// codepoint. This in turn implies that a match is reported whose span
// does not correspond to valid UTF-8, and this breaks the promise
// made when UTF-8 mode is enabled. (That promise *can* be broken, for
// example, by enabling UTF-8 mode but building an NFA by hand that
// produces non-empty matches spanning invalid UTF-8. This is an
// unchecked but documented precondition violation of UTF-8 mode, and
// results in unspecified behavior.)
//
// I believe this actually means that if an anchored search is run, and
// UTF-8 mode is enabled and the start position splits a codepoint,
// then it is correct to immediately report no match without even
// executing the regex engine. But it doesn't really seem worth writing
// out that case in every regex engine to save a tiny bit of work in an
// extremely pathological case, so we just handle it here.
if input.get_anchored().is_anchored() {
return Ok(if input.is_char_boundary(match_offset) {
Some(init_value)
} else {
None
});
}
// Otherwise, we have an unanchored search, so just keep looking for
// matches until we have one that does not split a codepoint or we hit
// EOI.
let mut value = init_value;
let mut input = input.clone();
while !input.is_char_boundary(match_offset) {
if forward {
// The unwrap is OK here because overflowing usize while
// iterating over a slice is impossible, as it would require
// a slice of length greater than isize::MAX, which is itself
// impossible.
input.set_start(input.start().checked_add(1).unwrap());
} else {
input.set_end(match input.end().checked_sub(1) {
None => return Ok(None),
Some(end) => end,
});
}
match find(&input)? {
None => return Ok(None),
Some((new_value, new_match_end)) => {
value = new_value;
match_offset = new_match_end;
}
}
}
Ok(Some(value))
}
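// Illustrative sketch (not part of the vendored source): how a hypothetical
// UTF-8 aware entry point might wrap an engine's raw forward search with
// `skip_splits_fwd`. Here `raw_find` stands in for an internal routine that
// reports the (start, end) offsets of the leftmost match and does no UTF-8
// handling of its own.
#[allow(dead_code)]
fn find_utf8_aware_sketch(
    input: &Input<'_>,
    mut raw_find: impl FnMut(
        &Input<'_>,
    ) -> Result<Option<(usize, usize)>, MatchError>,
) -> Result<Option<(usize, usize)>, MatchError> {
    match raw_find(input)? {
        None => Ok(None),
        // Only an empty match can split a codepoint in UTF-8 mode, so a
        // non-empty match is returned as-is without the retry loop.
        Some((start, end)) if start < end => Ok(Some((start, end))),
        Some(m) => skip_splits_fwd(input, m, m.1, |input| {
            Ok(raw_find(input)?.map(|m| (m, m.1)))
        }),
    }
}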

View File

@@ -0,0 +1,84 @@
/*!
Provides convenience routines for escaping raw bytes.
Since this crate tends to deal with `&[u8]` everywhere and the default
`Debug` implementation just shows decimal integers, it makes debugging those
representations quite difficult. This module provides types that show `&[u8]`
as if it were a string, with invalid UTF-8 escaped into its byte-by-byte hex
representation.
*/
use crate::util::utf8;
/// Provides a convenient `Debug` implementation for a `u8`.
///
/// The `Debug` impl treats the byte as ASCII, and emits a human readable
/// representation of it. If the byte isn't ASCII, then it's emitted as a hex
/// escape sequence.
#[derive(Clone, Copy)]
pub struct DebugByte(pub u8);
impl core::fmt::Debug for DebugByte {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
// Special case ASCII space. It's too hard to read otherwise, so
// put quotes around it. I sometimes wonder whether just '\x20' would
// be better...
if self.0 == b' ' {
return write!(f, "' '");
}
// 10 bytes is enough to cover any output from ascii::escape_default.
let mut bytes = [0u8; 10];
let mut len = 0;
for (i, mut b) in core::ascii::escape_default(self.0).enumerate() {
// capitalize \xab to \xAB
if i >= 2 && b'a' <= b && b <= b'f' {
b -= 32;
}
bytes[len] = b;
len += 1;
}
write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap())
}
}
/// Provides a convenient `Debug` implementation for `&[u8]`.
///
/// This generally works best when the bytes are presumed to be mostly UTF-8,
/// but will work for anything. For any bytes that aren't UTF-8, they are
/// emitted as hex escape sequences.
pub struct DebugHaystack<'a>(pub &'a [u8]);
impl<'a> core::fmt::Debug for DebugHaystack<'a> {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(f, "\"")?;
// This is a sad re-implementation of a similar impl found in bstr.
let mut bytes = self.0;
while let Some(result) = utf8::decode(bytes) {
let ch = match result {
Ok(ch) => ch,
Err(byte) => {
write!(f, r"\x{byte:02x}")?;
bytes = &bytes[1..];
continue;
}
};
bytes = &bytes[ch.len_utf8()..];
match ch {
'\0' => write!(f, "\\0")?,
// ASCII control characters except \0, \n, \r, \t
'\x01'..='\x08'
| '\x0b'
| '\x0c'
| '\x0e'..='\x19'
| '\x7f' => {
write!(f, "\\x{:02x}", u32::from(ch))?;
}
'\n' | '\r' | '\t' | _ => {
write!(f, "{}", ch.escape_debug())?;
}
}
}
write!(f, "\"")?;
Ok(())
}
}
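// Illustrative sketch (not part of the vendored source): how the wrappers
// above render bytes. This assumes a test build with the `alloc` feature
// enabled so that `format!` is available.
#[cfg(all(test, feature = "alloc"))]
mod usage_sketch {
    use alloc::format;

    use super::{DebugByte, DebugHaystack};

    #[test]
    fn renders_bytes_as_hex_escapes() {
        // A non-ASCII byte comes out as a capitalized hex escape.
        assert_eq!(r"\xFF", format!("{:?}", DebugByte(0xFF)));
        // Valid UTF-8 is shown as-is; the stray 0xFF byte is hex escaped.
        assert_eq!("\"abc\\xff\"", format!("{:?}", DebugHaystack(b"abc\xFF")));
    }
}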

246
vendor/regex-automata/src/util/int.rs vendored Normal file
View File

@@ -0,0 +1,246 @@
/*!
This module provides several integer oriented traits for converting between
both fixed size integers and integers whose size varies based on the target
(like `usize`).
The driving design principle of this module is to attempt to centralize as many
`as` casts as possible here. And in particular, we separate casts into two
buckets:
* Casts that we use for their truncating behavior. In this case, we use more
descriptive names, like `low_u32` and `high_u32`.
* Casts that we use for converting back-and-forth between `usize`. These
conversions are generally necessary because we often store indices in different
formats to save on memory, which requires converting to and from `usize`. In
this case, we very specifically do not want to overflow, and so the methods
defined here will panic if the `as` cast would be lossy in debug mode. (A
normal `as` cast will never panic!)
For `as` casts between raw pointers, we use `cast`, so `as` isn't needed there.
For regex engines, floating point is just never used, so we don't have to worry
about `as` casts for those.
Otherwise, this module pretty much covers all of our `as` needs except for one
thing: const contexts. There are a select few places in this crate where we
still need to use `as` because const functions on traits aren't stable yet.
If we wind up significantly expanding our const footprint in this crate, it
might be worth defining free functions to handle those cases. But at the time
of writing, that just seemed like too much ceremony. Instead, I comment each
such use of `as` in a const context with a "fixme" notice.
NOTE: for simplicity, we don't take target pointer width into account here for
`usize` conversions. Since we currently only panic in debug mode, skipping the
check when it can be proven it isn't needed at compile time doesn't really
matter. Now, if we wind up wanting to do as many checks as possible in release
mode, then we would want to skip those when we know the conversions are always
non-lossy.
NOTE: this module isn't an exhaustive API. For example, we still use things
like `u64::from` where possible, or even `usize::try_from()` for when we do
explicitly want to panic or when we want to return an error for overflow.
*/
// We define a little more than what we need, but I'd rather just have
// everything via a consistent and uniform API than have holes.
#![allow(dead_code)]
pub(crate) trait U8 {
fn as_usize(self) -> usize;
}
impl U8 for u8 {
fn as_usize(self) -> usize {
usize::from(self)
}
}
pub(crate) trait U16 {
fn as_usize(self) -> usize;
fn low_u8(self) -> u8;
fn high_u8(self) -> u8;
}
impl U16 for u16 {
fn as_usize(self) -> usize {
usize::from(self)
}
fn low_u8(self) -> u8 {
self as u8
}
fn high_u8(self) -> u8 {
(self >> 8) as u8
}
}
pub(crate) trait U32 {
fn as_usize(self) -> usize;
fn low_u8(self) -> u8;
fn low_u16(self) -> u16;
fn high_u16(self) -> u16;
}
impl U32 for u32 {
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("u32 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn low_u8(self) -> u8 {
self as u8
}
fn low_u16(self) -> u16 {
self as u16
}
fn high_u16(self) -> u16 {
(self >> 16) as u16
}
}
pub(crate) trait U64 {
fn as_usize(self) -> usize;
fn low_u8(self) -> u8;
fn low_u16(self) -> u16;
fn low_u32(self) -> u32;
fn high_u32(self) -> u32;
}
impl U64 for u64 {
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("u64 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn low_u8(self) -> u8 {
self as u8
}
fn low_u16(self) -> u16 {
self as u16
}
fn low_u32(self) -> u32 {
self as u32
}
fn high_u32(self) -> u32 {
(self >> 32) as u32
}
}
pub(crate) trait I32 {
fn as_usize(self) -> usize;
fn to_bits(self) -> u32;
fn from_bits(n: u32) -> i32;
}
impl I32 for i32 {
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("i32 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn to_bits(self) -> u32 {
self as u32
}
fn from_bits(n: u32) -> i32 {
n as i32
}
}
pub(crate) trait Usize {
fn as_u8(self) -> u8;
fn as_u16(self) -> u16;
fn as_u32(self) -> u32;
fn as_u64(self) -> u64;
}
impl Usize for usize {
fn as_u8(self) -> u8 {
#[cfg(debug_assertions)]
{
u8::try_from(self).expect("usize overflowed u8")
}
#[cfg(not(debug_assertions))]
{
self as u8
}
}
fn as_u16(self) -> u16 {
#[cfg(debug_assertions)]
{
u16::try_from(self).expect("usize overflowed u16")
}
#[cfg(not(debug_assertions))]
{
self as u16
}
}
fn as_u32(self) -> u32 {
#[cfg(debug_assertions)]
{
u32::try_from(self).expect("usize overflowed u32")
}
#[cfg(not(debug_assertions))]
{
self as u32
}
}
fn as_u64(self) -> u64 {
#[cfg(debug_assertions)]
{
u64::try_from(self).expect("usize overflowed u64")
}
#[cfg(not(debug_assertions))]
{
self as u64
}
}
}
// Pointers aren't integers, but we convert pointers to integers to perform
// offset arithmetic in some places. (And no, we don't convert the integers
// back to pointers.) So add 'as_usize' conversions here too for completeness.
//
// These 'as' casts are actually okay because they're always non-lossy. But the
// idea here is to just try and remove as much 'as' as possible, particularly
// in this crate where we are being really paranoid about offsets and making
// sure we don't panic on inputs that might be untrusted. This way, the 'as'
// casts become easier to audit if they're all in one place, even when some of
// them are actually okay 100% of the time.
pub(crate) trait Pointer {
fn as_usize(self) -> usize;
}
impl<T> Pointer for *const T {
fn as_usize(self) -> usize {
self as usize
}
}
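// Illustrative sketch (not part of the vendored source): typical use of the
// traits above. In debug builds, `i.as_u8()` panics if `i` doesn't fit in a
// u8; in release builds it silently truncates, which is exactly the trade-off
// described in the module docs. `packed.high_u16()` and `packed.low_u16()`
// are the deliberately truncating casts and behave the same in all profiles.
#[allow(dead_code)]
fn cast_sketch(i: usize, packed: u32) -> (u8, usize, u16, u16) {
    (i.as_u8(), packed.as_usize(), packed.high_u16(), packed.low_u16())
}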

View File

@@ -0,0 +1,576 @@
/*!
Provides routines for interpolating capture group references.
That is, if a replacement string contains references like `$foo` or `${foo1}`,
then they are replaced with the corresponding capture values for the groups
named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}`
is supported as well, with `1` corresponding to a capture group index and not
a name.
This module provides the free functions [`string`] and [`bytes`], which
interpolate Rust Unicode strings and byte strings, respectively.
# Format
These routines support two different kinds of capture references: unbraced and
braced.
For the unbraced format, the format supported is `$ref` where `ref` can be
any character in the class `[0-9A-Za-z_]`. `ref` is always the longest
possible parse. So for example, `$1a` corresponds to the capture group named
`1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then
it is treated as a capture group index itself and not a name.
For the braced format, the format supported is `${ref}` where `ref` can be any
sequence of bytes except for `}`. If no closing brace occurs, then it is not
considered a capture reference. As with the unbraced format, if `ref` matches
`^[0-9]+$`, then it is treated as a capture group index and not a name.
The braced format is useful for exerting precise control over the name of the
capture reference. For example, `${1}a` corresponds to the capture group
reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above)
corresponds to the capture group reference `1a`. The braced format is also
useful for expressing capture group names that use characters not supported by
the unbraced format. For example, `${foo[bar].baz}` refers to the capture group
named `foo[bar].baz`.
If a capture group reference is found and it does not refer to a valid capture
group, then it will be replaced with the empty string.
To write a literal `$`, use `$$`.
To be clear, and as exhibited via the type signatures in the routines in this
module, it is impossible for a replacement string to be invalid. A replacement
string may not have the intended semantics, but the interpolation procedure
itself can never fail.
*/
use alloc::{string::String, vec::Vec};
use crate::util::memchr::memchr;
/// Accepts a replacement string and interpolates capture references with their
/// corresponding values.
///
/// `append` should be a function that appends the string value of a capture
/// group at a particular index to the string given. If the capture group
/// index is invalid, then nothing should be appended.
///
/// `name_to_index` should be a function that maps a capture group name to a
/// capture group index. If the given name doesn't exist, then `None` should
/// be returned.
///
/// Finally, `dst` is where the final interpolated contents should be written.
/// If `replacement` contains no capture group references, then `dst` will be
/// equivalent to `replacement`.
///
/// See the [module documentation](self) for details about the format
/// supported.
///
/// # Example
///
/// ```
/// use regex_automata::util::interpolate;
///
/// let mut dst = String::new();
/// interpolate::string(
/// "foo $bar baz",
/// |index, dst| {
/// if index == 0 {
/// dst.push_str("BAR");
/// }
/// },
/// |name| {
/// if name == "bar" {
/// Some(0)
/// } else {
/// None
/// }
/// },
/// &mut dst,
/// );
/// assert_eq!("foo BAR baz", dst);
/// ```
pub fn string(
mut replacement: &str,
mut append: impl FnMut(usize, &mut String),
mut name_to_index: impl FnMut(&str) -> Option<usize>,
dst: &mut String,
) {
while !replacement.is_empty() {
match memchr(b'$', replacement.as_bytes()) {
None => break,
Some(i) => {
dst.push_str(&replacement[..i]);
replacement = &replacement[i..];
}
}
// Handle escaping of '$'.
if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
dst.push_str("$");
replacement = &replacement[2..];
continue;
}
debug_assert!(!replacement.is_empty());
let cap_ref = match find_cap_ref(replacement.as_bytes()) {
Some(cap_ref) => cap_ref,
None => {
dst.push_str("$");
replacement = &replacement[1..];
continue;
}
};
replacement = &replacement[cap_ref.end..];
match cap_ref.cap {
Ref::Number(i) => append(i, dst),
Ref::Named(name) => {
if let Some(i) = name_to_index(name) {
append(i, dst);
}
}
}
}
dst.push_str(replacement);
}
/// Accepts a replacement byte string and interpolates capture references with
/// their corresponding values.
///
/// `append` should be a function that appends the byte string value of a
/// capture group at a particular index to the byte string given. If the
/// capture group index is invalid, then nothing should be appended.
///
/// `name_to_index` should be a function that maps a capture group name to a
/// capture group index. If the given name doesn't exist, then `None` should
/// be returned.
///
/// Finally, `dst` is where the final interpolated contents should be written.
/// If `replacement` contains no capture group references, then `dst` will be
/// equivalent to `replacement`.
///
/// See the [module documentation](self) for details about the format
/// supported.
///
/// # Example
///
/// ```
/// use regex_automata::util::interpolate;
///
/// let mut dst = vec![];
/// interpolate::bytes(
/// b"foo $bar baz",
/// |index, dst| {
/// if index == 0 {
/// dst.extend_from_slice(b"BAR");
/// }
/// },
/// |name| {
/// if name == "bar" {
/// Some(0)
/// } else {
/// None
/// }
/// },
/// &mut dst,
/// );
/// assert_eq!(&b"foo BAR baz"[..], dst);
/// ```
pub fn bytes(
mut replacement: &[u8],
mut append: impl FnMut(usize, &mut Vec<u8>),
mut name_to_index: impl FnMut(&str) -> Option<usize>,
dst: &mut Vec<u8>,
) {
while !replacement.is_empty() {
match memchr(b'$', replacement) {
None => break,
Some(i) => {
dst.extend_from_slice(&replacement[..i]);
replacement = &replacement[i..];
}
}
// Handle escaping of '$'.
if replacement.get(1).map_or(false, |&b| b == b'$') {
dst.push(b'$');
replacement = &replacement[2..];
continue;
}
debug_assert!(!replacement.is_empty());
let cap_ref = match find_cap_ref(replacement) {
Some(cap_ref) => cap_ref,
None => {
dst.push(b'$');
replacement = &replacement[1..];
continue;
}
};
replacement = &replacement[cap_ref.end..];
match cap_ref.cap {
Ref::Number(i) => append(i, dst),
Ref::Named(name) => {
if let Some(i) = name_to_index(name) {
append(i, dst);
}
}
}
}
dst.extend_from_slice(replacement);
}
/// `CaptureRef` represents a reference to a capture group inside some text.
/// The reference is either a capture group name or a number.
///
/// It is also tagged with the position in the text following the
/// capture reference.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct CaptureRef<'a> {
cap: Ref<'a>,
end: usize,
}
/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
Named(&'a str),
Number(usize),
}
impl<'a> From<&'a str> for Ref<'a> {
fn from(x: &'a str) -> Ref<'a> {
Ref::Named(x)
}
}
impl From<usize> for Ref<'static> {
fn from(x: usize) -> Ref<'static> {
Ref::Number(x)
}
}
/// Parses a possible reference to a capture group name in the given text,
/// starting at the beginning of `replacement`.
///
/// If no such valid reference could be found, None is returned.
///
/// Note that this returns a "possible" reference because this routine doesn't
/// know whether the reference is to a valid group or not. If it winds up not
/// being a valid reference, then it should be replaced with the empty string.
fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
let mut i = 0;
let rep: &[u8] = replacement;
if rep.len() <= 1 || rep[0] != b'$' {
return None;
}
i += 1;
if rep[i] == b'{' {
return find_cap_ref_braced(rep, i + 1);
}
let mut cap_end = i;
while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
cap_end += 1;
}
if cap_end == i {
return None;
}
// We just verified that the range 0..cap_end is valid ASCII, so it must
// therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
// check via an unchecked conversion or by parsing the number straight from
// &[u8].
let cap = core::str::from_utf8(&rep[i..cap_end])
.expect("valid UTF-8 capture name");
Some(CaptureRef {
cap: match cap.parse::<usize>() {
Ok(i) => Ref::Number(i),
Err(_) => Ref::Named(cap),
},
end: cap_end,
})
}
/// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening
/// brace has been found at `i-1` in `rep`. This then looks for a closing
/// brace and returns the capture reference within the brace.
fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]);
let start = i;
while rep.get(i).map_or(false, |&b| b != b'}') {
i += 1;
}
if !rep.get(i).map_or(false, |&b| b == b'}') {
return None;
}
// When looking at braced names, we don't put any restrictions on the name,
// so it's possible it could be invalid UTF-8. But a capture group name
// can never be invalid UTF-8, so if we have invalid UTF-8, then we can
// safely return None.
let cap = match core::str::from_utf8(&rep[start..i]) {
Err(_) => return None,
Ok(cap) => cap,
};
Some(CaptureRef {
cap: match cap.parse::<usize>() {
Ok(i) => Ref::Number(i),
Err(_) => Ref::Named(cap),
},
end: i + 1,
})
}
/// Returns true if and only if the given byte is allowed in a capture name
/// written in non-brace form.
fn is_valid_cap_letter(b: u8) -> bool {
matches!(b, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
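// Illustrative sketch (not part of the vendored source): the difference
// between the unbraced and braced formats described in the module docs. With
// capture group 1 bound to "xxx" and no group named "1a", `$1a` parses as the
// (nonexistent) group named "1a" and expands to the empty string, while
// `${1}a` expands to group 1 followed by a literal 'a'.
#[cfg(test)]
#[test]
fn braced_vs_unbraced_sketch() {
    use alloc::string::String;

    let expand = |replacement: &str| {
        let mut dst = String::new();
        string(
            replacement,
            |index, dst| {
                if index == 1 {
                    dst.push_str("xxx");
                }
            },
            |_name| None,
            &mut dst,
        );
        dst
    };
    assert_eq!("", expand("$1a"));
    assert_eq!("xxxa", expand("${1}a"));
}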
#[cfg(test)]
mod tests {
use alloc::{string::String, vec, vec::Vec};
use super::{find_cap_ref, CaptureRef};
macro_rules! find {
($name:ident, $text:expr) => {
#[test]
fn $name() {
assert_eq!(None, find_cap_ref($text.as_bytes()));
}
};
($name:ident, $text:expr, $capref:expr) => {
#[test]
fn $name() {
assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
}
};
}
macro_rules! c {
($name_or_number:expr, $pos:expr) => {
CaptureRef { cap: $name_or_number.into(), end: $pos }
};
}
find!(find_cap_ref1, "$foo", c!("foo", 4));
find!(find_cap_ref2, "${foo}", c!("foo", 6));
find!(find_cap_ref3, "$0", c!(0, 2));
find!(find_cap_ref4, "$5", c!(5, 2));
find!(find_cap_ref5, "$10", c!(10, 3));
// See https://github.com/rust-lang/regex/pull/585
// for more on characters following numbers
find!(find_cap_ref6, "$42a", c!("42a", 4));
find!(find_cap_ref7, "${42}a", c!(42, 5));
find!(find_cap_ref8, "${42");
find!(find_cap_ref9, "${42 ");
find!(find_cap_ref10, " $0 ");
find!(find_cap_ref11, "$");
find!(find_cap_ref12, " ");
find!(find_cap_ref13, "");
find!(find_cap_ref14, "$1-$2", c!(1, 2));
find!(find_cap_ref15, "$1_$2", c!("1_", 3));
find!(find_cap_ref16, "$x-$y", c!("x", 2));
find!(find_cap_ref17, "$x_$y", c!("x_", 3));
find!(find_cap_ref18, "${#}", c!("#", 4));
find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
find!(find_cap_ref20, "${¾}", c!("¾", 5));
find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
find!(find_cap_ref23, "${☃}", c!("☃", 6));
find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
find!(find_cap_ref26, "${名字}", c!("名字", 9));
fn interpolate_string(
mut name_to_index: Vec<(&'static str, usize)>,
caps: Vec<&'static str>,
replacement: &str,
) -> String {
name_to_index.sort_by_key(|x| x.0);
let mut dst = String::new();
super::string(
replacement,
|i, dst| {
if let Some(&s) = caps.get(i) {
dst.push_str(s);
}
},
|name| -> Option<usize> {
name_to_index
.binary_search_by_key(&name, |x| x.0)
.ok()
.map(|i| name_to_index[i].1)
},
&mut dst,
);
dst
}
fn interpolate_bytes(
mut name_to_index: Vec<(&'static str, usize)>,
caps: Vec<&'static str>,
replacement: &str,
) -> String {
name_to_index.sort_by_key(|x| x.0);
let mut dst = vec![];
super::bytes(
replacement.as_bytes(),
|i, dst| {
if let Some(&s) = caps.get(i) {
dst.extend_from_slice(s.as_bytes());
}
},
|name| -> Option<usize> {
name_to_index
.binary_search_by_key(&name, |x| x.0)
.ok()
.map(|i| name_to_index[i].1)
},
&mut dst,
);
String::from_utf8(dst).unwrap()
}
macro_rules! interp {
($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
#[test]
fn $name() {
assert_eq!(
$expected,
interpolate_string($map, $caps, $hay),
"interpolate::string failed",
);
assert_eq!(
$expected,
interpolate_bytes($map, $caps, $hay),
"interpolate::bytes failed",
);
}
};
}
interp!(
interp1,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test $foo test",
"test xxx test",
);
interp!(
interp2,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test$footest",
"test",
);
interp!(
interp3,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test${foo}test",
"testxxxtest",
);
interp!(
interp4,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test$2test",
"test",
);
interp!(
interp5,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test${2}test",
"testxxxtest",
);
interp!(
interp6,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test $$foo test",
"test $foo test",
);
interp!(
interp7,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test $foo",
"test xxx",
);
interp!(
interp8,
vec![("foo", 2)],
vec!["", "", "xxx"],
"$foo test",
"xxx test",
);
interp!(
interp9,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test $bar$foo",
"test yyyxxx",
);
interp!(
interp10,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test $ test",
"test $ test",
);
interp!(
interp11,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test ${} test",
"test test",
);
interp!(
interp12,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test ${ } test",
"test test",
);
interp!(
interp13,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test ${a b} test",
"test test",
);
interp!(
interp14,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test ${a} test",
"test test",
);
// This is a funny case where a braced reference is never closed, but
// within the unclosed braced reference, there is an unbraced reference.
// In this case, the braced reference is just treated literally and the
// unbraced reference is found.
interp!(
interp15,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test ${wat $bar ok",
"test ${wat yyy ok",
);
}

1022
vendor/regex-automata/src/util/iter.rs vendored Normal file

File diff suppressed because it is too large

461
vendor/regex-automata/src/util/lazy.rs vendored Normal file
View File

@@ -0,0 +1,461 @@
/*!
A lazily initialized value for safe sharing between threads.
The principal type in this module is `Lazy`, which makes it easy to construct
values that are shared safely across multiple threads simultaneously.
*/
use core::fmt;
/// A lazily initialized value that implements `Deref` for `T`.
///
/// A `Lazy` takes an initialization function and permits callers from any
/// thread to access the result of that initialization function in a safe
/// manner. In effect, this permits one-time initialization of global resources
/// in a (possibly) multi-threaded program.
///
/// This type and its functionality are available even when neither the `alloc`
/// nor the `std` features are enabled. In exchange, a `Lazy` does **not**
/// guarantee that the given `create` function is called at most once. It
/// might be called multiple times. Moreover, a call to `Lazy::get` (either
/// explicitly or implicitly via `Lazy`'s `Deref` impl) may block until a `T`
/// is available.
///
/// This is very similar to `lazy_static` or `once_cell`, except it doesn't
/// guarantee that the initialization function will be run once and it works
/// in no-alloc no-std environments. With that said, if you need stronger
/// guarantees or a more flexible API, then it is recommended to use either
/// `lazy_static` or `once_cell`.
///
/// # Warning: may use a spin lock
///
/// When this crate is compiled _without_ the `alloc` feature, then this type
/// may use a spin lock internally. This can have subtle effects that may
/// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more
/// thorough treatment of this topic.
///
/// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
///
/// # Example
///
/// This type is useful for creating regexes once, and then using them from
/// multiple threads simultaneously without worrying about synchronization.
///
/// ```
/// use regex_automata::{dfa::regex::Regex, util::lazy::Lazy, Match};
///
/// static RE: Lazy<Regex> = Lazy::new(|| Regex::new("foo[0-9]+bar").unwrap());
///
/// let expected = Some(Match::must(0, 3..14));
/// assert_eq!(expected, RE.find(b"zzzfoo12345barzzz"));
/// ```
pub struct Lazy<T, F = fn() -> T>(lazy::Lazy<T, F>);
impl<T, F> Lazy<T, F> {
/// Create a new `Lazy` value that is initialized via the given function.
///
/// The `T` type is automatically inferred from the return type of the
/// `create` function given.
pub const fn new(create: F) -> Lazy<T, F> {
Lazy(lazy::Lazy::new(create))
}
}
impl<T, F: Fn() -> T> Lazy<T, F> {
/// Return a reference to the lazily initialized value.
///
/// This routine may block if another thread is initializing a `T`.
///
/// Note that given an `x` which has type `Lazy`, this must be called via
/// `Lazy::get(x)` and not `x.get()`. This routine is defined this way
/// because `Lazy` impls `Deref` with a target of `T`.
///
/// # Panics
///
/// This panics if the `create` function inside this lazy value panics.
/// If the panic occurred in another thread, then this routine _may_ also
/// panic (but is not guaranteed to do so).
pub fn get(this: &Lazy<T, F>) -> &T {
this.0.get()
}
}
impl<T, F: Fn() -> T> core::ops::Deref for Lazy<T, F> {
type Target = T;
fn deref(&self) -> &T {
Lazy::get(self)
}
}
impl<T: fmt::Debug, F: Fn() -> T> fmt::Debug for Lazy<T, F> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.0.fmt(f)
}
}
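// Illustrative sketch (not part of the vendored source): `Lazy::get` and the
// `Deref` impl reach the same lazily initialized value, so either spelling
// works once a `Lazy` has been declared as a static.
#[cfg(test)]
#[test]
fn get_vs_deref_sketch() {
    static ANSWER: Lazy<u64> = Lazy::new(|| 21 * 2);
    assert_eq!(42, *Lazy::get(&ANSWER));
    assert_eq!(42, *ANSWER);
}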
#[cfg(feature = "alloc")]
mod lazy {
use core::{
fmt,
marker::PhantomData,
sync::atomic::{AtomicPtr, Ordering},
};
use alloc::boxed::Box;
/// A non-std lazy initialized value.
///
/// This might run the initialization function more than once, but will
/// never block.
///
/// I wish I could get these semantics into the non-alloc non-std Lazy
/// type below, but I'm not sure how to do it. If you can do an alloc,
/// then the implementation becomes very simple if you don't care about
/// redundant work precisely because a pointer can be atomically swapped.
///
/// Perhaps making this approach work in the non-alloc non-std case
/// requires asking the caller for a pointer? It would make the API less
/// convenient I think.
pub(super) struct Lazy<T, F> {
data: AtomicPtr<T>,
create: F,
// This indicates to the compiler that this type can drop T. It's not
// totally clear how the absence of this marker could lead to trouble,
// but putting it here doesn't have any downsides, so we hedge until someone
// from the Unsafe Working Group can tell us definitively that we
// don't need it.
//
// See: https://github.com/BurntSushi/regex-automata/issues/30
owned: PhantomData<Box<T>>,
}
// SAFETY: So long as T and &T (and F and &F) can themselves be safely
// shared among threads, so too can a Lazy<T, _>. Namely, the Lazy API only
// permits accessing a &T and initialization is free of data races. So if T
// is thread safe, then so too is Lazy<T, _>.
//
// We specifically require that T: Send in order for Lazy<T> to be Sync.
// Without that requirement, it's possible to send a T from one thread to
// another via Lazy's destructor.
//
// It's not clear whether we need F: Send+Sync for Lazy to be Sync. But
// we're conservative for now and keep both.
unsafe impl<T: Send + Sync, F: Send + Sync> Sync for Lazy<T, F> {}
impl<T, F> Lazy<T, F> {
/// Create a new alloc but non-std lazy value that is racily
/// initialized. That is, the 'create' function may be called more than
/// once.
pub(super) const fn new(create: F) -> Lazy<T, F> {
Lazy {
data: AtomicPtr::new(core::ptr::null_mut()),
create,
owned: PhantomData,
}
}
}
impl<T, F: Fn() -> T> Lazy<T, F> {
/// Get the underlying lazy value. If it hasn't been initialized
/// yet, then always attempt to initialize it (even if some other
/// thread is initializing it) and atomically attach it to this lazy
/// value before returning it.
pub(super) fn get(&self) -> &T {
if let Some(data) = self.poll() {
return data;
}
let data = (self.create)();
let mut ptr = Box::into_raw(Box::new(data));
// We attempt to stuff our initialized value into our atomic
// pointer. Upon success, we don't need to do anything. But if
// someone else beat us to the punch, then we need to make sure
// our newly created value is dropped.
let result = self.data.compare_exchange(
core::ptr::null_mut(),
ptr,
Ordering::AcqRel,
Ordering::Acquire,
);
if let Err(old) = result {
// SAFETY: We created 'ptr' via Box::into_raw above, so turning
// it back into a Box via from_raw is safe.
drop(unsafe { Box::from_raw(ptr) });
ptr = old;
}
// SAFETY: We just set the pointer above to a non-null value, even
// in the error case, and set it to a fully initialized value
// returned by 'create'.
unsafe { &*ptr }
}
/// If this lazy value has been initialized successfully, then return
/// that value. Otherwise return None immediately. This never attempts
/// to run initialization itself.
fn poll(&self) -> Option<&T> {
let ptr = self.data.load(Ordering::Acquire);
if ptr.is_null() {
return None;
}
// SAFETY: We just checked that the pointer is not null. Since it's
// not null, it must have been fully initialized by 'get' at some
// point.
Some(unsafe { &*ptr })
}
}
impl<T: fmt::Debug, F: Fn() -> T> fmt::Debug for Lazy<T, F> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("Lazy").field("data", &self.poll()).finish()
}
}
impl<T, F> Drop for Lazy<T, F> {
fn drop(&mut self) {
let ptr = *self.data.get_mut();
if !ptr.is_null() {
// SAFETY: We just checked that 'ptr' is not null. And since
// we have exclusive access, there are no races to worry about.
drop(unsafe { Box::from_raw(ptr) });
}
}
}
}
#[cfg(not(feature = "alloc"))]
mod lazy {
use core::{
cell::Cell,
fmt,
mem::MaybeUninit,
panic::{RefUnwindSafe, UnwindSafe},
sync::atomic::{AtomicU8, Ordering},
};
/// Our 'Lazy' value can be in one of three states:
///
/// * INIT is where it starts, and also ends up back here if the
/// 'create' routine panics.
/// * BUSY is where it sits while initialization is running in exactly
/// one thread.
/// * DONE is where it sits after 'create' has completed and 'data' has
/// been fully initialized.
const LAZY_STATE_INIT: u8 = 0;
const LAZY_STATE_BUSY: u8 = 1;
const LAZY_STATE_DONE: u8 = 2;
/// A non-alloc non-std lazy initialized value.
///
/// This guarantees initialization only happens once, but uses a spinlock
/// to block in the case of simultaneous access. Blocking occurs so that
/// one thread waits while another thread initializes the value.
///
/// I would much rather have the semantics of the 'alloc' Lazy type above.
/// Namely, that we might run the initialization function more than once,
/// but we never otherwise block. However, I don't know how to do that in
/// a non-alloc non-std context.
pub(super) struct Lazy<T, F> {
state: AtomicU8,
create: Cell<Option<F>>,
data: Cell<MaybeUninit<T>>,
}
// SAFETY: So long as T and &T (and F and &F) can themselves be safely
// shared among threads, so too can a Lazy<T, _>. Namely, the Lazy API only
// permits accessing a &T and initialization is free of data races. So if T
// is thread safe, then so too is Lazy<T, _>.
unsafe impl<T: Send + Sync, F: Send + Sync> Sync for Lazy<T, F> {}
// A reference to a Lazy is unwind safe because we specifically take
// precautions to poison all accesses to a Lazy if the caller-provided
// 'create' function panics.
impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> RefUnwindSafe
for Lazy<T, F>
{
}
impl<T, F> Lazy<T, F> {
/// Create a new non-alloc non-std lazy value that is initialized
/// exactly once on first use using the given function.
pub(super) const fn new(create: F) -> Lazy<T, F> {
Lazy {
state: AtomicU8::new(LAZY_STATE_INIT),
create: Cell::new(Some(create)),
data: Cell::new(MaybeUninit::uninit()),
}
}
}
impl<T, F: FnOnce() -> T> Lazy<T, F> {
/// Get the underlying lazy value. If it hasn't been initialized
/// yet, then either initialize it or block until some other thread
/// initializes it. If the 'create' function given to Lazy::new panics
/// (even in another thread), then this panics too.
pub(super) fn get(&self) -> &T {
// This is effectively a spinlock. We loop until we enter a DONE
// state, and if possible, initialize it ourselves. The only way
// we exit the loop is if 'create' panics, we initialize 'data' or
// some other thread initializes 'data'.
//
// Yes, I have read spinlocks considered harmful[1]. And that
// article is why this spinlock is only active when 'alloc' isn't
// enabled. I did this because I don't think there is really
// another choice without 'alloc', other than not providing this at
// all. But I think that's a big bummer.
//
// [1]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
while self.state.load(Ordering::Acquire) != LAZY_STATE_DONE {
// Check if we're the first ones to get here. If so, we'll be
// the ones who initialize.
let result = self.state.compare_exchange(
LAZY_STATE_INIT,
LAZY_STATE_BUSY,
Ordering::AcqRel,
Ordering::Acquire,
);
// This means we saw the INIT state and nobody else can. So we
// must take responsibility for initializing. And by virtue of
// observing INIT, we have also told anyone else trying to
// get here that we are BUSY. If someone else sees BUSY, then
// they will spin until we finish initialization.
if let Ok(_) = result {
// Since we are guaranteed to be the only ones here, we
// know that 'create' is there... Unless someone else got
// here before us and 'create' panicked. In which case,
// 'self.create' is now 'None' and we forward the panic
// to the caller. (i.e., We implement poisoning.)
//
// SAFETY: Our use of 'self.state' guarantees that we are
// the only thread executing this line, and thus there are
// no races.
let create = unsafe {
(*self.create.as_ptr()).take().expect(
"Lazy's create function panicked, \
preventing initialization, \
poisoning current thread",
)
};
let guard = Guard { state: &self.state };
// SAFETY: Our use of 'self.state' guarantees that we are
// the only thread executing this line, and thus there are
// no races.
unsafe {
(*self.data.as_ptr()).as_mut_ptr().write(create());
}
// All is well. 'self.create' ran successfully, so we
// forget the guard.
core::mem::forget(guard);
// Everything is initialized, so we can declare success.
self.state.store(LAZY_STATE_DONE, Ordering::Release);
break;
}
core::hint::spin_loop();
}
// We only get here if data is fully initialized, and thus poll
// will always return something.
self.poll().unwrap()
}
/// If this lazy value has been initialized successfully, then return
/// that value. Otherwise return None immediately. This never blocks.
fn poll(&self) -> Option<&T> {
if self.state.load(Ordering::Acquire) == LAZY_STATE_DONE {
// SAFETY: The DONE state only occurs when data has been fully
// initialized.
Some(unsafe { &*(*self.data.as_ptr()).as_ptr() })
} else {
None
}
}
}
impl<T: fmt::Debug, F: FnMut() -> T> fmt::Debug for Lazy<T, F> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("Lazy")
.field("state", &self.state.load(Ordering::Acquire))
.field("create", &"<closure>")
.field("data", &self.poll())
.finish()
}
}
impl<T, F> Drop for Lazy<T, F> {
fn drop(&mut self) {
if *self.state.get_mut() == LAZY_STATE_DONE {
// SAFETY: state is DONE if and only if data has been fully
// initialized. At which point, it is safe to drop.
unsafe {
self.data.get_mut().assume_init_drop();
}
}
}
}
/// A guard that will reset a Lazy's state back to INIT when dropped. The
/// idea here is to 'forget' this guard on success. On failure (when a
/// panic occurs), the Drop impl runs and causes all in-progress and future
/// 'get' calls to panic. Without this guard, all in-progress and future
/// 'get' calls would spin forever. Crashing is much better than getting
/// stuck in an infinite loop.
struct Guard<'a> {
state: &'a AtomicU8,
}
impl<'a> Drop for Guard<'a> {
fn drop(&mut self) {
// We force ourselves back into an INIT state. This will in turn
// cause any future 'get' calls to attempt calling 'self.create'
// again which will in turn panic because 'self.create' will now
// be 'None'.
self.state.store(LAZY_STATE_INIT, Ordering::Release);
}
}
}
#[cfg(test)]
mod tests {
use super::*;
fn assert_send<T: Send>() {}
fn assert_sync<T: Sync>() {}
fn assert_unwind<T: core::panic::UnwindSafe>() {}
fn assert_refunwind<T: core::panic::RefUnwindSafe>() {}
#[test]
fn oibits() {
assert_send::<Lazy<u64>>();
assert_sync::<Lazy<u64>>();
assert_unwind::<Lazy<u64>>();
assert_refunwind::<Lazy<u64>>();
}
// This is a regression test because we used to rely on the inferred Sync
// impl for the Lazy type defined above (for 'alloc' mode). In the
// inferred impl, it only requires that T: Sync for Lazy<T>: Sync. But
// if we have that, we can actually make use of the fact that Lazy<T> drops
// T to create a value on one thread and drop it on another. This *should*
// require T: Send, but our missing bounds before let it sneak by.
//
// Basically, this test should not compile, so we... comment it out. We
// don't have a great way of testing compile-fail tests right now.
//
// See: https://github.com/BurntSushi/regex-automata/issues/30
/*
#[test]
fn sync_not_send() {
#[allow(dead_code)]
fn inner<T: Sync + Default>() {
let lazy = Lazy::new(move || T::default());
std::thread::scope(|scope| {
scope.spawn(|| {
Lazy::get(&lazy); // We create T in this thread
});
});
// And drop in this thread.
drop(lazy);
// So we have sent a !Send type over threads. (With some more
// legwork, it's possible to even sneak the value out of drop
// through a thread local.)
}
}
*/
}

2547
vendor/regex-automata/src/util/look.rs vendored Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,93 @@
/*!
This module defines simple wrapper routines for the memchr functions from the
`memchr` crate. Basically, when the `memchr` crate is available, we use it,
otherwise we use a naive implementation which is still pretty fast.
*/
pub(crate) use self::inner::*;
#[cfg(feature = "perf-literal-substring")]
pub(super) mod inner {
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
memchr::memchr(n1, haystack)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
memchr::memchr2(n1, n2, haystack)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn memchr3(
n1: u8,
n2: u8,
n3: u8,
haystack: &[u8],
) -> Option<usize> {
memchr::memchr3(n1, n2, n3, haystack)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
memchr::memrchr(n1, haystack)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
memchr::memrchr2(n1, n2, haystack)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn memrchr3(
n1: u8,
n2: u8,
n3: u8,
haystack: &[u8],
) -> Option<usize> {
memchr::memrchr3(n1, n2, n3, haystack)
}
}
#[cfg(not(feature = "perf-literal-substring"))]
pub(super) mod inner {
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
haystack.iter().position(|&b| b == n1)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
haystack.iter().position(|&b| b == n1 || b == n2)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn memchr3(
n1: u8,
n2: u8,
n3: u8,
haystack: &[u8],
) -> Option<usize> {
haystack.iter().position(|&b| b == n1 || b == n2 || b == n3)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
haystack.iter().rposition(|&b| b == n1)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
haystack.iter().rposition(|&b| b == n1 || b == n2)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn memrchr3(
n1: u8,
n2: u8,
n3: u8,
haystack: &[u8],
) -> Option<usize> {
haystack.iter().rposition(|&b| b == n1 || b == n2 || b == n3)
}
}
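// Illustrative sketch (not part of the vendored source): whichever `inner`
// module gets compiled in, the wrappers behave like the classic memchr
// routines, returning the index of the first (or, for the `memrchr` variants,
// the last) byte equal to one of the needles.
#[cfg(test)]
#[test]
fn wrapper_sketch() {
    let hay = b"abcabc";
    assert_eq!(Some(1), memchr(b'b', hay));
    assert_eq!(Some(4), memrchr(b'b', hay));
    assert_eq!(Some(0), memchr2(b'z', b'a', hay));
    assert_eq!(None, memchr3(b'x', b'y', b'z', hay));
}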

57
vendor/regex-automata/src/util/mod.rs vendored Normal file
View File

@@ -0,0 +1,57 @@
/*!
A collection of modules that provide APIs that are useful across many regex
engines.
While one should explore the sub-modules directly to get a sense of what's
there, here are some highlights that tie the sub-modules to higher level
use cases:
* `alphabet` contains APIs that are useful if you're doing low level things
with the DFAs in this crate. For example, implementing determinization or
walking its state graph directly.
* `captures` contains APIs for dealing with capture group matches and their
mapping to "slots" used inside an NFA graph. This is also where you can find
iterators over capture group names.
* `escape` contains types for pretty-printing raw byte slices as strings.
* `iter` contains API helpers for writing regex iterators.
* `lazy` contains a no-std and no-alloc variant of `lazy_static!` and
`once_cell`.
* `look` contains APIs for matching and configuring look-around assertions.
* `pool` provides a way to reuse mutable memory allocated in a thread safe
manner.
* `prefilter` provides APIs for building prefilters and using them in searches.
* `primitives` are what you might use if you're doing lower level work on
automata, such as walking an NFA state graph.
* `syntax` provides some higher level convenience functions for interacting
with the `regex-syntax` crate.
* `wire` is useful if you're working with DFA serialization.
*/
pub mod alphabet;
#[cfg(feature = "alloc")]
pub mod captures;
pub mod escape;
#[cfg(feature = "alloc")]
pub mod interpolate;
pub mod iter;
pub mod lazy;
pub mod look;
#[cfg(feature = "alloc")]
pub mod pool;
pub mod prefilter;
pub mod primitives;
pub mod start;
#[cfg(feature = "syntax")]
pub mod syntax;
pub mod wire;
#[cfg(any(feature = "dfa-build", feature = "hybrid"))]
pub(crate) mod determinize;
pub(crate) mod empty;
pub(crate) mod int;
pub(crate) mod memchr;
pub(crate) mod search;
#[cfg(feature = "alloc")]
pub(crate) mod sparse_set;
pub(crate) mod unicode_data;
pub(crate) mod utf8;

1199
vendor/regex-automata/src/util/pool.rs vendored Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,149 @@
use crate::util::{
prefilter::PrefilterI,
search::{MatchKind, Span},
};
#[derive(Clone, Debug)]
pub(crate) struct AhoCorasick {
#[cfg(not(feature = "perf-literal-multisubstring"))]
_unused: (),
#[cfg(feature = "perf-literal-multisubstring")]
ac: aho_corasick::AhoCorasick,
}
impl AhoCorasick {
pub(crate) fn new<B: AsRef<[u8]>>(
kind: MatchKind,
needles: &[B],
) -> Option<AhoCorasick> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
None
}
#[cfg(feature = "perf-literal-multisubstring")]
{
// We used to use `aho_corasick::MatchKind::Standard` here when
// `kind` was `MatchKind::All`, but this is not correct. The
// "standard" Aho-Corasick match semantics are to report a match
// immediately as soon as it is seen, but `All` isn't like that.
// In particular, with "standard" semantics, given the needles
// "abc" and "b" and the haystack "abc," it would report a match
// at offset 1 before a match at offset 0. This is never what we
// want in the context of the regex engine, regardless of whether
// we have leftmost-first or 'all' semantics. Namely, we always
// want the leftmost match.
let ac_match_kind = match kind {
MatchKind::LeftmostFirst | MatchKind::All => {
aho_corasick::MatchKind::LeftmostFirst
}
};
// This is kind of just an arbitrary number, but basically, if we
// have a small enough set of literals, then we try to use the VERY
// memory hungry DFA. Otherwise, we wimp out and use an NFA. The
// upshot is that the NFA is quite lean and decently fast. Faster
// than a naive Aho-Corasick NFA anyway.
let ac_kind = if needles.len() <= 500 {
aho_corasick::AhoCorasickKind::DFA
} else {
aho_corasick::AhoCorasickKind::ContiguousNFA
};
let result = aho_corasick::AhoCorasick::builder()
.kind(Some(ac_kind))
.match_kind(ac_match_kind)
.start_kind(aho_corasick::StartKind::Both)
// We try to handle all of the prefilter cases in the super
// module, and only use Aho-Corasick for the actual automaton.
// The aho-corasick crate does have some extra prefilters,
// namely, looking for rare bytes to feed to memchr{,2,3}
// instead of just the first byte. If we end up wanting
// those---and they are somewhat tricky to implement---then
// we could port them to this crate.
//
// The main reason for doing things this way is so we have a
// complete and easy to understand picture of which prefilters
// are available and how they work. Otherwise it seems too
// easy to get into a situation where we have a prefilter
// layered on top of prefilter, and that might have unintended
// consequences.
.prefilter(false)
.build(needles);
let ac = match result {
Ok(ac) => ac,
Err(_err) => {
debug!("aho-corasick prefilter failed to build: {_err}");
return None;
}
};
Some(AhoCorasick { ac })
}
}
}
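// A minimal sketch (not part of the upstream crate) of the semantics issue
// described in the comments above: with aho-corasick's "standard" semantics,
// the needles "abc" and "b" would report the match of "b" in "abc" first,
// since it ends earliest, while leftmost-first reports "abc" at offset 0,
// which is what the regex engine always wants.
#[cfg(all(test, feature = "perf-literal-multisubstring"))]
mod leftmost_first_sketch {
    #[test]
    fn leftmost_first_prefers_the_leftmost_start() {
        let ac = aho_corasick::AhoCorasick::builder()
            .match_kind(aho_corasick::MatchKind::LeftmostFirst)
            .build(&["abc", "b"])
            .unwrap();
        let m = ac.find("abc").unwrap();
        assert_eq!((0, 3), (m.start(), m.end()));
    }
}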
impl PrefilterI for AhoCorasick {
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
let input =
aho_corasick::Input::new(haystack).span(span.start..span.end);
self.ac
.find(input)
.map(|m| Span { start: m.start(), end: m.end() })
}
}
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
let input = aho_corasick::Input::new(haystack)
.anchored(aho_corasick::Anchored::Yes)
.span(span.start..span.end);
self.ac
.find(input)
.map(|m| Span { start: m.start(), end: m.end() })
}
}
fn memory_usage(&self) -> usize {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
self.ac.memory_usage()
}
}
fn is_fast(&self) -> bool {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
// Aho-Corasick is never considered "fast" because it's never
// going to be even close to an order of magnitude faster than the
// regex engine itself (assuming a DFA is used). In fact, it is
// usually slower. The magic of Aho-Corasick is that it can search
// a *large* number of literals with a relatively small amount of
// memory. The regex engines are far more wasteful.
//
// Aho-Corasick may be "fast" when the regex engine corresponds
// to, say, the PikeVM. That happens when the lazy DFA couldn't be
// built or used for some reason. But in these cases, the regex
// itself is likely quite big and we're probably hosed no matter
// what we do. (In this case, the best bet is for the caller to
// increase some of the memory limits on the hybrid cache capacity
// and hope that's enough.)
false
}
}
}


@@ -0,0 +1,58 @@
use crate::util::{
prefilter::PrefilterI,
search::{MatchKind, Span},
};
#[derive(Clone, Debug)]
pub(crate) struct ByteSet([bool; 256]);
impl ByteSet {
pub(crate) fn new<B: AsRef<[u8]>>(
_kind: MatchKind,
needles: &[B],
) -> Option<ByteSet> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
None
}
#[cfg(feature = "perf-literal-multisubstring")]
{
let mut set = [false; 256];
for needle in needles.iter() {
let needle = needle.as_ref();
if needle.len() != 1 {
return None;
}
set[usize::from(needle[0])] = true;
}
Some(ByteSet(set))
}
}
}
impl PrefilterI for ByteSet {
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
haystack[span].iter().position(|&b| self.0[usize::from(b)]).map(|i| {
let start = span.start + i;
let end = start + 1;
Span { start, end }
})
}
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
let b = *haystack.get(span.start)?;
if self.0[usize::from(b)] {
Some(Span { start: span.start, end: span.start + 1 })
} else {
None
}
}
fn memory_usage(&self) -> usize {
0
}
fn is_fast(&self) -> bool {
false
}
}
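// A minimal sketch (not part of the upstream crate) of the behavior above:
// `find` scans the span for any byte in the set and reports a one-byte span.
#[cfg(all(test, feature = "perf-literal-multisubstring"))]
mod byteset_sketch {
    use super::ByteSet;
    use crate::util::{
        prefilter::PrefilterI,
        search::{MatchKind, Span},
    };

    #[test]
    fn finds_the_first_byte_in_the_set() {
        // Every needle must be a single byte, otherwise `new` returns `None`.
        let set = ByteSet::new(MatchKind::LeftmostFirst, &["a", "z"]).unwrap();
        let hay = b"xxzya";
        let span = Span { start: 0, end: hay.len() };
        assert_eq!(Some(Span { start: 2, end: 3 }), set.find(hay, span));
    }
}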


@@ -0,0 +1,186 @@
use crate::util::{
prefilter::PrefilterI,
search::{MatchKind, Span},
};
#[derive(Clone, Debug)]
pub(crate) struct Memchr(u8);
impl Memchr {
pub(crate) fn new<B: AsRef<[u8]>>(
_kind: MatchKind,
needles: &[B],
) -> Option<Memchr> {
#[cfg(not(feature = "perf-literal-substring"))]
{
None
}
#[cfg(feature = "perf-literal-substring")]
{
if needles.len() != 1 {
return None;
}
if needles[0].as_ref().len() != 1 {
return None;
}
Some(Memchr(needles[0].as_ref()[0]))
}
}
}
impl PrefilterI for Memchr {
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "perf-literal-substring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-substring")]
{
memchr::memchr(self.0, &haystack[span]).map(|i| {
let start = span.start + i;
let end = start + 1;
Span { start, end }
})
}
}
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
let b = *haystack.get(span.start)?;
if self.0 == b {
Some(Span { start: span.start, end: span.start + 1 })
} else {
None
}
}
fn memory_usage(&self) -> usize {
0
}
fn is_fast(&self) -> bool {
true
}
}
#[derive(Clone, Debug)]
pub(crate) struct Memchr2(u8, u8);
impl Memchr2 {
pub(crate) fn new<B: AsRef<[u8]>>(
_kind: MatchKind,
needles: &[B],
) -> Option<Memchr2> {
#[cfg(not(feature = "perf-literal-substring"))]
{
None
}
#[cfg(feature = "perf-literal-substring")]
{
if needles.len() != 2 {
return None;
}
if !needles.iter().all(|n| n.as_ref().len() == 1) {
return None;
}
let b1 = needles[0].as_ref()[0];
let b2 = needles[1].as_ref()[0];
Some(Memchr2(b1, b2))
}
}
}
impl PrefilterI for Memchr2 {
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "perf-literal-substring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-substring")]
{
memchr::memchr2(self.0, self.1, &haystack[span]).map(|i| {
let start = span.start + i;
let end = start + 1;
Span { start, end }
})
}
}
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
let b = *haystack.get(span.start)?;
if self.0 == b || self.1 == b {
Some(Span { start: span.start, end: span.start + 1 })
} else {
None
}
}
fn memory_usage(&self) -> usize {
0
}
fn is_fast(&self) -> bool {
true
}
}
#[derive(Clone, Debug)]
pub(crate) struct Memchr3(u8, u8, u8);
impl Memchr3 {
pub(crate) fn new<B: AsRef<[u8]>>(
_kind: MatchKind,
needles: &[B],
) -> Option<Memchr3> {
#[cfg(not(feature = "perf-literal-substring"))]
{
None
}
#[cfg(feature = "perf-literal-substring")]
{
if needles.len() != 3 {
return None;
}
if !needles.iter().all(|n| n.as_ref().len() == 1) {
return None;
}
let b1 = needles[0].as_ref()[0];
let b2 = needles[1].as_ref()[0];
let b3 = needles[2].as_ref()[0];
Some(Memchr3(b1, b2, b3))
}
}
}
impl PrefilterI for Memchr3 {
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "perf-literal-substring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-substring")]
{
memchr::memchr3(self.0, self.1, self.2, &haystack[span]).map(|i| {
let start = span.start + i;
let end = start + 1;
Span { start, end }
})
}
}
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
let b = *haystack.get(span.start)?;
if self.0 == b || self.1 == b || self.2 == b {
Some(Span { start: span.start, end: span.start + 1 })
} else {
None
}
}
fn memory_usage(&self) -> usize {
0
}
fn is_fast(&self) -> bool {
true
}
}
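// A minimal sketch (not part of the upstream crate) of the span arithmetic
// above: the offset reported by memchr is relative to `haystack[span]`, so it
// is re-based onto the original haystack by adding `span.start`.
#[cfg(all(test, feature = "perf-literal-substring"))]
mod memchr_sketch {
    use super::Memchr;
    use crate::util::{
        prefilter::PrefilterI,
        search::{MatchKind, Span},
    };

    #[test]
    fn rebases_offsets_onto_the_span() {
        let pre = Memchr::new(MatchKind::LeftmostFirst, &["z"]).unwrap();
        let hay = b"zaaz";
        // Searching 1..4 skips the 'z' at offset 0. memchr reports offset 2
        // within the slice "aaz", which is re-based to offset 3 overall.
        assert_eq!(
            Some(Span { start: 3, end: 4 }),
            pre.find(hay, Span { start: 1, end: 4 }),
        );
    }
}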


@@ -0,0 +1,88 @@
use crate::util::{
prefilter::PrefilterI,
search::{MatchKind, Span},
};
#[derive(Clone, Debug)]
pub(crate) struct Memmem {
#[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
_unused: (),
#[cfg(all(feature = "std", feature = "perf-literal-substring"))]
finder: memchr::memmem::Finder<'static>,
}
impl Memmem {
pub(crate) fn new<B: AsRef<[u8]>>(
_kind: MatchKind,
needles: &[B],
) -> Option<Memmem> {
#[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
{
None
}
#[cfg(all(feature = "std", feature = "perf-literal-substring"))]
{
if needles.len() != 1 {
return None;
}
let needle = needles[0].as_ref();
let finder = memchr::memmem::Finder::new(needle).into_owned();
Some(Memmem { finder })
}
}
}
impl PrefilterI for Memmem {
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
{
unreachable!()
}
#[cfg(all(feature = "std", feature = "perf-literal-substring"))]
{
self.finder.find(&haystack[span]).map(|i| {
let start = span.start + i;
let end = start + self.finder.needle().len();
Span { start, end }
})
}
}
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
{
unreachable!()
}
#[cfg(all(feature = "std", feature = "perf-literal-substring"))]
{
let needle = self.finder.needle();
if haystack[span].starts_with(needle) {
Some(Span { end: span.start + needle.len(), ..span })
} else {
None
}
}
}
fn memory_usage(&self) -> usize {
#[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
{
unreachable!()
}
#[cfg(all(feature = "std", feature = "perf-literal-substring"))]
{
self.finder.needle().len()
}
}
fn is_fast(&self) -> bool {
#[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
{
unreachable!()
}
#[cfg(all(feature = "std", feature = "perf-literal-substring"))]
{
true
}
}
}
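// A minimal sketch (not part of the upstream crate) of the Memmem prefilter:
// unlike the single-byte memchr prefilters, the reported span covers the
// entire needle, so its end is `start + needle.len()`.
#[cfg(all(test, feature = "std", feature = "perf-literal-substring"))]
mod memmem_sketch {
    use super::Memmem;
    use crate::util::{
        prefilter::PrefilterI,
        search::{MatchKind, Span},
    };

    #[test]
    fn reports_the_full_needle_span() {
        let pre = Memmem::new(MatchKind::LeftmostFirst, &["bruce"]).unwrap();
        let hay = b"say bruce";
        let span = Span { start: 0, end: hay.len() };
        assert_eq!(Some(Span { start: 4, end: 9 }), pre.find(hay, span));
    }
}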


@@ -0,0 +1,719 @@
/*!
Defines a prefilter for accelerating regex searches.

A prefilter can be created by building a [`Prefilter`] value.

A prefilter represents one of the most important optimizations available for
accelerating regex searches. The idea of a prefilter is to very quickly find
candidate locations in a haystack where a regex _could_ match. Once a candidate
is found, it is then intended for the regex engine to run at that position to
determine whether the candidate is a match or a false positive.

Within the aforementioned description of the prefilter optimization also lies
its demise. Namely, if a prefilter has a high false positive rate and it
produces lots of candidates, then a prefilter can overall make a regex search
slower. It can run more slowly because more time is spent ping-ponging between
the prefilter search and the regex engine attempting to confirm each candidate
as a match. This ping-ponging has overhead that adds up, and is exacerbated by
a high false positive rate.

Nevertheless, the optimization is still generally worth performing in most
cases, particularly given just how much throughput can be improved. (It is not
uncommon for prefilter optimizations to improve throughput by one or two orders
of magnitude.)

Typically a prefilter is used to find occurrences of literal prefixes from a
regex pattern, but this isn't required. A prefilter can be used to look for
suffixes or even inner literals.

Note that as of now, prefilters throw away information about which pattern
each literal comes from. In other words, when a prefilter finds a match,
there's no way to know which pattern (or patterns) it came from. Therefore,
in order to confirm a match, you'll have to check all of the patterns by
running the full regex engine.
*/
mod aho_corasick;
mod byteset;
mod memchr;
mod memmem;
mod teddy;
use core::{
borrow::Borrow,
fmt::Debug,
panic::{RefUnwindSafe, UnwindSafe},
};
#[cfg(feature = "alloc")]
use alloc::sync::Arc;
#[cfg(feature = "syntax")]
use regex_syntax::hir::{literal, Hir};
use crate::util::search::{MatchKind, Span};
pub(crate) use crate::util::prefilter::{
aho_corasick::AhoCorasick,
byteset::ByteSet,
memchr::{Memchr, Memchr2, Memchr3},
memmem::Memmem,
teddy::Teddy,
};
/// A prefilter for accelerating regex searches.
///
/// If you already have your literals that you want to search with,
/// then the vanilla [`Prefilter::new`] constructor is for you. But
/// if you have an [`Hir`] value from the `regex-syntax` crate, then
/// [`Prefilter::from_hir_prefix`] might be more convenient. Namely, it uses
/// the [`regex-syntax::hir::literal`](regex_syntax::hir::literal) module to
/// extract literal prefixes for you, optimize them and then select and build a
/// prefilter matcher.
///
/// A prefilter must have **zero false negatives**. However, by its very
/// nature, it may produce false positives. That is, a prefilter will never
/// skip over a position in the haystack that corresponds to a match of the
/// original regex pattern, but it *may* produce a match for a position
/// in the haystack that does *not* correspond to a match of the original
/// regex pattern. If you use either the [`Prefilter::from_hir_prefix`] or
/// [`Prefilter::from_hirs_prefix`] constructors, then this guarantee is
/// upheld for you automatically. This guarantee is not preserved if you use
/// [`Prefilter::new`] though, since it is up to the caller to provide correct
/// literal strings with respect to the original regex pattern.
///
/// # Cloning
///
/// It is an API guarantee that cloning a prefilter is cheap. That is, cloning
/// it will not duplicate whatever heap memory is used to represent the
/// underlying matcher.
///
/// # Example
///
/// This example shows how to attach a `Prefilter` to the
/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) in order to accelerate
/// searches.
///
/// ```
/// use regex_automata::{
/// nfa::thompson::pikevm::PikeVM,
/// util::prefilter::Prefilter,
/// Match, MatchKind,
/// };
///
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Bruce "])
/// .expect("a prefilter");
/// let re = PikeVM::builder()
/// .configure(PikeVM::config().prefilter(Some(pre)))
/// .build(r"Bruce \w+")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
/// Some(Match::must(0, 6..23)),
/// re.find(&mut cache, "Hello Bruce Springsteen!"),
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// But note that if you get your prefilter incorrect, it could lead to an
/// incorrect result!
///
/// ```
/// use regex_automata::{
/// nfa::thompson::pikevm::PikeVM,
/// util::prefilter::Prefilter,
/// Match, MatchKind,
/// };
///
/// // This prefilter is wrong!
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Patti "])
/// .expect("a prefilter");
/// let re = PikeVM::builder()
/// .configure(PikeVM::config().prefilter(Some(pre)))
/// .build(r"Bruce \w+")?;
/// let mut cache = re.create_cache();
/// // We find no match even though the regex does match.
/// assert_eq!(
/// None,
/// re.find(&mut cache, "Hello Bruce Springsteen!"),
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct Prefilter {
#[cfg(not(feature = "alloc"))]
_unused: (),
#[cfg(feature = "alloc")]
pre: Arc<dyn PrefilterI>,
#[cfg(feature = "alloc")]
is_fast: bool,
#[cfg(feature = "alloc")]
max_needle_len: usize,
}
impl Prefilter {
/// Create a new prefilter from a sequence of needles and a corresponding
/// match semantics.
///
/// This may return `None` for a variety of reasons, for example, if
/// a suitable prefilter could not be constructed. That might occur
/// if they are unavailable (e.g., the `perf-literal-substring` and
/// `perf-literal-multisubstring` features aren't enabled), or it might
/// occur because of heuristics or other artifacts of how the prefilter
/// works.
///
/// Note that if you have an [`Hir`] expression, it may be more convenient
/// to use [`Prefilter::from_hir_prefix`]. It will automatically handle the
/// task of extracting prefix literals for you.
///
/// # Example
///
/// This example shows how match semantics can impact the matching
/// algorithm used by the prefilter. For this reason, it is important to
/// ensure that the match semantics given here are consistent with the
/// match semantics intended for the regular expression that the literals
/// were extracted from.
///
/// ```
/// use regex_automata::{
/// util::{prefilter::Prefilter, syntax},
/// MatchKind, Span,
/// };
///
/// let hay = "Hello samwise";
///
/// // With leftmost-first, we find 'samwise' here because it comes
/// // before 'sam' in the sequence we give it.
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["samwise", "sam"])
/// .expect("a prefilter");
/// assert_eq!(
/// Some(Span::from(6..13)),
/// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
/// );
/// // Still with leftmost-first but with the literals reversed, now 'sam'
/// // will match instead!
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["sam", "samwise"])
/// .expect("a prefilter");
/// assert_eq!(
/// Some(Span::from(6..9)),
/// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn new<B: AsRef<[u8]>>(
kind: MatchKind,
needles: &[B],
) -> Option<Prefilter> {
Choice::new(kind, needles).and_then(|choice| {
let max_needle_len =
needles.iter().map(|b| b.as_ref().len()).max().unwrap_or(0);
Prefilter::from_choice(choice, max_needle_len)
})
}
/// This turns a prefilter selection into a `Prefilter`. That is, it turns
/// the enum given into a trait object.
fn from_choice(
choice: Choice,
max_needle_len: usize,
) -> Option<Prefilter> {
#[cfg(not(feature = "alloc"))]
{
None
}
#[cfg(feature = "alloc")]
{
let pre: Arc<dyn PrefilterI> = match choice {
Choice::Memchr(p) => Arc::new(p),
Choice::Memchr2(p) => Arc::new(p),
Choice::Memchr3(p) => Arc::new(p),
Choice::Memmem(p) => Arc::new(p),
Choice::Teddy(p) => Arc::new(p),
Choice::ByteSet(p) => Arc::new(p),
Choice::AhoCorasick(p) => Arc::new(p),
};
let is_fast = pre.is_fast();
Some(Prefilter { pre, is_fast, max_needle_len })
}
}
/// This attempts to extract prefixes from the given `Hir` expression for
/// the given match semantics, and if possible, builds a prefilter for
/// them.
///
/// # Example
///
/// This example shows how to build a prefilter directly from an [`Hir`]
/// expression, and use it to find an occurrence of a prefix from the regex
/// pattern.
///
/// ```
/// use regex_automata::{
/// util::{prefilter::Prefilter, syntax},
/// MatchKind, Span,
/// };
///
/// let hir = syntax::parse(r"(Bruce|Patti) \w+")?;
/// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
/// .expect("a prefilter");
/// let hay = "Hello Patti Scialfa!";
/// assert_eq!(
/// Some(Span::from(6..12)),
/// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "syntax")]
pub fn from_hir_prefix(kind: MatchKind, hir: &Hir) -> Option<Prefilter> {
Prefilter::from_hirs_prefix(kind, &[hir])
}
/// This attempts to extract prefixes from the given `Hir` expressions for
/// the given match semantics, and if possible, builds a prefilter for
/// them.
///
/// Note that as of now, prefilters throw away information about which
/// pattern each literal comes from. In other words, when a prefilter finds
/// a match, there's no way to know which pattern (or patterns) it came
/// from. Therefore, in order to confirm a match, you'll have to check all
/// of the patterns by running the full regex engine.
///
/// # Example
///
/// This example shows how to build a prefilter directly from multiple
/// `Hir` expressions, and use it to find an occurrence of a
/// prefix from the regex patterns.
///
/// ```
/// use regex_automata::{
/// util::{prefilter::Prefilter, syntax},
/// MatchKind, Span,
/// };
///
/// let hirs = syntax::parse_many(&[
/// r"(Bruce|Patti) \w+",
/// r"Mrs?\. Doubtfire",
/// ])?;
/// let pre = Prefilter::from_hirs_prefix(MatchKind::LeftmostFirst, &hirs)
/// .expect("a prefilter");
/// let hay = "Hello Mrs. Doubtfire";
/// assert_eq!(
/// Some(Span::from(6..20)),
/// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "syntax")]
pub fn from_hirs_prefix<H: Borrow<Hir>>(
kind: MatchKind,
hirs: &[H],
) -> Option<Prefilter> {
prefixes(kind, hirs)
.literals()
.and_then(|lits| Prefilter::new(kind, lits))
}
/// Run this prefilter on `haystack[span.start..span.end]` and return a matching
/// span if one exists.
///
/// The span returned is guaranteed to have a start position greater than
/// or equal to the one given, and an end position less than or equal to
/// the one given.
///
/// # Example
///
/// This example shows how to build a prefilter directly from an [`Hir`]
/// expression, and use it to find an occurrence of a prefix from the regex
/// pattern.
///
/// ```
/// use regex_automata::{
/// util::{prefilter::Prefilter, syntax},
/// MatchKind, Span,
/// };
///
/// let hir = syntax::parse(r"Bruce \w+")?;
/// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
/// .expect("a prefilter");
/// let hay = "Hello Bruce Springsteen!";
/// assert_eq!(
/// Some(Span::from(6..12)),
/// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "alloc"))]
{
unreachable!()
}
#[cfg(feature = "alloc")]
{
self.pre.find(haystack, span)
}
}
/// Returns the span of a prefix of `haystack[span.start..span.end]` if
/// the prefilter matches.
///
/// The span returned is guaranteed to have a start position equivalent to
/// the one given, and an end position less than or equal to the one given.
///
/// # Example
///
/// This example shows how to build a prefilter directly from an [`Hir`]
/// expression, and use it to find an occurrence of a prefix from the regex
/// pattern that begins at the start of a haystack only.
///
/// ```
/// use regex_automata::{
/// util::{prefilter::Prefilter, syntax},
/// MatchKind, Span,
/// };
///
/// let hir = syntax::parse(r"Bruce \w+")?;
/// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
/// .expect("a prefilter");
/// let hay = "Hello Bruce Springsteen!";
/// // Nothing is found here because 'Bruce' does
/// // not occur at the beginning of our search.
/// assert_eq!(
/// None,
/// pre.prefix(hay.as_bytes(), Span::from(0..hay.len())),
/// );
/// // But if we change where we start the search
/// // to begin where 'Bruce ' begins, then a
/// // match will be found.
/// assert_eq!(
/// Some(Span::from(6..12)),
/// pre.prefix(hay.as_bytes(), Span::from(6..hay.len())),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "alloc"))]
{
unreachable!()
}
#[cfg(feature = "alloc")]
{
self.pre.prefix(haystack, span)
}
}
/// Returns the heap memory, in bytes, used by the underlying prefilter.
#[inline]
pub fn memory_usage(&self) -> usize {
#[cfg(not(feature = "alloc"))]
{
unreachable!()
}
#[cfg(feature = "alloc")]
{
self.pre.memory_usage()
}
}
/// Return the length of the longest needle
/// in this Prefilter
#[inline]
pub fn max_needle_len(&self) -> usize {
#[cfg(not(feature = "alloc"))]
{
unreachable!()
}
#[cfg(feature = "alloc")]
{
self.max_needle_len
}
}
/// Implementations might return true here if they believe themselves to
/// be "fast." The concept of "fast" is deliberately left vague, but in
/// practice this usually corresponds to whether it's believed that SIMD
/// will be used.
///
/// Why do we care about this? Well, some prefilter tricks tend to come
/// with their own bits of overhead, and so might only make sense if we
/// know that a scan will be *much* faster than the regex engine itself.
/// Otherwise, the trick may not be worth doing. Whether something is
/// "much" faster than the regex engine generally boils down to whether
/// SIMD is used. (But not always. Even a SIMD matcher with a high false
/// positive rate can become quite slow.)
///
/// Even if this returns true, it is still possible for the prefilter to
/// be "slow." Remember, prefilters are just heuristics. We can't really
/// *know* a prefilter will be fast without actually trying the prefilter.
/// (Which of course we cannot afford to do.)
#[inline]
pub fn is_fast(&self) -> bool {
#[cfg(not(feature = "alloc"))]
{
unreachable!()
}
#[cfg(feature = "alloc")]
{
self.is_fast
}
}
}
/// A trait for abstracting over prefilters. Basically, a prefilter is
/// something that can do an unanchored *and* an anchored search in a haystack
/// within a given span.
///
/// This exists pretty much only so that we can use prefilters as a trait
/// object (which is what `Prefilter` is). If we ever move off of trait objects
/// and to an enum, then it's likely this trait could be removed.
pub(crate) trait PrefilterI:
Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static
{
/// Run this prefilter on `haystack[span.start..span.end]` and return a matching
/// span if one exists.
///
/// The span returned is guaranteed to have a start position greater than
/// or equal to the one given, and an end position less than or equal to
/// the one given.
fn find(&self, haystack: &[u8], span: Span) -> Option<Span>;
/// Returns the span of a prefix of `haystack[span.start..span.end]` if
/// the prefilter matches.
///
/// The span returned is guaranteed to have a start position equivalent to
/// the one given, and an end position less than or equal to the one given.
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span>;
/// Returns the heap memory, in bytes, used by the underlying prefilter.
fn memory_usage(&self) -> usize;
/// Implementations might return true here if they believe themselves to
/// be "fast." See [`Prefilter::is_fast`] for more details.
fn is_fast(&self) -> bool;
}
#[cfg(feature = "alloc")]
impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
(**self).find(haystack, span)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
(**self).prefix(haystack, span)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn memory_usage(&self) -> usize {
(**self).memory_usage()
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_fast(&self) -> bool {
(&**self).is_fast()
}
}
/// A type that encapsulates the selection of a prefilter algorithm from a
/// sequence of needles.
///
/// The existence of this type is a little tricky, because we don't (currently)
/// use it for performing a search. Instead, we really only consume it by
/// converting the underlying prefilter into a trait object, whether that be
/// `dyn PrefilterI` or `dyn Strategy` (for the meta regex engine). In order
/// to avoid re-copying the prefilter selection logic, we isolate it here, and
/// then force anything downstream that wants to convert it to a trait object
/// to do trivial case analysis on it.
///
/// One wonders whether we *should* use an enum instead of a trait object.
/// At time of writing, I chose trait objects based on instinct because 1) I
/// knew I wasn't going to inline anything and 2) there would potentially be
/// many different choices. However, as of time of writing, I haven't actually
/// compared the trait object approach to the enum approach. That probably
/// should be litigated, but I ran out of steam.
///
/// Note that if the `alloc` feature is disabled, then values of this type
/// are (and should) never be constructed. Also, in practice, for any of the
/// prefilters to be selected, you'll need at least one of the `perf-literal-*`
/// features enabled.
#[derive(Clone, Debug)]
pub(crate) enum Choice {
Memchr(Memchr),
Memchr2(Memchr2),
Memchr3(Memchr3),
Memmem(Memmem),
Teddy(Teddy),
ByteSet(ByteSet),
AhoCorasick(AhoCorasick),
}
impl Choice {
/// Select what is believed to be the best prefilter algorithm for the
/// match semantics and sequence of needles given.
///
/// This selection algorithm uses the needles as given without any
/// modification. For example, if `[bar]` is given, then this doesn't
/// try to select `memchr` for `b`. Instead, it would select `memmem`
/// for `bar`. If callers would want `memchr` selected for `[bar]`, then
/// callers should massage the literals themselves. That is, callers are
/// responsible for heuristics surrounding which sequence of literals is
/// best.
///
/// What this selection algorithm does is attempt to use the fastest
/// prefilter that works for the literals given. So if `[a, b]` is given,
/// then `memchr2` is selected.
///
/// Of course, which prefilter is selected is also subject to what
/// is available. For example, if `alloc` isn't enabled, then
/// that limits which prefilters can be selected. Similarly, if
/// `perf-literal-substring` isn't enabled, then nothing from the `memchr`
/// crate can be returned.
pub(crate) fn new<B: AsRef<[u8]>>(
kind: MatchKind,
needles: &[B],
) -> Option<Choice> {
// An empty set means the regex matches nothing, so no sense in
// building a prefilter.
if needles.len() == 0 {
debug!("prefilter building failed: found empty set of literals");
return None;
}
// If the regex can match the empty string, then the prefilter
// will by definition match at every position. This is obviously
// completely ineffective.
if needles.iter().any(|n| n.as_ref().is_empty()) {
debug!("prefilter building failed: literals match empty string");
return None;
}
// BREADCRUMBS: Perhaps the literal optimizer should special case
// sequences of length two or three if the leading bytes of each are
// "rare"? Or perhaps, if there are two or three total possible leading
// bytes, regardless of the number of literals, and all are rare...
// Then well, perhaps we should use memchr2 or memchr3 in those cases?
if let Some(pre) = Memchr::new(kind, needles) {
debug!("prefilter built: memchr");
return Some(Choice::Memchr(pre));
}
if let Some(pre) = Memchr2::new(kind, needles) {
debug!("prefilter built: memchr2");
return Some(Choice::Memchr2(pre));
}
if let Some(pre) = Memchr3::new(kind, needles) {
debug!("prefilter built: memchr3");
return Some(Choice::Memchr3(pre));
}
if let Some(pre) = Memmem::new(kind, needles) {
debug!("prefilter built: memmem");
return Some(Choice::Memmem(pre));
}
if let Some(pre) = Teddy::new(kind, needles) {
debug!("prefilter built: teddy");
return Some(Choice::Teddy(pre));
}
if let Some(pre) = ByteSet::new(kind, needles) {
debug!("prefilter built: byteset");
return Some(Choice::ByteSet(pre));
}
if let Some(pre) = AhoCorasick::new(kind, needles) {
debug!("prefilter built: aho-corasick");
return Some(Choice::AhoCorasick(pre));
}
debug!("prefilter building failed: no strategy could be found");
None
}
}
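// A minimal sketch (not part of the upstream crate) of the selection order
// documented on `Choice::new`: two single-byte needles select memchr2, while
// a single multi-byte needle selects memmem. This assumes the `std` and
// `perf-literal-substring` features are enabled; without them these
// prefilters are unavailable and the selection would fall through to
// something else (or to nothing).
#[cfg(all(test, feature = "std", feature = "perf-literal-substring"))]
mod choice_sketch {
    use super::Choice;
    use crate::util::search::MatchKind;

    #[test]
    fn selection_follows_the_needles() {
        match Choice::new(MatchKind::LeftmostFirst, &["a", "b"]) {
            Some(Choice::Memchr2(_)) => {}
            got => unreachable!("expected memchr2, got {:?}", got),
        }
        match Choice::new(MatchKind::LeftmostFirst, &["bar"]) {
            Some(Choice::Memmem(_)) => {}
            got => unreachable!("expected memmem, got {:?}", got),
        }
    }
}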
/// Extracts all of the prefix literals from the given HIR expressions into a
/// single `Seq`. The literals in the sequence are ordered with respect to the
/// order of the given HIR expressions and consistent with the match semantics
/// given.
///
/// The sequence returned is "optimized." That is, they may be shrunk or even
/// truncated according to heuristics with the intent of making them more
/// useful as a prefilter. (Which translates to both using faster algorithms
/// and minimizing the false positive rate.)
///
/// Note that this erases any connection between the literals and which pattern
/// (or patterns) they came from.
///
/// The match kind given must correspond to the match semantics of the regex
/// that is represented by the HIRs given. The match semantics may change the
/// literal sequence returned.
#[cfg(feature = "syntax")]
pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq
where
H: core::borrow::Borrow<Hir>,
{
let mut extractor = literal::Extractor::new();
extractor.kind(literal::ExtractKind::Prefix);
let mut prefixes = literal::Seq::empty();
for hir in hirs {
prefixes.union(&mut extractor.extract(hir.borrow()));
}
debug!(
"prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
prefixes.len(),
prefixes.is_exact(),
prefixes
);
match kind {
MatchKind::All => {
prefixes.sort();
prefixes.dedup();
}
MatchKind::LeftmostFirst => {
prefixes.optimize_for_prefix_by_preference();
}
}
debug!(
"prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
prefixes.len(),
prefixes.is_exact(),
prefixes
);
prefixes
}
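// A rough sketch (not part of the upstream crate) of what prefix extraction
// yields. For a pattern like `(Bruce|Patti) \w+`, the optimized sequence
// contains the inexact prefixes "Bruce " and "Patti ", which is what
// `Prefilter::from_hir_prefix` then hands to `Prefilter::new`. The assertion
// below only checks that a finite, non-empty sequence was extracted, since
// the exact optimization applied is a heuristic that may change.
#[cfg(all(test, feature = "syntax"))]
mod prefixes_sketch {
    use super::prefixes;
    use crate::util::search::MatchKind;

    #[test]
    fn extracts_alternation_prefixes() {
        let hir = regex_syntax::parse(r"(Bruce|Patti) \w+").unwrap();
        let seq = prefixes(MatchKind::LeftmostFirst, &[&hir]);
        assert!(seq.len().map_or(false, |len| len > 0));
    }
}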
/// Like `prefixes`, but for all suffixes of all matches for the given HIRs.
#[cfg(feature = "syntax")]
pub(crate) fn suffixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq
where
H: core::borrow::Borrow<Hir>,
{
let mut extractor = literal::Extractor::new();
extractor.kind(literal::ExtractKind::Suffix);
let mut suffixes = literal::Seq::empty();
for hir in hirs {
suffixes.union(&mut extractor.extract(hir.borrow()));
}
debug!(
"suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
suffixes.len(),
suffixes.is_exact(),
suffixes
);
match kind {
MatchKind::All => {
suffixes.sort();
suffixes.dedup();
}
MatchKind::LeftmostFirst => {
suffixes.optimize_for_suffix_by_preference();
}
}
debug!(
"suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
suffixes.len(),
suffixes.is_exact(),
suffixes
);
suffixes
}


@@ -0,0 +1,160 @@
use crate::util::{
prefilter::PrefilterI,
search::{MatchKind, Span},
};
#[derive(Clone, Debug)]
pub(crate) struct Teddy {
#[cfg(not(feature = "perf-literal-multisubstring"))]
_unused: (),
/// The actual Teddy searcher.
///
/// Technically, it's possible that Teddy doesn't actually get used, since
/// Teddy does require its haystack to at least be of a certain size
/// (usually around the size of whatever vector is being used, so ~16
/// or ~32 bytes). For haystacks shorter than that, the implementation
/// currently uses Rabin-Karp.
#[cfg(feature = "perf-literal-multisubstring")]
searcher: aho_corasick::packed::Searcher,
/// When running an anchored search, the packed searcher can't handle it so
/// we defer to Aho-Corasick itself. Kind of sad, but changing the packed
/// searchers to support anchored search would be difficult at worst and
/// annoying at best. Since packed searchers only apply to small numbers of
/// literals, we content ourselves that this is not much of an added cost.
/// (That packed searchers only work with a small number of literals is
/// also why we use a DFA here. Otherwise, the memory usage of a DFA would
/// likely be unacceptable.)
#[cfg(feature = "perf-literal-multisubstring")]
anchored_ac: aho_corasick::dfa::DFA,
/// The length of the smallest literal we look for.
///
/// We use this as a heuristic to figure out whether this will be "fast" or
/// not. Generally, the longer the better, because longer needles are more
/// discriminating and thus reduce false positive rate.
#[cfg(feature = "perf-literal-multisubstring")]
minimum_len: usize,
}
impl Teddy {
pub(crate) fn new<B: AsRef<[u8]>>(
kind: MatchKind,
needles: &[B],
) -> Option<Teddy> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
None
}
#[cfg(feature = "perf-literal-multisubstring")]
{
// We only really support leftmost-first semantics. In
// theory we could at least support leftmost-longest, as the
// aho-corasick crate does, but regex-automata doesn't know about
// leftmost-longest currently.
//
// And like the aho-corasick prefilter, if we're using `All`
// semantics, then we can still use leftmost semantics for a
// prefilter. (This might be a suspicious choice for the literal
// engine, which uses a prefilter as a regex engine directly, but
// that only happens when using leftmost-first semantics.)
let (packed_match_kind, ac_match_kind) = match kind {
MatchKind::LeftmostFirst | MatchKind::All => (
aho_corasick::packed::MatchKind::LeftmostFirst,
aho_corasick::MatchKind::LeftmostFirst,
),
};
let minimum_len =
needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0);
let packed = aho_corasick::packed::Config::new()
.match_kind(packed_match_kind)
.builder()
.extend(needles)
.build()?;
let anchored_ac = aho_corasick::dfa::DFA::builder()
.match_kind(ac_match_kind)
.start_kind(aho_corasick::StartKind::Anchored)
.prefilter(false)
.build(needles)
.ok()?;
Some(Teddy { searcher: packed, anchored_ac, minimum_len })
}
}
}
impl PrefilterI for Teddy {
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
let ac_span =
aho_corasick::Span { start: span.start, end: span.end };
self.searcher
.find_in(haystack, ac_span)
.map(|m| Span { start: m.start(), end: m.end() })
}
}
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
use aho_corasick::automaton::Automaton;
let input = aho_corasick::Input::new(haystack)
.anchored(aho_corasick::Anchored::Yes)
.span(span.start..span.end);
self.anchored_ac
.try_find(&input)
// OK because we build the DFA with anchored support.
.expect("aho-corasick DFA should never fail")
.map(|m| Span { start: m.start(), end: m.end() })
}
}
fn memory_usage(&self) -> usize {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
use aho_corasick::automaton::Automaton;
self.searcher.memory_usage() + self.anchored_ac.memory_usage()
}
}
fn is_fast(&self) -> bool {
#[cfg(not(feature = "perf-literal-multisubstring"))]
{
unreachable!()
}
#[cfg(feature = "perf-literal-multisubstring")]
{
// Teddy is usually quite fast, but I have seen some cases where
// a large number of literals can overwhelm it and make it not so
// fast. We make an educated but conservative guess at a limit, at
// which point, we're not so comfortable thinking Teddy is "fast."
//
// Well... this used to incorporate a "limit" on the *number*
// of literals, but I have since changed it to a minimum on the
// *smallest* literal. Namely, when there is a very small literal
// (1 or 2 bytes), it is far more likely that it leads to a higher
// false positive rate. (Although, of course, not always. For
// example, 'zq' is likely to have a very low false positive rate.)
// But when we have 3 bytes, we have a really good chance of being
// quite discriminatory and thus fast.
//
// We may still want to add some kind of limit on the number of
// literals here, but keep in mind that Teddy already has its own
// somewhat small limit (64 at time of writing). The main issue
// here is that if 'is_fast' is false, it opens the door for the
// reverse inner optimization to kick in. We really only want to
// resort to the reverse inner optimization if we absolutely must.
self.minimum_len >= 3
}
}
}
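// A minimal sketch (not part of the upstream crate) of the heuristic above:
// Teddy only advertises itself as "fast" when its shortest needle is at
// least 3 bytes long. Construction may fail on targets without the SIMD
// support the packed searcher needs, so nothing is asserted in that case.
#[cfg(all(test, feature = "perf-literal-multisubstring"))]
mod teddy_sketch {
    use super::Teddy;
    use crate::util::{prefilter::PrefilterI, search::MatchKind};

    #[test]
    fn fast_only_with_longer_needles() {
        if let Some(t) = Teddy::new(MatchKind::LeftmostFirst, &["ab", "cd"]) {
            assert!(!t.is_fast());
        }
        if let Some(t) = Teddy::new(MatchKind::LeftmostFirst, &["abc", "def"]) {
            assert!(t.is_fast());
        }
    }
}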


@@ -0,0 +1,776 @@
/*!
Lower level primitive types that are useful in a variety of circumstances.

# Overview

This list represents the principal types in this module and briefly describes
when you might want to use them.

* [`PatternID`] - A type that represents the identifier of a regex pattern.
This is probably the most widely used type in this module (which is why it's
also re-exported in the crate root).
* [`StateID`] - A type that represents the identifier of a finite automaton
state. This is used for both NFAs and DFAs, with the notable exception of
the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state
identifier.)
* [`SmallIndex`] - The internal representation of both a `PatternID` and a
`StateID`. Its purpose is to serve as a type that can index memory without
being as big as a `usize` on 64-bit targets. The main idea behind this type
is that there are many things in regex engines that will, in practice, never
overflow a 32-bit integer. (For example, the number of patterns in a regex
or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index
memory without peppering `as` casts everywhere. Moreover, it forces callers
to handle errors in the case where, somehow, the value would otherwise overflow
either a 32-bit integer or a `usize` (e.g., on 16-bit targets).
* [`NonMaxUsize`] - Represents a `usize` that cannot be `usize::MAX`. As a
result, `Option<NonMaxUsize>` has the same size in memory as a `usize`. This
is useful, for example, when representing the offsets of submatches since it
reduces memory usage by a factor of 2. It is a legal optimization since Rust
guarantees that slices never have a length that exceeds `isize::MAX`.
*/
use core::num::NonZeroUsize;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use crate::util::int::{Usize, U16, U32, U64};
/// A `usize` that can never be `usize::MAX`.
///
/// This is similar to `core::num::NonZeroUsize`, but instead of not permitting
/// a zero value, this does not permit a max value.
///
/// This is useful in certain contexts where one wants to optimize the memory
/// usage of things that contain match offsets. Namely, since Rust slices
/// are guaranteed to never have a length exceeding `isize::MAX`, we can use
/// `usize::MAX` as a sentinel to indicate that no match was found. Indeed,
/// types like `Option<NonMaxUsize>` have exactly the same size in memory as a
/// `usize`.
///
/// This type is defined to be `repr(transparent)` for
/// `core::num::NonZeroUsize`, which is in turn defined to be
/// `repr(transparent)` for `usize`.
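///
/// # Example
///
/// A brief sketch of the properties described above:
///
/// ```
/// use regex_automata::util::primitives::NonMaxUsize;
///
/// assert!(NonMaxUsize::new(usize::MAX).is_none());
/// assert_eq!(Some(5), NonMaxUsize::new(5).map(|n| n.get()));
/// // The niche is what keeps `Option<NonMaxUsize>` the same size as `usize`.
/// assert_eq!(
///     core::mem::size_of::<usize>(),
///     core::mem::size_of::<Option<NonMaxUsize>>(),
/// );
/// ```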
#[derive(Clone, Copy, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct NonMaxUsize(NonZeroUsize);
impl NonMaxUsize {
/// Create a new `NonMaxUsize` from the given value.
///
/// This returns `None` only when the given value is equal to `usize::MAX`.
#[inline]
pub fn new(value: usize) -> Option<NonMaxUsize> {
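        // `usize::MAX` wraps around to 0, which `NonZeroUsize::new` rejects,
        // so `usize::MAX` is the only input mapped to `None`. `get` undoes
        // the shift with a matching `wrapping_sub(1)`.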
NonZeroUsize::new(value.wrapping_add(1)).map(NonMaxUsize)
}
/// Return the underlying `usize` value. The returned value is guaranteed
/// to not equal `usize::MAX`.
#[inline]
pub fn get(self) -> usize {
self.0.get().wrapping_sub(1)
}
}
// We provide our own Debug impl because seeing the internal repr can be quite
// surprising if you aren't expecting it. e.g., 'NonMaxUsize(5)' vs just '5'.
impl core::fmt::Debug for NonMaxUsize {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(f, "{:?}", self.get())
}
}
/// A type that represents a "small" index.
///
/// The main idea of this type is to provide something that can index memory,
/// but uses less memory than `usize` on 64-bit systems. Specifically, its
/// representation is always a `u32` and has `repr(transparent)` enabled. (So
/// it is safe to transmute between a `u32` and a `SmallIndex`.)
///
/// A small index is typically useful in cases where there is no practical way
/// that the index will overflow a 32-bit integer. A good example of this is
/// an NFA state. If you could somehow build an NFA with `2^30` states, its
/// memory usage would be exorbitant and its runtime execution would be so
/// slow as to be completely worthless. Therefore, this crate generally deems
/// it acceptable to return an error if it would otherwise build an NFA that
/// requires a slice longer than what a 32-bit integer can index. In exchange,
/// we can use 32-bit indices instead of 64-bit indices in various places.
///
/// This type ensures this by providing a constructor that will return an error
/// if its argument cannot fit into the type. This makes it much easier to
/// handle these sorts of boundary cases that are otherwise extremely subtle.
///
/// On all targets, this type guarantees that its value will fit in a `u32`,
/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for
/// example, this type's maximum value will never overflow an `isize`,
/// which means it will never overflow an `i16` even though its internal
/// representation is still a `u32`.
///
/// The purpose for making the type fit into even signed integer types like
/// `isize` is to guarantee that the difference between any two small indices
/// is itself also a small index. This is useful in certain contexts, e.g.,
/// for delta encoding.
///
/// # Other types
///
/// The following types wrap `SmallIndex` to provide a more focused use case:
///
/// * [`PatternID`] is for representing the identifiers of patterns.
/// * [`StateID`] is for representing the identifiers of states in finite
/// automata. It is used for both NFAs and DFAs.
///
/// # Representation
///
/// This type is always represented internally by a `u32` and is marked as
/// `repr(transparent)`. Thus, this type always has the same representation as
/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`.
///
/// # Indexing
///
/// For convenience, callers may use a `SmallIndex` to index slices.
///
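/// For example, a brief sketch using the `Index` impls defined in this
/// module:
///
/// ```
/// use regex_automata::util::primitives::SmallIndex;
///
/// let slice = &[10u32, 20, 30][..];
/// let idx = SmallIndex::must(1);
/// assert_eq!(20, slice[idx]);
/// ```
///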
/// # Safety
///
/// While a `SmallIndex` is meant to guarantee that its value fits into `usize`
/// without using as much space as a `usize` on all targets, callers must
/// not rely on this property for safety. Callers may choose to rely on this
/// property for correctness however. For example, creating a `SmallIndex` with
/// an invalid value can be done in entirely safe code. This may in turn result
/// in panics or silent logical errors.
#[derive(
Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
#[repr(transparent)]
pub struct SmallIndex(u32);
impl SmallIndex {
/// The maximum index value.
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
pub const MAX: SmallIndex =
// FIXME: Use as_usize() once const functions in traits are stable.
SmallIndex::new_unchecked(core::i32::MAX as usize - 1);
/// The maximum index value.
#[cfg(target_pointer_width = "16")]
pub const MAX: SmallIndex =
SmallIndex::new_unchecked(core::isize::MAX as usize - 1);
/// The total number of values that can be represented as a small index.
pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1;
/// The zero index value.
pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0);
/// The number of bytes that a single small index uses in memory.
pub const SIZE: usize = core::mem::size_of::<SmallIndex>();
/// Create a new small index.
///
/// If the given index exceeds [`SmallIndex::MAX`], then this returns
/// an error.
#[inline]
pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> {
SmallIndex::try_from(index)
}
/// Create a new small index without checking whether the given value
/// exceeds [`SmallIndex::MAX`].
///
/// Using this routine with an invalid index value will result in
/// unspecified behavior, but *not* undefined behavior. In particular, an
/// invalid index value is likely to cause panics or possibly even silent
/// logical errors.
///
/// Callers must never rely on a `SmallIndex` to be within a certain range
/// for memory safety.
#[inline]
pub const fn new_unchecked(index: usize) -> SmallIndex {
// FIXME: Use as_u32() once const functions in traits are stable.
SmallIndex(index as u32)
}
/// Like [`SmallIndex::new`], but panics if the given index is not valid.
#[inline]
pub fn must(index: usize) -> SmallIndex {
SmallIndex::new(index).expect("invalid small index")
}
/// Return this small index as a `usize`. This is guaranteed to never
/// overflow `usize`.
#[inline]
pub const fn as_usize(&self) -> usize {
// FIXME: Use as_usize() once const functions in traits are stable.
self.0 as usize
}
/// Return this small index as a `u64`. This is guaranteed to never
/// overflow.
#[inline]
pub const fn as_u64(&self) -> u64 {
// FIXME: Use u64::from() once const functions in traits are stable.
self.0 as u64
}
/// Return the internal `u32` of this small index. This is guaranteed to
/// never overflow `u32`.
#[inline]
pub const fn as_u32(&self) -> u32 {
self.0
}
/// Return the internal `u32` of this small index represented as an `i32`.
/// This is guaranteed to never overflow an `i32`.
#[inline]
pub const fn as_i32(&self) -> i32 {
// This is OK because we guarantee that our max value is <= i32::MAX.
self.0 as i32
}
/// Returns one more than this small index as a usize.
///
/// Since a small index has constraints on its maximum value, adding `1` to
/// it will always fit in a `usize`, `u32` and a `i32`.
#[inline]
pub fn one_more(&self) -> usize {
self.as_usize() + 1
}
/// Decode this small index from the bytes given using the native endian
/// byte order for the current target.
///
/// If the decoded integer is not representable as a small index for the
/// current target, then this returns an error.
#[inline]
pub fn from_ne_bytes(
bytes: [u8; 4],
) -> Result<SmallIndex, SmallIndexError> {
let id = u32::from_ne_bytes(bytes);
if id > SmallIndex::MAX.as_u32() {
return Err(SmallIndexError { attempted: u64::from(id) });
}
Ok(SmallIndex::new_unchecked(id.as_usize()))
}
/// Decode this small index from the bytes given using the native endian
/// byte order for the current target.
///
/// This is analogous to [`SmallIndex::new_unchecked`] in that it does not
/// check whether the decoded integer is representable as a small index.
#[inline]
pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex {
SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize())
}
/// Return the underlying small index integer as raw bytes in native endian
/// format.
#[inline]
pub fn to_ne_bytes(&self) -> [u8; 4] {
self.0.to_ne_bytes()
}
}
impl<T> core::ops::Index<SmallIndex> for [T] {
type Output = T;
#[inline]
fn index(&self, index: SmallIndex) -> &T {
&self[index.as_usize()]
}
}
impl<T> core::ops::IndexMut<SmallIndex> for [T] {
#[inline]
fn index_mut(&mut self, index: SmallIndex) -> &mut T {
&mut self[index.as_usize()]
}
}
#[cfg(feature = "alloc")]
impl<T> core::ops::Index<SmallIndex> for Vec<T> {
type Output = T;
#[inline]
fn index(&self, index: SmallIndex) -> &T {
&self[index.as_usize()]
}
}
#[cfg(feature = "alloc")]
impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> {
#[inline]
fn index_mut(&mut self, index: SmallIndex) -> &mut T {
&mut self[index.as_usize()]
}
}
impl From<u8> for SmallIndex {
fn from(index: u8) -> SmallIndex {
SmallIndex::new_unchecked(usize::from(index))
}
}
impl TryFrom<u16> for SmallIndex {
type Error = SmallIndexError;
fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> {
if u32::from(index) > SmallIndex::MAX.as_u32() {
return Err(SmallIndexError { attempted: u64::from(index) });
}
Ok(SmallIndex::new_unchecked(index.as_usize()))
}
}
impl TryFrom<u32> for SmallIndex {
type Error = SmallIndexError;
fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> {
if index > SmallIndex::MAX.as_u32() {
return Err(SmallIndexError { attempted: u64::from(index) });
}
Ok(SmallIndex::new_unchecked(index.as_usize()))
}
}
impl TryFrom<u64> for SmallIndex {
type Error = SmallIndexError;
fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> {
if index > SmallIndex::MAX.as_u64() {
return Err(SmallIndexError { attempted: index });
}
Ok(SmallIndex::new_unchecked(index.as_usize()))
}
}
impl TryFrom<usize> for SmallIndex {
type Error = SmallIndexError;
fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> {
if index > SmallIndex::MAX.as_usize() {
return Err(SmallIndexError { attempted: index.as_u64() });
}
Ok(SmallIndex::new_unchecked(index))
}
}
#[cfg(test)]
impl quickcheck::Arbitrary for SmallIndex {
fn arbitrary(gen: &mut quickcheck::Gen) -> SmallIndex {
use core::cmp::max;
let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs();
if id > SmallIndex::MAX.as_i32() {
SmallIndex::MAX
} else {
SmallIndex::new(usize::try_from(id).unwrap()).unwrap()
}
}
}
/// This error occurs when a small index could not be constructed.
///
/// This occurs when given an integer exceeding the maximum small index value.
///
/// When the `std` feature is enabled, this implements the `Error` trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SmallIndexError {
attempted: u64,
}
impl SmallIndexError {
/// Returns the value that could not be converted to a small index.
pub fn attempted(&self) -> u64 {
self.attempted
}
}
#[cfg(feature = "std")]
impl std::error::Error for SmallIndexError {}
impl core::fmt::Display for SmallIndexError {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(
f,
"failed to create small index from {:?}, which exceeds {:?}",
self.attempted(),
SmallIndex::MAX,
)
}
}
#[derive(Clone, Debug)]
pub(crate) struct SmallIndexIter {
rng: core::ops::Range<usize>,
}
impl Iterator for SmallIndexIter {
type Item = SmallIndex;
fn next(&mut self) -> Option<SmallIndex> {
if self.rng.start >= self.rng.end {
return None;
}
let next_id = self.rng.start + 1;
let id = core::mem::replace(&mut self.rng.start, next_id);
// new_unchecked is OK since we asserted that the number of
// elements in this iterator will fit in an ID at construction.
Some(SmallIndex::new_unchecked(id))
}
}
macro_rules! index_type_impls {
($name:ident, $err:ident, $iter:ident, $withiter:ident) => {
impl $name {
/// The maximum value.
pub const MAX: $name = $name(SmallIndex::MAX);
/// The total number of values that can be represented.
pub const LIMIT: usize = SmallIndex::LIMIT;
/// The zero value.
pub const ZERO: $name = $name(SmallIndex::ZERO);
/// The number of bytes that a single value uses in memory.
pub const SIZE: usize = SmallIndex::SIZE;
/// Create a new value that is represented by a "small index."
///
/// If the given index exceeds the maximum allowed value, then this
/// returns an error.
#[inline]
pub fn new(value: usize) -> Result<$name, $err> {
SmallIndex::new(value).map($name).map_err($err)
}
/// Create a new value without checking whether the given argument
/// exceeds the maximum.
///
/// Using this routine with an invalid value will result in
/// unspecified behavior, but *not* undefined behavior. In
/// particular, an invalid ID value is likely to cause panics or
/// possibly even silent logical errors.
///
/// Callers must never rely on this type to be within a certain
/// range for memory safety.
#[inline]
pub const fn new_unchecked(value: usize) -> $name {
$name(SmallIndex::new_unchecked(value))
}
/// Like `new`, but panics if the given value is not valid.
#[inline]
pub fn must(value: usize) -> $name {
$name::new(value).expect(concat!(
"invalid ",
stringify!($name),
" value"
))
}
/// Return the internal value as a `usize`. This is guaranteed to
/// never overflow `usize`.
#[inline]
pub const fn as_usize(&self) -> usize {
self.0.as_usize()
}
/// Return the internal value as a `u64`. This is guaranteed to
/// never overflow.
#[inline]
pub const fn as_u64(&self) -> u64 {
self.0.as_u64()
}
/// Return the internal value as a `u32`. This is guaranteed to
/// never overflow `u32`.
#[inline]
pub const fn as_u32(&self) -> u32 {
self.0.as_u32()
}
/// Return the internal value as an `i32`. This is guaranteed to
/// never overflow an `i32`.
#[inline]
pub const fn as_i32(&self) -> i32 {
self.0.as_i32()
}
/// Returns one more than this value as a usize.
///
/// Since values represented by a "small index" have constraints
/// on their maximum value, adding `1` to it will always fit in a
/// `usize`, `u32` and a `i32`.
#[inline]
pub fn one_more(&self) -> usize {
self.0.one_more()
}
/// Decode this value from the bytes given using the native endian
/// byte order for the current target.
///
/// If the decoded integer is not representable as a small index
/// for the current target, then this returns an error.
#[inline]
pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> {
SmallIndex::from_ne_bytes(bytes).map($name).map_err($err)
}
/// Decode this value from the bytes given using the native endian
/// byte order for the current target.
///
/// This is analogous to `new_unchecked` in that it does not check
/// whether the decoded integer is representable as a small index.
#[inline]
pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name {
$name(SmallIndex::from_ne_bytes_unchecked(bytes))
}
/// Return the underlying integer as raw bytes in native endian
/// format.
#[inline]
pub fn to_ne_bytes(&self) -> [u8; 4] {
self.0.to_ne_bytes()
}
/// Returns an iterator over all values from 0 up to and not
/// including the given length.
///
/// If the given length exceeds this type's limit, then this
/// panics.
pub(crate) fn iter(len: usize) -> $iter {
$iter::new(len)
}
}
// We write our own Debug impl so that we get things like PatternID(5)
// instead of PatternID(SmallIndex(5)).
impl core::fmt::Debug for $name {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish()
}
}
impl<T> core::ops::Index<$name> for [T] {
type Output = T;
#[inline]
fn index(&self, index: $name) -> &T {
&self[index.as_usize()]
}
}
impl<T> core::ops::IndexMut<$name> for [T] {
#[inline]
fn index_mut(&mut self, index: $name) -> &mut T {
&mut self[index.as_usize()]
}
}
#[cfg(feature = "alloc")]
impl<T> core::ops::Index<$name> for Vec<T> {
type Output = T;
#[inline]
fn index(&self, index: $name) -> &T {
&self[index.as_usize()]
}
}
#[cfg(feature = "alloc")]
impl<T> core::ops::IndexMut<$name> for Vec<T> {
#[inline]
fn index_mut(&mut self, index: $name) -> &mut T {
&mut self[index.as_usize()]
}
}
impl From<u8> for $name {
fn from(value: u8) -> $name {
$name(SmallIndex::from(value))
}
}
impl TryFrom<u16> for $name {
type Error = $err;
fn try_from(value: u16) -> Result<$name, $err> {
SmallIndex::try_from(value).map($name).map_err($err)
}
}
impl TryFrom<u32> for $name {
type Error = $err;
fn try_from(value: u32) -> Result<$name, $err> {
SmallIndex::try_from(value).map($name).map_err($err)
}
}
impl TryFrom<u64> for $name {
type Error = $err;
fn try_from(value: u64) -> Result<$name, $err> {
SmallIndex::try_from(value).map($name).map_err($err)
}
}
impl TryFrom<usize> for $name {
type Error = $err;
fn try_from(value: usize) -> Result<$name, $err> {
SmallIndex::try_from(value).map($name).map_err($err)
}
}
#[cfg(test)]
impl quickcheck::Arbitrary for $name {
fn arbitrary(gen: &mut quickcheck::Gen) -> $name {
$name(SmallIndex::arbitrary(gen))
}
}
/// This error occurs when a value could not be constructed.
///
/// This occurs when given an integer exceeding the maximum allowed
/// value.
///
/// When the `std` feature is enabled, this implements the `Error`
/// trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct $err(SmallIndexError);
impl $err {
/// Returns the value that could not be converted to an ID.
pub fn attempted(&self) -> u64 {
self.0.attempted()
}
}
#[cfg(feature = "std")]
impl std::error::Error for $err {}
impl core::fmt::Display for $err {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(
f,
"failed to create {} from {:?}, which exceeds {:?}",
stringify!($name),
self.attempted(),
$name::MAX,
)
}
}
#[derive(Clone, Debug)]
pub(crate) struct $iter(SmallIndexIter);
impl $iter {
fn new(len: usize) -> $iter {
assert!(
len <= $name::LIMIT,
"cannot create iterator for {} when number of \
elements exceed {:?}",
stringify!($name),
$name::LIMIT,
);
$iter(SmallIndexIter { rng: 0..len })
}
}
impl Iterator for $iter {
type Item = $name;
fn next(&mut self) -> Option<$name> {
self.0.next().map($name)
}
}
/// An iterator adapter that is like std::iter::Enumerate, but attaches
/// small index values instead. It requires `ExactSizeIterator`. At
/// construction, it ensures that the index of each element in the
/// iterator is representable in the corresponding small index type.
#[derive(Clone, Debug)]
pub(crate) struct $withiter<I> {
it: I,
ids: $iter,
}
impl<I: Iterator + ExactSizeIterator> $withiter<I> {
fn new(it: I) -> $withiter<I> {
let ids = $name::iter(it.len());
$withiter { it, ids }
}
}
impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> {
type Item = ($name, I::Item);
fn next(&mut self) -> Option<($name, I::Item)> {
let item = self.it.next()?;
// The number of elements in this iterator must match, according
// to the contract of ExactSizeIterator.
let id = self.ids.next().unwrap();
Some((id, item))
}
}
};
}
/// The identifier of a regex pattern, represented by a [`SmallIndex`].
///
/// The identifier for a pattern corresponds to its relative position among
/// other patterns in a single finite state machine. Namely, when building
/// a multi-pattern regex engine, one must supply a sequence of patterns to
/// match. The position (starting at 0) of each pattern in that sequence
/// represents its identifier. This identifier is in turn used to identify and
/// report matches of that pattern in various APIs.
///
/// See the [`SmallIndex`] type for more information about what it means for
/// a pattern ID to be a "small index."
///
/// Note that this type is defined in the
/// [`util::primitives`](crate::util::primitives) module, but it is also
/// re-exported at the crate root due to how common it is.
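///
/// # Example
///
/// This example is an editorial sketch added for clarity (it is not part
/// of the original documentation). It assumes the crate's `meta::Regex`
/// multi-pattern API and shows that a pattern's position in the supplied
/// sequence determines its `PatternID`.
///
/// ```
/// use regex_automata::meta::Regex;
///
/// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+"])?;
/// // "xyz" is found by the first pattern and "123" by the second.
/// let m = re.find_iter("xyz 123").nth(1).unwrap();
/// assert_eq!(1, m.pattern().as_usize());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```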
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct PatternID(SmallIndex);
/// The identifier of a finite automaton state, represented by a
/// [`SmallIndex`].
///
/// Most regex engines in this crate are built on top of finite automata. Each
/// state in a finite automaton defines transitions from its state to another.
/// Those transitions point to other states via their identifiers, i.e., a
/// `StateID`. Since finite automata tend to contain many transitions, it is
/// much more memory efficient to define state IDs as small indices.
///
/// See the [`SmallIndex`] type for more information about what it means for
/// a state ID to be a "small index."
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct StateID(SmallIndex);
index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter);
index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter);
/// A utility trait that defines a couple of adapters for making it convenient
/// to access indices as "small index" types. We require ExactSizeIterator so
/// that iterator construction can do a single check to make sure the index of
/// each element is representable by its small index type.
pub(crate) trait IteratorIndexExt: Iterator {
fn with_pattern_ids(self) -> WithPatternIDIter<Self>
where
Self: Sized + ExactSizeIterator,
{
WithPatternIDIter::new(self)
}
fn with_state_ids(self) -> WithStateIDIter<Self>
where
Self: Sized + ExactSizeIterator,
{
WithStateIDIter::new(self)
}
}
impl<I: Iterator> IteratorIndexExt for I {}

1988
vendor/regex-automata/src/util/search.rs vendored Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,239 @@
/*!
This module defines a sparse set data structure. Its most interesting
properties are:
* They preserve insertion order.
* Set membership testing is done in constant time.
* Set insertion is done in constant time.
* Clearing the set is done in constant time.
The cost for doing this is that the capacity of the set needs to be known up
front, and the elements in the set are limited to state identifiers.
These sets are principally used when traversing an NFA state graph. This
happens at search time, for example, in the PikeVM. It also happens during DFA
determinization.
*/
use alloc::{vec, vec::Vec};
use crate::util::primitives::StateID;
/// A pair of sparse sets.
///
/// This is useful when one needs to compute NFA epsilon closures from a
/// previous set of states derived from an epsilon closure. One set can be the
/// starting states whereas the other set can be the destination states after
/// following the transitions for a particular byte of input.
///
/// There is no significance to 'set1' or 'set2'. They are both sparse sets of
/// the same size.
///
/// The members of this struct are exposed so that callers may borrow 'set1'
/// and 'set2' individually without being forced to borrow both at the same
/// time.
#[derive(Clone, Debug)]
pub(crate) struct SparseSets {
pub(crate) set1: SparseSet,
pub(crate) set2: SparseSet,
}
impl SparseSets {
/// Create a new pair of sparse sets where each set has the given capacity.
///
/// This panics if the capacity given is bigger than `StateID::LIMIT`.
pub(crate) fn new(capacity: usize) -> SparseSets {
SparseSets {
set1: SparseSet::new(capacity),
set2: SparseSet::new(capacity),
}
}
/// Resizes these sparse sets to have the new capacity given.
///
/// The sets are automatically cleared.
///
/// This panics if the capacity given is bigger than `StateID::LIMIT`.
#[inline]
pub(crate) fn resize(&mut self, new_capacity: usize) {
self.set1.resize(new_capacity);
self.set2.resize(new_capacity);
}
/// Clear both sparse sets.
pub(crate) fn clear(&mut self) {
self.set1.clear();
self.set2.clear();
}
/// Swap set1 with set2.
pub(crate) fn swap(&mut self) {
core::mem::swap(&mut self.set1, &mut self.set2);
}
/// Returns the memory usage, in bytes, used by this pair of sparse sets.
pub(crate) fn memory_usage(&self) -> usize {
self.set1.memory_usage() + self.set2.memory_usage()
}
}
/// A sparse set used for representing ordered NFA states.
///
/// This supports constant time addition and membership testing. Clearing an
/// entire set can also be done in constant time. Iteration yields elements
/// in the order in which they were inserted.
///
/// The data structure is based on: https://research.swtch.com/sparse
/// Note though that we don't actually use uninitialized memory. We generally
/// reuse sparse sets, so the initial allocation cost is bearable. However, its
/// other properties listed above are extremely useful.
#[derive(Clone)]
pub(crate) struct SparseSet {
/// The number of elements currently in this set.
len: usize,
/// Dense contains the ids in the order in which they were inserted.
dense: Vec<StateID>,
/// Sparse maps ids to their location in dense.
///
/// A state ID is in the set if and only if
/// sparse[id] < len && id == dense[sparse[id]].
///
/// Note that these are indices into 'dense'. It's a little weird to use
/// StateID here, but we know our length can never exceed the bounds of
/// StateID (enforced by 'resize') and StateID will be at most 4 bytes
/// whereas a usize is likely double that in most cases.
sparse: Vec<StateID>,
}
impl SparseSet {
/// Create a new sparse set with the given capacity.
///
/// Sparse sets have a fixed size and they cannot grow. Attempting to
/// insert more distinct elements than the total capacity of the set will
/// result in a panic.
///
/// This panics if the capacity given is bigger than `StateID::LIMIT`.
#[inline]
pub(crate) fn new(capacity: usize) -> SparseSet {
let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] };
set.resize(capacity);
set
}
/// Resizes this sparse set to have the new capacity given.
///
/// This set is automatically cleared.
///
/// This panics if the capacity given is bigger than `StateID::LIMIT`.
#[inline]
pub(crate) fn resize(&mut self, new_capacity: usize) {
assert!(
new_capacity <= StateID::LIMIT,
"sparse set capacity cannot exceed {:?}",
StateID::LIMIT
);
self.clear();
self.dense.resize(new_capacity, StateID::ZERO);
self.sparse.resize(new_capacity, StateID::ZERO);
}
/// Returns the capacity of this set.
///
/// The capacity represents a fixed limit on the number of distinct
/// elements that are allowed in this set. The capacity cannot be changed.
#[inline]
pub(crate) fn capacity(&self) -> usize {
self.dense.len()
}
/// Returns the number of elements in this set.
#[inline]
pub(crate) fn len(&self) -> usize {
self.len
}
/// Returns true if and only if this set is empty.
#[inline]
pub(crate) fn is_empty(&self) -> bool {
self.len() == 0
}
/// Insert the state ID value into this set and return true if the given
/// state ID was not previously in this set.
///
/// This operation is idempotent. If the given value is already in this
/// set, then this is a no-op.
///
/// If more than `capacity` ids are inserted, then this panics.
///
/// This is marked as inline(always) since the compiler won't inline it
/// otherwise, and it's a fairly hot piece of code in DFA determinization.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn insert(&mut self, id: StateID) -> bool {
if self.contains(id) {
return false;
}
let i = self.len();
assert!(
i < self.capacity(),
"{:?} exceeds capacity of {:?} when inserting {:?}",
i,
self.capacity(),
id,
);
// OK since i < self.capacity() and self.capacity() is guaranteed to
// be <= StateID::LIMIT.
let index = StateID::new_unchecked(i);
self.dense[index] = id;
self.sparse[id] = index;
self.len += 1;
true
}
/// Returns true if and only if this set contains the given value.
#[inline]
pub(crate) fn contains(&self, id: StateID) -> bool {
let index = self.sparse[id];
index.as_usize() < self.len() && self.dense[index] == id
}
/// Clear this set such that it has no members.
#[inline]
pub(crate) fn clear(&mut self) {
self.len = 0;
}
/// Returns an iterator over the members of this set in the order in which
/// they were inserted.
#[inline]
pub(crate) fn iter(&self) -> SparseSetIter<'_> {
SparseSetIter(self.dense[..self.len()].iter())
}
/// Returns the heap memory usage, in bytes, used by this sparse set.
#[inline]
pub(crate) fn memory_usage(&self) -> usize {
self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE
}
}
impl core::fmt::Debug for SparseSet {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let elements: Vec<StateID> = self.iter().collect();
f.debug_tuple("SparseSet").field(&elements).finish()
}
}
/// An iterator over all elements in a sparse set.
///
/// The lifetime `'a` refers to the lifetime of the set being iterated over.
#[derive(Debug)]
pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
impl<'a> Iterator for SparseSetIter<'a> {
type Item = StateID;
#[cfg_attr(feature = "perf-inline", inline(always))]
fn next(&mut self) -> Option<StateID> {
self.0.next().copied()
}
}
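// What follows is an editorial addition rather than part of the upstream
// file: a small sanity test sketching the properties documented above,
// namely that insertion preserves order and is idempotent, that membership
// testing works and that clearing empties the set.
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn insert_contains_iterate_clear() {
let mut set = SparseSet::new(10);
assert!(set.is_empty());
assert!(set.insert(StateID::must(3)));
assert!(set.insert(StateID::must(1)));
// Re-inserting an existing member is a no-op that returns false.
assert!(!set.insert(StateID::must(3)));
assert_eq!(2, set.len());
assert!(set.contains(StateID::must(1)));
assert!(!set.contains(StateID::must(7)));
// Iteration yields members in the order in which they were inserted.
let mut it = set.iter();
assert_eq!(Some(StateID::must(3)), it.next());
assert_eq!(Some(StateID::must(1)), it.next());
assert_eq!(None, it.next());
// Clearing removes all members without reallocating.
set.clear();
assert!(set.is_empty());
assert!(!set.contains(StateID::must(3)));
}
}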

479
vendor/regex-automata/src/util/start.rs vendored Normal file
View File

@@ -0,0 +1,479 @@
/*!
Provides helpers for dealing with start state configurations in DFAs.
*/
use crate::util::{
look::LookMatcher,
search::{Anchored, Input},
wire::{self, DeserializeError, SerializeError},
};
/// The configuration used to determine a DFA's start state for a search.
///
/// A DFA has a single starting state in the typical textbook description. That
/// is, it corresponds to the set of all starting states for the NFA that built
/// it, along with their epsilon closures. In this crate, however, DFAs have
/// many possible start states due to a few factors:
///
/// * DFAs support the ability to run either anchored or unanchored searches.
/// Each type of search needs its own start state. For example, an unanchored
/// search requires starting at a state corresponding to a regex with a
/// `(?s-u:.)*?` prefix, which will match through anything.
/// * DFAs also optionally support starting an anchored search for any one
/// specific pattern. Each such pattern requires its own start state.
/// * If a look-behind assertion like `^` or `\b` is used in the regex, then
/// the DFA will need to inspect a single byte immediately before the start of
/// the search to choose the correct start state.
///
/// Indeed, this configuration precisely encapsulates all of the above factors.
/// The [`Config::anchored`] method sets which kind of anchored search to
/// perform while the [`Config::look_behind`] method provides a way to set
/// the byte that occurs immediately before the start of the search.
///
/// Generally speaking, this type is only useful when you want to run searches
/// without using an [`Input`]. In particular, an `Input` wants a haystack
/// slice, but callers may not have a contiguous sequence of bytes as a
/// haystack in all cases. This type provides a lower level of control such
/// that callers can provide their own anchored configuration and look-behind
/// byte explicitly.
///
/// # Example
///
/// This shows basic usage that permits running a search with a DFA without
/// using the `Input` abstraction.
///
/// ```
/// use regex_automata::{
/// dfa::{Automaton, dense},
/// util::start,
/// Anchored,
/// };
///
/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
/// let haystack = "quartz";
///
/// let config = start::Config::new().anchored(Anchored::Yes);
/// let mut state = dfa.start_state(&config)?;
/// for &b in haystack.as_bytes().iter() {
/// state = dfa.next_state(state, b);
/// }
/// state = dfa.next_eoi_state(state);
/// assert!(dfa.is_match_state(state));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// This example shows how to correctly run a search that doesn't begin at
/// the start of a haystack. Notice how we set the look-behind byte, and as
/// a result, the `\b` assertion does not match.
///
/// ```
/// use regex_automata::{
/// dfa::{Automaton, dense},
/// util::start,
/// Anchored,
/// };
///
/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
/// let haystack = "quartz";
///
/// let config = start::Config::new()
/// .anchored(Anchored::Yes)
/// .look_behind(Some(b'q'));
/// let mut state = dfa.start_state(&config)?;
/// for &b in haystack.as_bytes().iter().skip(1) {
/// state = dfa.next_state(state, b);
/// }
/// state = dfa.next_eoi_state(state);
/// // No match!
/// assert!(!dfa.is_match_state(state));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// If we had instead not set a look-behind byte, then the DFA would assume
/// that it was starting at the beginning of the haystack, and thus `\b` should
/// match. This in turn would result in erroneously reporting a match:
///
/// ```
/// use regex_automata::{
/// dfa::{Automaton, dense},
/// util::start,
/// Anchored,
/// };
///
/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
/// let haystack = "quartz";
///
/// // Whoops, forgot the look-behind byte...
/// let config = start::Config::new().anchored(Anchored::Yes);
/// let mut state = dfa.start_state(&config)?;
/// for &b in haystack.as_bytes().iter().skip(1) {
/// state = dfa.next_state(state, b);
/// }
/// state = dfa.next_eoi_state(state);
/// // And now we get a match unexpectedly.
/// assert!(dfa.is_match_state(state));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct Config {
look_behind: Option<u8>,
anchored: Anchored,
}
impl Config {
/// Create a new default start configuration.
///
/// The default is an unanchored search that starts at the beginning of the
/// haystack.
pub fn new() -> Config {
Config { anchored: Anchored::No, look_behind: None }
}
/// A convenience routine for building a start configuration from an
/// [`Input`] for a forward search.
///
/// This automatically sets the look-behind byte to the byte immediately
/// preceding the start of the search. If the start of the search is at
/// offset `0`, then no look-behind byte is set.
pub fn from_input_forward(input: &Input<'_>) -> Config {
let look_behind = input
.start()
.checked_sub(1)
.and_then(|i| input.haystack().get(i).copied());
Config { look_behind, anchored: input.get_anchored() }
}
/// A convenience routine for building a start configuration from an
/// [`Input`] for a reverse search.
///
/// This automatically sets the look-behind byte to the byte immediately
/// following the end of the search. If the end of the search is at
/// offset `haystack.len()`, then no look-behind byte is set.
pub fn from_input_reverse(input: &Input<'_>) -> Config {
let look_behind = input.haystack().get(input.end()).copied();
Config { look_behind, anchored: input.get_anchored() }
}
/// Set the look-behind byte at the start of a search.
///
/// Unless the search is intended to logically start at the beginning of a
/// haystack, this should _always_ be set to the byte immediately preceding
/// the start of the search. If no look-behind byte is set, then the start
/// configuration will assume it is at the beginning of the haystack. For
/// example, the anchor `^` will match.
///
/// The default is that no look-behind byte is set.
pub fn look_behind(mut self, byte: Option<u8>) -> Config {
self.look_behind = byte;
self
}
/// Set the anchored mode of a search.
///
/// The default is an unanchored search.
pub fn anchored(mut self, mode: Anchored) -> Config {
self.anchored = mode;
self
}
/// Return the look-behind byte in this configuration, if one exists.
pub fn get_look_behind(&self) -> Option<u8> {
self.look_behind
}
/// Return the anchored mode in this configuration.
pub fn get_anchored(&self) -> Anchored {
self.anchored
}
}
/// A map from every possible byte value to its corresponding starting
/// configuration.
///
/// This map is used to look up the start configuration for a particular
/// position in a haystack. This start configuration is then used in
/// combination with things like the anchored mode and pattern ID to fully
/// determine the start state.
///
/// Generally speaking, this map is only used for fully compiled DFAs and lazy
/// DFAs. For NFAs (including the one-pass DFA), the start state is generally
/// selected by virtue of traversing the NFA state graph. DFAs do the same
/// thing, but at build time and not search time. (Well, technically the lazy
/// DFA does it at search time, but it does enough work to cache the full
/// result of the epsilon closure that the NFA engines tend to need to do.)
#[derive(Clone)]
pub(crate) struct StartByteMap {
map: [Start; 256],
}
impl StartByteMap {
/// Create a new map from byte values to their corresponding starting
/// configurations. The map is determined, in part, by how look-around
/// assertions are matched via the matcher given.
pub(crate) fn new(lookm: &LookMatcher) -> StartByteMap {
let mut map = [Start::NonWordByte; 256];
map[usize::from(b'\n')] = Start::LineLF;
map[usize::from(b'\r')] = Start::LineCR;
map[usize::from(b'_')] = Start::WordByte;
let mut byte = b'0';
while byte <= b'9' {
map[usize::from(byte)] = Start::WordByte;
byte += 1;
}
byte = b'A';
while byte <= b'Z' {
map[usize::from(byte)] = Start::WordByte;
byte += 1;
}
byte = b'a';
while byte <= b'z' {
map[usize::from(byte)] = Start::WordByte;
byte += 1;
}
let lineterm = lookm.get_line_terminator();
// If our line terminator is normal, then it is already handled by
// the LineLF and LineCR configurations. But if it's weird, then we
// overwrite whatever was there before for that terminator with a
// special configuration. The trick here is that if the terminator
// is, say, a word byte like `a`, then callers seeing this start
// configuration need to account for that and build their DFA state as
// if it *also* came from a word byte.
if lineterm != b'\r' && lineterm != b'\n' {
map[usize::from(lineterm)] = Start::CustomLineTerminator;
}
StartByteMap { map }
}
/// Return the starting configuration for the given look-behind byte.
///
/// If no look-behind exists, callers should use `Start::Text`.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn get(&self, byte: u8) -> Start {
self.map[usize::from(byte)]
}
/// Deserializes a start byte map from the given slice. If the slice is of
/// insufficient length or otherwise contains an impossible mapping, then
/// an error is returned. Upon success, the number of bytes read along with
/// the map are returned. The number of bytes read is always a multiple of
/// 8.
pub(crate) fn from_bytes(
slice: &[u8],
) -> Result<(StartByteMap, usize), DeserializeError> {
wire::check_slice_len(slice, 256, "start byte map")?;
let mut map = [Start::NonWordByte; 256];
for (i, &repr) in slice[..256].iter().enumerate() {
map[i] = match Start::from_usize(usize::from(repr)) {
Some(start) => start,
None => {
return Err(DeserializeError::generic(
"found invalid starting configuration",
))
}
};
}
Ok((StartByteMap { map }, 256))
}
/// Writes this map to the given byte buffer. If the given buffer is too
/// small, then an error is returned. Upon success, the total number of
/// bytes written is returned. The number of bytes written is guaranteed to
/// be a multiple of 8.
pub(crate) fn write_to(
&self,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
let nwrite = self.write_to_len();
if dst.len() < nwrite {
return Err(SerializeError::buffer_too_small("start byte map"));
}
for (i, &start) in self.map.iter().enumerate() {
dst[i] = start.as_u8();
}
Ok(nwrite)
}
/// Returns the total number of bytes written by `write_to`.
pub(crate) fn write_to_len(&self) -> usize {
256
}
}
impl core::fmt::Debug for StartByteMap {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
use crate::util::escape::DebugByte;
write!(f, "StartByteMap{{")?;
for byte in 0..=255 {
if byte > 0 {
write!(f, ", ")?;
}
let start = self.map[usize::from(byte)];
write!(f, "{:?} => {:?}", DebugByte(byte), start)?;
}
write!(f, "}}")?;
Ok(())
}
}
/// Represents the six possible starting configurations of a DFA search.
///
/// The starting configuration is determined by inspecting the beginning
/// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID
/// (if specified) and the type of search (anchored or not) is what selects the
/// start state to use in a DFA.
///
/// As one example, if a DFA only supports unanchored searches and does not
/// support anchored searches for each pattern, then it will have at most 6
/// distinct start states. (Some start states may be reused if determinization
/// can determine that they will be equivalent.) If the DFA supports both
/// anchored and unanchored searches, then it will have a maximum of 12
/// distinct start states. Finally, if the DFA also supports anchored searches
/// for each pattern, then it can have up to `12 + (N * 6)` start states, where
/// `N` is the number of patterns.
///
/// Handling each of these starting configurations in the context of DFA
/// determinization can be *quite* tricky and subtle. But the code is small
/// and can be found at `crate::util::determinize::set_lookbehind_from_start`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum Start {
/// This occurs when the starting position is not any of the ones below.
NonWordByte = 0,
/// This occurs when the byte immediately preceding the start of the search
/// is an ASCII word byte.
WordByte = 1,
/// This occurs when the starting position of the search corresponds to the
/// beginning of the haystack.
Text = 2,
/// This occurs when the byte immediately preceding the start of the search
/// is a line terminator. Specifically, `\n`.
LineLF = 3,
/// This occurs when the byte immediately preceding the start of the search
/// is a line terminator. Specifically, `\r`.
LineCR = 4,
/// This occurs when a custom line terminator has been set via a
/// `LookMatcher`, and when that line terminator is neither a `\r` or a
/// `\n`.
///
/// If the custom line terminator is a word byte, then this start
/// configuration is still selected. DFAs that implement word boundary
/// assertions will likely need to check whether the custom line terminator
/// is a word byte, in which case, it should behave as if the byte
/// satisfies `\b` in addition to multi-line anchors.
CustomLineTerminator = 5,
}
impl Start {
/// Return the starting state corresponding to the given integer. If no
/// starting state exists for the given integer, then None is returned.
pub(crate) fn from_usize(n: usize) -> Option<Start> {
match n {
0 => Some(Start::NonWordByte),
1 => Some(Start::WordByte),
2 => Some(Start::Text),
3 => Some(Start::LineLF),
4 => Some(Start::LineCR),
5 => Some(Start::CustomLineTerminator),
_ => None,
}
}
/// Returns the total number of starting state configurations.
pub(crate) fn len() -> usize {
6
}
/// Return this starting configuration as `u8` integer. It is guaranteed to
/// be less than `Start::len()`.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn as_u8(&self) -> u8 {
// AFAIK, 'as' is the only way to zero-cost convert an int enum to an
// actual int.
*self as u8
}
/// Return this starting configuration as a `usize` integer. It is
/// guaranteed to be less than `Start::len()`.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn as_usize(&self) -> usize {
usize::from(self.as_u8())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn start_fwd_done_range() {
let smap = StartByteMap::new(&LookMatcher::default());
let input = Input::new("").range(1..0);
let config = Config::from_input_forward(&input);
let start =
config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
assert_eq!(Start::Text, start);
}
#[test]
fn start_rev_done_range() {
let smap = StartByteMap::new(&LookMatcher::default());
let input = Input::new("").range(1..0);
let config = Config::from_input_reverse(&input);
let start =
config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
assert_eq!(Start::Text, start);
}
#[test]
fn start_fwd() {
let f = |haystack, start, end| {
let smap = StartByteMap::new(&LookMatcher::default());
let input = Input::new(haystack).range(start..end);
let config = Config::from_input_forward(&input);
let start =
config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
start
};
assert_eq!(Start::Text, f("", 0, 0));
assert_eq!(Start::Text, f("abc", 0, 3));
assert_eq!(Start::Text, f("\nabc", 0, 3));
assert_eq!(Start::LineLF, f("\nabc", 1, 3));
assert_eq!(Start::LineCR, f("\rabc", 1, 3));
assert_eq!(Start::WordByte, f("abc", 1, 3));
assert_eq!(Start::NonWordByte, f(" abc", 1, 3));
}
#[test]
fn start_rev() {
let f = |haystack, start, end| {
let smap = StartByteMap::new(&LookMatcher::default());
let input = Input::new(haystack).range(start..end);
let config = Config::from_input_reverse(&input);
let start =
config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
start
};
assert_eq!(Start::Text, f("", 0, 0));
assert_eq!(Start::Text, f("abc", 0, 3));
assert_eq!(Start::Text, f("abc\n", 0, 4));
assert_eq!(Start::LineLF, f("abc\nz", 0, 3));
assert_eq!(Start::LineCR, f("abc\rz", 0, 3));
assert_eq!(Start::WordByte, f("abc", 0, 2));
assert_eq!(Start::NonWordByte, f("abc ", 0, 3));
}
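// An editorial addition, not part of the upstream tests: checks the byte
// classifications described in the `StartByteMap::new` documentation above.
#[test]
fn start_byte_map_classification() {
let smap = StartByteMap::new(&LookMatcher::default());
assert_eq!(Start::LineLF, smap.get(b'\n'));
assert_eq!(Start::LineCR, smap.get(b'\r'));
assert_eq!(Start::WordByte, smap.get(b'_'));
assert_eq!(Start::WordByte, smap.get(b'7'));
assert_eq!(Start::WordByte, smap.get(b'q'));
assert_eq!(Start::NonWordByte, smap.get(b' '));
}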
}

482
vendor/regex-automata/src/util/syntax.rs vendored Normal file
View File

@@ -0,0 +1,482 @@
/*!
Utilities for dealing with the syntax of a regular expression.
This module currently only exposes a [`Config`] type that
itself represents a wrapper around the configuration for a
[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder). The purpose of
this wrapper is to make configuring syntax options very similar to how other
configuration is done throughout this crate. Namely, instead of duplicating
syntax options across every builder (of which there are many), we instead
create small config objects like this one that can be passed around and
composed.
*/
use alloc::{vec, vec::Vec};
use regex_syntax::{
ast,
hir::{self, Hir},
Error, ParserBuilder,
};
/// A convenience routine for parsing a pattern into an HIR value with the
/// default configuration.
///
/// # Example
///
/// This shows how to parse a pattern into an HIR value:
///
/// ```
/// use regex_automata::util::syntax;
///
/// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?;
/// assert_eq!(Some(1), hir.properties().static_explicit_captures_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn parse(pattern: &str) -> Result<Hir, Error> {
parse_with(pattern, &Config::default())
}
/// A convenience routine for parsing many patterns into HIR values with the
/// default configuration.
///
/// # Example
///
/// This shows how to parse many patterns into their corresponding HIR values:
///
/// ```
/// use {
/// regex_automata::util::syntax,
/// regex_syntax::hir::Properties,
/// };
///
/// let hirs = syntax::parse_many(&[
/// r"([a-z]+)|([0-9]+)",
/// r"foo(A-Z]+)bar",
/// ])?;
/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
/// assert_eq!(Some(1), props.static_explicit_captures_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> {
parse_many_with(patterns, &Config::default())
}
/// A convenience routine for parsing a pattern into an HIR value using a
/// `Config`.
///
/// # Example
///
/// This shows how to parse a pattern into an HIR value with a non-default
/// configuration:
///
/// ```
/// use regex_automata::util::syntax;
///
/// let hir = syntax::parse_with(
/// r"^[a-z]+$",
/// &syntax::Config::new().multi_line(true).crlf(true),
/// )?;
/// assert!(hir.properties().look_set().contains_anchor_crlf());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> {
let mut builder = ParserBuilder::new();
config.apply(&mut builder);
builder.build().parse(pattern)
}
/// A convenience routine for parsing many patterns into HIR values using a
/// `Config`.
///
/// # Example
///
/// This shows how to parse many patterns into their corresponding HIR values
/// with a non-default configuration:
///
/// ```
/// use {
/// regex_automata::util::syntax,
/// regex_syntax::hir::Properties,
/// };
///
/// let patterns = &[
/// r"([a-z]+)|([0-9]+)",
/// r"\W",
/// r"foo(A-Z]+)bar",
/// ];
/// let config = syntax::Config::new().unicode(false).utf8(false);
/// let hirs = syntax::parse_many_with(patterns, &config)?;
/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
/// assert!(!props.is_utf8());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn parse_many_with<P: AsRef<str>>(
patterns: &[P],
config: &Config,
) -> Result<Vec<Hir>, Error> {
let mut builder = ParserBuilder::new();
config.apply(&mut builder);
let mut hirs = vec![];
for p in patterns.iter() {
hirs.push(builder.build().parse(p.as_ref())?);
}
Ok(hirs)
}
/// A common set of configuration options that apply to the syntax of a regex.
///
/// This represents a group of configuration options that specifically apply
/// to how the concrete syntax of a regular expression is interpreted. In
/// particular, they are generally forwarded to the
/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html)
/// in the
/// [`regex-syntax`](https://docs.rs/regex-syntax)
/// crate when building a regex from its concrete syntax directly.
///
/// These options are defined as a group since they apply to every regex engine
/// in this crate. Instead of re-defining them on every engine's builder, they
/// are instead provided here as one cohesive unit.
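///
/// # Example
///
/// A minimal sketch (an editorial addition, not part of the original
/// documentation) showing how a single `Config` can be reused across
/// several parse calls from this module:
///
/// ```
/// use regex_automata::util::syntax;
///
/// let config = syntax::Config::new().case_insensitive(true);
/// let hir1 = syntax::parse_with(r"abc", &config)?;
/// let hir2 = syntax::parse_with(r"xyz", &config)?;
/// // Both patterns were parsed with the same syntax options.
/// assert!(hir1.properties().is_utf8());
/// assert!(hir2.properties().is_utf8());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```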
#[derive(Clone, Copy, Debug)]
pub struct Config {
case_insensitive: bool,
multi_line: bool,
dot_matches_new_line: bool,
crlf: bool,
line_terminator: u8,
swap_greed: bool,
ignore_whitespace: bool,
unicode: bool,
utf8: bool,
nest_limit: u32,
octal: bool,
}
impl Config {
/// Return a new default syntax configuration.
pub fn new() -> Config {
// These defaults match the ones used in regex-syntax.
Config {
case_insensitive: false,
multi_line: false,
dot_matches_new_line: false,
crlf: false,
line_terminator: b'\n',
swap_greed: false,
ignore_whitespace: false,
unicode: true,
utf8: true,
nest_limit: 250,
octal: false,
}
}
/// Enable or disable the case insensitive flag by default.
///
/// When Unicode mode is enabled, case insensitivity is Unicode-aware.
/// Specifically, it will apply the "simple" case folding rules as
/// specified by Unicode.
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `i` flag.
pub fn case_insensitive(mut self, yes: bool) -> Config {
self.case_insensitive = yes;
self
}
/// Enable or disable the multi-line matching flag by default.
///
/// When this is enabled, the `^` and `$` look-around assertions will
/// match immediately after and immediately before a new line character,
/// respectively. Note that the `\A` and `\z` look-around assertions are
/// unaffected by this setting and always correspond to matching at the
/// beginning and end of the input.
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `m` flag.
pub fn multi_line(mut self, yes: bool) -> Config {
self.multi_line = yes;
self
}
/// Enable or disable the "dot matches any character" flag by default.
///
/// When this is enabled, `.` will match any character. When it's disabled,
/// then `.` will match any character except for a new line character.
///
/// Note that `.` is impacted by whether the "unicode" setting is enabled
/// or not. When Unicode is enabled (the default), `.` will match any UTF-8
/// encoding of any Unicode scalar value (sans a new line, depending on
/// whether this "dot matches new line" option is enabled). When Unicode
/// mode is disabled, `.` will match any byte instead. Because of this,
/// when Unicode mode is disabled, `.` can only be used when the "allow
/// invalid UTF-8" option is enabled, since `.` could otherwise match
/// invalid UTF-8.
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `s` flag.
pub fn dot_matches_new_line(mut self, yes: bool) -> Config {
self.dot_matches_new_line = yes;
self
}
/// Enable or disable the "CRLF mode" flag by default.
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `R` flag.
///
/// When CRLF mode is enabled, the following happens:
///
/// * Unless `dot_matches_new_line` is enabled, `.` will match any character
/// except for `\r` and `\n`.
/// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,
/// `\r` and `\n` as line terminators. And in particular, neither will
/// match between a `\r` and a `\n`.
pub fn crlf(mut self, yes: bool) -> Config {
self.crlf = yes;
self
}
/// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
///
/// Namely, instead of `.` (by default) matching everything except for `\n`,
/// this will cause `.` to match everything except for the byte given.
///
/// If `.` is used in a context where Unicode mode is enabled and this byte
/// isn't ASCII, then an error will be returned. When Unicode mode is
/// disabled, then any byte is permitted, but will return an error if UTF-8
/// mode is enabled and it is a non-ASCII byte.
///
/// In short, any ASCII value for a line terminator is always okay. But a
/// non-ASCII byte might result in an error depending on whether Unicode
/// mode or UTF-8 mode are enabled.
///
/// Note that if `R` mode is enabled then it always takes precedence and
/// the line terminator will be treated as `\r` and `\n` simultaneously.
///
/// Note also that this *doesn't* impact the look-around assertions
/// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
/// configuration in the regex engine itself.
pub fn line_terminator(mut self, byte: u8) -> Config {
self.line_terminator = byte;
self
}
/// Enable or disable the "swap greed" flag by default.
///
/// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
/// will become greedy.
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `U` flag.
pub fn swap_greed(mut self, yes: bool) -> Config {
self.swap_greed = yes;
self
}
/// Enable verbose mode in the regular expression.
///
/// When enabled, verbose mode permits insignificant whitespace in many
/// places in the regular expression, as well as comments. Comments are
/// started using `#` and continue until the end of the line.
///
/// By default, this is disabled. It may be selectively enabled in the
/// regular expression by using the `x` flag regardless of this setting.
pub fn ignore_whitespace(mut self, yes: bool) -> Config {
self.ignore_whitespace = yes;
self
}
/// Enable or disable the Unicode flag (`u`) by default.
///
/// By default this is **enabled**. It may alternatively be selectively
/// disabled in the regular expression itself via the `u` flag.
///
/// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
/// default), a regular expression will fail to parse if Unicode mode is
/// disabled and a sub-expression could possibly match invalid UTF-8.
///
/// **WARNING**: Unicode mode can greatly increase the size of the compiled
/// DFA, which can noticeably impact both memory usage and compilation
/// time. This is especially noticeable if your regex contains character
/// classes like `\w` that are impacted by whether Unicode is enabled or
/// not. If Unicode is not necessary, you are encouraged to disable it.
pub fn unicode(mut self, yes: bool) -> Config {
self.unicode = yes;
self
}
/// When disabled, the builder will permit the construction of a regular
/// expression that may match invalid UTF-8.
///
/// For example, when [`Config::unicode`] is disabled, then
/// expressions like `[^a]` may match invalid UTF-8 since they can match
/// any single byte that is not `a`. By default, these sub-expressions
/// are disallowed to avoid returning offsets that split a UTF-8
/// encoded codepoint. However, in cases where matching at arbitrary
/// locations is desired, this option can be disabled to permit all such
/// sub-expressions.
///
/// When enabled (the default), the builder is guaranteed to produce a
/// regex that will only ever match valid UTF-8 (otherwise, the builder
/// will return an error).
pub fn utf8(mut self, yes: bool) -> Config {
self.utf8 = yes;
self
}
/// Set the nesting limit used for the regular expression parser.
///
/// The nesting limit controls how deep the abstract syntax tree is allowed
/// to be. If the AST exceeds the given limit (e.g., with too many nested
/// groups), then an error is returned by the parser.
///
/// The purpose of this limit is to act as a heuristic to prevent stack
/// overflow when building a finite automaton from a regular expression's
/// abstract syntax tree. In particular, construction currently uses
/// recursion. In the future, the implementation may stop using recursion
/// and this option will no longer be necessary.
///
/// This limit is not checked until the entire AST is parsed. Therefore,
/// if callers want to put a limit on the amount of heap space used, then
/// they should impose a limit on the length, in bytes, of the concrete
/// pattern string. In particular, this is viable since the parser will
/// limit itself to heap space proportional to the length of the pattern
/// string.
///
/// Note that a nest limit of `0` will return a nest limit error for most
/// patterns but not all. For example, a nest limit of `0` permits `a` but
/// not `ab`, since `ab` requires a concatenation AST item, which results
/// in a nest depth of `1`. In general, a nest limit is not something that
/// manifests in an obvious way in the concrete syntax, therefore, it
/// should not be used in a granular way.
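///
/// # Example
///
/// An illustrative sketch of the behavior described above (an editorial
/// addition, not part of the original documentation):
///
/// ```
/// use regex_automata::util::syntax;
///
/// let config = syntax::Config::new().nest_limit(0);
/// // A bare literal has a nest depth of 0 and is accepted.
/// assert!(syntax::parse_with(r"a", &config).is_ok());
/// // A concatenation has a nest depth of 1, which exceeds the limit.
/// assert!(syntax::parse_with(r"ab", &config).is_err());
/// ```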
pub fn nest_limit(mut self, limit: u32) -> Config {
self.nest_limit = limit;
self
}
/// Whether to support octal syntax or not.
///
/// Octal syntax is a little-known way of uttering Unicode codepoints in
/// a regular expression. For example, `a`, `\x61`, `\u0061` and
/// `\141` are all equivalent regular expressions, where the last example
/// shows octal syntax.
///
/// While supporting octal syntax isn't in and of itself a problem, it does
/// make good error messages harder. That is, in PCRE based regex engines,
/// syntax like `\1` invokes a backreference, which is explicitly
/// unsupported in Rust's regex engine. However, many users expect it to
/// be supported. Therefore, when octal support is disabled, the error
/// message will explicitly mention that backreferences aren't supported.
///
/// Octal syntax is disabled by default.
pub fn octal(mut self, yes: bool) -> Config {
self.octal = yes;
self
}
/// Returns whether "unicode" mode is enabled.
pub fn get_unicode(&self) -> bool {
self.unicode
}
/// Returns whether "case insensitive" mode is enabled.
pub fn get_case_insensitive(&self) -> bool {
self.case_insensitive
}
/// Returns whether "multi line" mode is enabled.
pub fn get_multi_line(&self) -> bool {
self.multi_line
}
/// Returns whether "dot matches new line" mode is enabled.
pub fn get_dot_matches_new_line(&self) -> bool {
self.dot_matches_new_line
}
/// Returns whether "CRLF" mode is enabled.
pub fn get_crlf(&self) -> bool {
self.crlf
}
/// Returns the line terminator in this syntax configuration.
pub fn get_line_terminator(&self) -> u8 {
self.line_terminator
}
/// Returns whether "swap greed" mode is enabled.
pub fn get_swap_greed(&self) -> bool {
self.swap_greed
}
/// Returns whether "ignore whitespace" mode is enabled.
pub fn get_ignore_whitespace(&self) -> bool {
self.ignore_whitespace
}
/// Returns whether UTF-8 mode is enabled.
pub fn get_utf8(&self) -> bool {
self.utf8
}
/// Returns the "nest limit" setting.
pub fn get_nest_limit(&self) -> u32 {
self.nest_limit
}
/// Returns whether "octal" mode is enabled.
pub fn get_octal(&self) -> bool {
self.octal
}
/// Applies this configuration to the given parser.
pub(crate) fn apply(&self, builder: &mut ParserBuilder) {
builder
.unicode(self.unicode)
.case_insensitive(self.case_insensitive)
.multi_line(self.multi_line)
.dot_matches_new_line(self.dot_matches_new_line)
.crlf(self.crlf)
.line_terminator(self.line_terminator)
.swap_greed(self.swap_greed)
.ignore_whitespace(self.ignore_whitespace)
.utf8(self.utf8)
.nest_limit(self.nest_limit)
.octal(self.octal);
}
/// Applies this configuration to the given AST parser.
pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) {
builder
.ignore_whitespace(self.ignore_whitespace)
.nest_limit(self.nest_limit)
.octal(self.octal);
}
/// Applies this configuration to the given AST-to-HIR translator.
pub(crate) fn apply_hir(
&self,
builder: &mut hir::translate::TranslatorBuilder,
) {
builder
.unicode(self.unicode)
.case_insensitive(self.case_insensitive)
.multi_line(self.multi_line)
.crlf(self.crlf)
.dot_matches_new_line(self.dot_matches_new_line)
.line_terminator(self.line_terminator)
.swap_greed(self.swap_greed)
.utf8(self.utf8);
}
}
impl Default for Config {
fn default() -> Config {
Config::new()
}
}

View File

@@ -0,0 +1,17 @@
// This cfg should match the one in src/util/look.rs that uses perl_word.
#[cfg(all(
// We have to explicitly want to support Unicode word boundaries.
feature = "unicode-word-boundary",
not(all(
// If we don't have regex-syntax at all, then we definitely need to
// bring our own \w data table.
feature = "syntax",
// If unicode-perl is enabled, then regex-syntax/unicode-perl is
// also enabled, which in turn means we can use regex-syntax's
// is_word_character routine (and thus use its data tables). But if
// unicode-perl is not enabled, even if syntax is, then we need to
// bring our own.
feature = "unicode-perl",
)),
))]
pub(crate) mod perl_word;

View File

@@ -0,0 +1,806 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate perl-word ucd-16.0.0 --chars
//
// Unicode version: 16.0.0.
//
// ucd-generate 0.3.1 is available on crates.io.
pub const PERL_WORD: &'static [(char, char)] = &[
('0', '9'),
('A', 'Z'),
('_', '_'),
('a', 'z'),
('ª', 'ª'),
('µ', 'µ'),
('º', 'º'),
('À', 'Ö'),
('Ø', 'ö'),
('ø', 'ˁ'),
('ˆ', 'ˑ'),
('ˠ', 'ˤ'),
('ˬ', 'ˬ'),
('ˮ', 'ˮ'),
('\u{300}', 'ʹ'),
('Ͷ', 'ͷ'),
('ͺ', 'ͽ'),
('Ϳ', 'Ϳ'),
('Ά', 'Ά'),
('Έ', 'Ί'),
('Ό', 'Ό'),
('Ύ', 'Ρ'),
('Σ', 'ϵ'),
('Ϸ', 'ҁ'),
('\u{483}', 'ԯ'),
('Ա', 'Ֆ'),
('ՙ', 'ՙ'),
('ՠ', 'ֈ'),
('\u{591}', '\u{5bd}'),
('\u{5bf}', '\u{5bf}'),
('\u{5c1}', '\u{5c2}'),
('\u{5c4}', '\u{5c5}'),
('\u{5c7}', '\u{5c7}'),
('א', 'ת'),
('ׯ', 'ײ'),
('\u{610}', '\u{61a}'),
('ؠ', '٩'),
('ٮ', 'ۓ'),
('ە', '\u{6dc}'),
('\u{6df}', '\u{6e8}'),
('\u{6ea}', 'ۼ'),
('ۿ', 'ۿ'),
('ܐ', '\u{74a}'),
('ݍ', 'ޱ'),
('߀', 'ߵ'),
('ߺ', 'ߺ'),
('\u{7fd}', '\u{7fd}'),
('ࠀ', '\u{82d}'),
('ࡀ', '\u{85b}'),
('ࡠ', 'ࡪ'),
('ࡰ', 'ࢇ'),
('ࢉ', 'ࢎ'),
('\u{897}', '\u{8e1}'),
('\u{8e3}', '\u{963}'),
('', '९'),
('ॱ', 'ঃ'),
('অ', 'ঌ'),
('এ', 'ঐ'),
('ও', 'ন'),
('প', 'র'),
('ল', 'ল'),
('শ', 'হ'),
('\u{9bc}', '\u{9c4}'),
('ে', 'ৈ'),
('ো', 'ৎ'),
('\u{9d7}', '\u{9d7}'),
('ড়', 'ঢ়'),
('য়', '\u{9e3}'),
('', 'ৱ'),
('ৼ', 'ৼ'),
('\u{9fe}', '\u{9fe}'),
('\u{a01}', 'ਃ'),
('ਅ', 'ਊ'),
('ਏ', 'ਐ'),
('ਓ', 'ਨ'),
('ਪ', 'ਰ'),
('ਲ', 'ਲ਼'),
('ਵ', 'ਸ਼'),
('ਸ', 'ਹ'),
('\u{a3c}', '\u{a3c}'),
('ਾ', '\u{a42}'),
('\u{a47}', '\u{a48}'),
('\u{a4b}', '\u{a4d}'),
('\u{a51}', '\u{a51}'),
('ਖ਼', 'ੜ'),
('ਫ਼', 'ਫ਼'),
('', '\u{a75}'),
('\u{a81}', ''),
('અ', 'ઍ'),
('એ', 'ઑ'),
('ઓ', 'ન'),
('પ', 'ર'),
('લ', 'ળ'),
('વ', 'હ'),
('\u{abc}', '\u{ac5}'),
('\u{ac7}', 'ૉ'),
('ો', '\u{acd}'),
('ૐ', 'ૐ'),
('ૠ', '\u{ae3}'),
('', '૯'),
('ૹ', '\u{aff}'),
('\u{b01}', ''),
('ଅ', 'ଌ'),
('ଏ', 'ଐ'),
('ଓ', 'ନ'),
('ପ', 'ର'),
('ଲ', 'ଳ'),
('ଵ', 'ହ'),
('\u{b3c}', '\u{b44}'),
('େ', 'ୈ'),
('ୋ', '\u{b4d}'),
('\u{b55}', '\u{b57}'),
('ଡ଼', 'ଢ଼'),
('ୟ', '\u{b63}'),
('', '୯'),
('ୱ', 'ୱ'),
('\u{b82}', 'ஃ'),
('அ', 'ஊ'),
('எ', 'ஐ'),
('ஒ', 'க'),
('ங', 'ச'),
('ஜ', 'ஜ'),
('ஞ', 'ட'),
('ண', 'த'),
('ந', 'ப'),
('ம', 'ஹ'),
('\u{bbe}', 'ூ'),
('ெ', 'ை'),
('ொ', '\u{bcd}'),
('ௐ', 'ௐ'),
('\u{bd7}', '\u{bd7}'),
('', '௯'),
('\u{c00}', 'ఌ'),
('ఎ', 'ఐ'),
('ఒ', 'న'),
('ప', 'హ'),
('\u{c3c}', 'ౄ'),
('\u{c46}', '\u{c48}'),
('\u{c4a}', '\u{c4d}'),
('\u{c55}', '\u{c56}'),
('ౘ', 'ౚ'),
('ౝ', 'ౝ'),
('ౠ', '\u{c63}'),
('', '౯'),
('ಀ', 'ಃ'),
('ಅ', 'ಌ'),
('ಎ', 'ಐ'),
('ಒ', 'ನ'),
('ಪ', 'ಳ'),
('ವ', 'ಹ'),
('\u{cbc}', 'ೄ'),
('\u{cc6}', '\u{cc8}'),
('\u{cca}', '\u{ccd}'),
('\u{cd5}', '\u{cd6}'),
('ೝ', 'ೞ'),
('ೠ', '\u{ce3}'),
('', '೯'),
('ೱ', 'ೳ'),
('\u{d00}', 'ഌ'),
('എ', 'ഐ'),
('ഒ', '\u{d44}'),
('െ', 'ൈ'),
('ൊ', 'ൎ'),
('ൔ', '\u{d57}'),
('ൟ', '\u{d63}'),
('', '൯'),
('ൺ', 'ൿ'),
('\u{d81}', 'ඃ'),
('අ', 'ඖ'),
('ක', 'න'),
('ඳ', 'ර'),
('ල', 'ල'),
('ව', 'ෆ'),
('\u{dca}', '\u{dca}'),
('\u{dcf}', '\u{dd4}'),
('\u{dd6}', '\u{dd6}'),
('ෘ', '\u{ddf}'),
('෦', '෯'),
('ෲ', 'ෳ'),
('ก', '\u{e3a}'),
('เ', '\u{e4e}'),
('', '๙'),
('ກ', 'ຂ'),
('ຄ', 'ຄ'),
('ຆ', 'ຊ'),
('ຌ', 'ຣ'),
('ລ', 'ລ'),
('ວ', 'ຽ'),
('ເ', 'ໄ'),
('ໆ', 'ໆ'),
('\u{ec8}', '\u{ece}'),
('', '໙'),
('ໜ', 'ໟ'),
('ༀ', 'ༀ'),
('\u{f18}', '\u{f19}'),
('༠', '༩'),
('\u{f35}', '\u{f35}'),
('\u{f37}', '\u{f37}'),
('\u{f39}', '\u{f39}'),
('༾', 'ཇ'),
('ཉ', 'ཬ'),
('\u{f71}', '\u{f84}'),
('\u{f86}', '\u{f97}'),
('\u{f99}', '\u{fbc}'),
('\u{fc6}', '\u{fc6}'),
('က', '၉'),
('ၐ', '\u{109d}'),
('Ⴀ', 'Ⴥ'),
('Ⴧ', 'Ⴧ'),
('Ⴭ', 'Ⴭ'),
('ა', 'ჺ'),
('ჼ', 'ቈ'),
('ቊ', 'ቍ'),
('ቐ', 'ቖ'),
('ቘ', 'ቘ'),
('ቚ', 'ቝ'),
('በ', 'ኈ'),
('ኊ', 'ኍ'),
('ነ', 'ኰ'),
('ኲ', 'ኵ'),
('ኸ', 'ኾ'),
('ዀ', 'ዀ'),
('ዂ', 'ዅ'),
('ወ', 'ዖ'),
('ዘ', 'ጐ'),
('ጒ', 'ጕ'),
('ጘ', 'ፚ'),
('\u{135d}', '\u{135f}'),
('ᎀ', 'ᎏ'),
('', 'Ᏽ'),
('ᏸ', 'ᏽ'),
('ᐁ', 'ᙬ'),
('ᙯ', 'ᙿ'),
('ᚁ', 'ᚚ'),
('ᚠ', 'ᛪ'),
('ᛮ', 'ᛸ'),
('ᜀ', '\u{1715}'),
('ᜟ', '\u{1734}'),
('ᝀ', '\u{1753}'),
('ᝠ', 'ᝬ'),
('ᝮ', 'ᝰ'),
('\u{1772}', '\u{1773}'),
('ក', '\u{17d3}'),
('ៗ', 'ៗ'),
('ៜ', '\u{17dd}'),
('០', '៩'),
('\u{180b}', '\u{180d}'),
('\u{180f}', '᠙'),
('ᠠ', 'ᡸ'),
('ᢀ', 'ᢪ'),
('ᢰ', 'ᣵ'),
('ᤀ', 'ᤞ'),
('\u{1920}', 'ᤫ'),
('ᤰ', '\u{193b}'),
('᥆', 'ᥭ'),
('ᥰ', 'ᥴ'),
('ᦀ', 'ᦫ'),
('ᦰ', 'ᧉ'),
('᧐', '᧙'),
('ᨀ', '\u{1a1b}'),
('ᨠ', '\u{1a5e}'),
('\u{1a60}', '\u{1a7c}'),
('\u{1a7f}', '᪉'),
('᪐', '᪙'),
('ᪧ', 'ᪧ'),
('\u{1ab0}', '\u{1ace}'),
('\u{1b00}', 'ᭌ'),
('᭐', '᭙'),
('\u{1b6b}', '\u{1b73}'),
('\u{1b80}', '\u{1bf3}'),
('ᰀ', '\u{1c37}'),
('᱀', '᱉'),
('ᱍ', 'ᱽ'),
('ᲀ', 'ᲊ'),
('Ა', 'Ჺ'),
('Ჽ', 'Ჿ'),
('\u{1cd0}', '\u{1cd2}'),
('\u{1cd4}', 'ᳺ'),
('ᴀ', 'ἕ'),
('Ἐ', 'Ἕ'),
('ἠ', 'ὅ'),
('Ὀ', 'Ὅ'),
('ὐ', 'ὗ'),
('Ὑ', 'Ὑ'),
('Ὓ', 'Ὓ'),
('Ὕ', 'Ὕ'),
('Ὗ', 'ώ'),
('ᾀ', 'ᾴ'),
('ᾶ', 'ᾼ'),
('', ''),
('ῂ', 'ῄ'),
('ῆ', 'ῌ'),
('ῐ', 'ΐ'),
('ῖ', 'Ί'),
('ῠ', 'Ῥ'),
('ῲ', 'ῴ'),
('ῶ', 'ῼ'),
('\u{200c}', '\u{200d}'),
('‿', '⁀'),
('⁔', '⁔'),
('ⁱ', 'ⁱ'),
('ⁿ', 'ⁿ'),
('ₐ', 'ₜ'),
('\u{20d0}', '\u{20f0}'),
('', ''),
('ℇ', 'ℇ'),
('', ''),
('', ''),
('', ''),
('', ''),
('Ω', 'Ω'),
('', ''),
('', ''),
('', ''),
('ℼ', 'ℿ'),
('', ''),
('ⅎ', 'ⅎ'),
('', 'ↈ'),
('Ⓐ', 'ⓩ'),
('Ⰰ', 'ⳤ'),
('Ⳬ', 'ⳳ'),
('ⴀ', 'ⴥ'),
('ⴧ', 'ⴧ'),
('ⴭ', 'ⴭ'),
('ⴰ', 'ⵧ'),
('ⵯ', 'ⵯ'),
('\u{2d7f}', 'ⶖ'),
('ⶠ', 'ⶦ'),
('ⶨ', 'ⶮ'),
('ⶰ', 'ⶶ'),
('ⶸ', 'ⶾ'),
('ⷀ', 'ⷆ'),
('ⷈ', 'ⷎ'),
('ⷐ', 'ⷖ'),
('ⷘ', 'ⷞ'),
('\u{2de0}', '\u{2dff}'),
('ⸯ', 'ⸯ'),
('々', ''),
('〡', '\u{302f}'),
('〱', '〵'),
('〸', '〼'),
('ぁ', 'ゖ'),
('\u{3099}', '\u{309a}'),
('ゝ', 'ゟ'),
('ァ', 'ヺ'),
('ー', 'ヿ'),
('ㄅ', 'ㄯ'),
('ㄱ', 'ㆎ'),
('ㆠ', 'ㆿ'),
('ㇰ', 'ㇿ'),
('㐀', '䶿'),
('一', 'ꒌ'),
('', ''),
('ꔀ', 'ꘌ'),
('ꘐ', 'ꘫ'),
('Ꙁ', '\u{a672}'),
('\u{a674}', '\u{a67d}'),
('ꙿ', '\u{a6f1}'),
('ꜗ', 'ꜟ'),
('Ꜣ', 'ꞈ'),
('Ꞌ', 'ꟍ'),
('Ꟑ', 'ꟑ'),
('ꟓ', 'ꟓ'),
('ꟕ', 'Ƛ'),
('ꟲ', 'ꠧ'),
('\u{a82c}', '\u{a82c}'),
('ꡀ', 'ꡳ'),
('ꢀ', '\u{a8c5}'),
('꣐', '꣙'),
('\u{a8e0}', 'ꣷ'),
('ꣻ', 'ꣻ'),
('ꣽ', '\u{a92d}'),
('ꤰ', '\u{a953}'),
('ꥠ', 'ꥼ'),
('\u{a980}', '\u{a9c0}'),
('ꧏ', '꧙'),
('ꧠ', 'ꧾ'),
('ꨀ', '\u{aa36}'),
('ꩀ', 'ꩍ'),
('꩐', '꩙'),
('ꩠ', 'ꩶ'),
('ꩺ', 'ꫂ'),
('ꫛ', 'ꫝ'),
('ꫠ', 'ꫯ'),
('ꫲ', '\u{aaf6}'),
('ꬁ', 'ꬆ'),
('ꬉ', 'ꬎ'),
('ꬑ', 'ꬖ'),
('ꬠ', 'ꬦ'),
('ꬨ', 'ꬮ'),
('ꬰ', ''),
('ꭜ', 'ꭩ'),
('ꭰ', 'ꯪ'),
('꯬', '\u{abed}'),
('꯰', '꯹'),
('가', '힣'),
('ힰ', 'ퟆ'),
('ퟋ', 'ퟻ'),
('豈', '舘'),
('並', '龎'),
('ff', 'st'),
('ﬓ', 'ﬗ'),
('יִ', 'ﬨ'),
('שׁ', 'זּ'),
('טּ', 'לּ'),
('מּ', 'מּ'),
('נּ', 'סּ'),
('ףּ', 'פּ'),
('צּ', 'ﮱ'),
('ﯓ', 'ﴽ'),
('ﵐ', 'ﶏ'),
('ﶒ', 'ﷇ'),
('ﷰ', 'ﷻ'),
('\u{fe00}', '\u{fe0f}'),
('\u{fe20}', '\u{fe2f}'),
('︳', '︴'),
('', ''),
('ﹰ', 'ﹴ'),
('ﹶ', 'ﻼ'),
('', ''),
('', ''),
('_', '_'),
('', ''),
('ヲ', 'ᄒ'),
('ᅡ', 'ᅦ'),
('ᅧ', 'ᅬ'),
('ᅭ', 'ᅲ'),
('ᅳ', 'ᅵ'),
('𐀀', '𐀋'),
('𐀍', '𐀦'),
('𐀨', '𐀺'),
('𐀼', '𐀽'),
('𐀿', '𐁍'),
('𐁐', '𐁝'),
('𐂀', '𐃺'),
('𐅀', '𐅴'),
('\u{101fd}', '\u{101fd}'),
('𐊀', '𐊜'),
('𐊠', '𐋐'),
('\u{102e0}', '\u{102e0}'),
('𐌀', '𐌟'),
('𐌭', '𐍊'),
('𐍐', '\u{1037a}'),
('𐎀', '𐎝'),
('𐎠', '𐏃'),
('𐏈', '𐏏'),
('𐏑', '𐏕'),
('𐐀', '𐒝'),
('𐒠', '𐒩'),
('𐒰', '𐓓'),
('𐓘', '𐓻'),
('𐔀', '𐔧'),
('𐔰', '𐕣'),
('𐕰', '𐕺'),
('𐕼', '𐖊'),
('𐖌', '𐖒'),
('𐖔', '𐖕'),
('𐖗', '𐖡'),
('𐖣', '𐖱'),
('𐖳', '𐖹'),
('𐖻', '𐖼'),
('𐗀', '𐗳'),
('𐘀', '𐜶'),
('𐝀', '𐝕'),
('𐝠', '𐝧'),
('𐞀', '𐞅'),
('𐞇', '𐞰'),
('𐞲', '𐞺'),
('𐠀', '𐠅'),
('𐠈', '𐠈'),
('𐠊', '𐠵'),
('𐠷', '𐠸'),
('𐠼', '𐠼'),
('𐠿', '𐡕'),
('𐡠', '𐡶'),
('𐢀', '𐢞'),
('𐣠', '𐣲'),
('𐣴', '𐣵'),
('𐤀', '𐤕'),
('𐤠', '𐤹'),
('𐦀', '𐦷'),
('𐦾', '𐦿'),
('𐨀', '\u{10a03}'),
('\u{10a05}', '\u{10a06}'),
('\u{10a0c}', '𐨓'),
('𐨕', '𐨗'),
('𐨙', '𐨵'),
('\u{10a38}', '\u{10a3a}'),
('\u{10a3f}', '\u{10a3f}'),
('𐩠', '𐩼'),
('𐪀', '𐪜'),
('𐫀', '𐫇'),
('𐫉', '\u{10ae6}'),
('𐬀', '𐬵'),
('𐭀', '𐭕'),
('𐭠', '𐭲'),
('𐮀', '𐮑'),
('𐰀', '𐱈'),
('𐲀', '𐲲'),
('𐳀', '𐳲'),
('𐴀', '\u{10d27}'),
('𐴰', '𐴹'),
('𐵀', '𐵥'),
('\u{10d69}', '\u{10d6d}'),
('𐵯', '𐶅'),
('𐺀', '𐺩'),
('\u{10eab}', '\u{10eac}'),
('𐺰', '𐺱'),
('𐻂', '𐻄'),
('\u{10efc}', '𐼜'),
('𐼧', '𐼧'),
('𐼰', '\u{10f50}'),
('𐽰', '\u{10f85}'),
('𐾰', '𐿄'),
('𐿠', '𐿶'),
('𑀀', '\u{11046}'),
('𑁦', '𑁵'),
('\u{1107f}', '\u{110ba}'),
('\u{110c2}', '\u{110c2}'),
('𑃐', '𑃨'),
('𑃰', '𑃹'),
('\u{11100}', '\u{11134}'),
('𑄶', '𑄿'),
('𑅄', '𑅇'),
('𑅐', '\u{11173}'),
('𑅶', '𑅶'),
('\u{11180}', '𑇄'),
('\u{111c9}', '\u{111cc}'),
('𑇎', '𑇚'),
('𑇜', '𑇜'),
('𑈀', '𑈑'),
('𑈓', '\u{11237}'),
('\u{1123e}', '\u{11241}'),
('𑊀', '𑊆'),
('𑊈', '𑊈'),
('𑊊', '𑊍'),
('𑊏', '𑊝'),
('𑊟', '𑊨'),
('𑊰', '\u{112ea}'),
('𑋰', '𑋹'),
('\u{11300}', '𑌃'),
('𑌅', '𑌌'),
('𑌏', '𑌐'),
('𑌓', '𑌨'),
('𑌪', '𑌰'),
('𑌲', '𑌳'),
('𑌵', '𑌹'),
('\u{1133b}', '𑍄'),
('𑍇', '𑍈'),
('𑍋', '\u{1134d}'),
('𑍐', '𑍐'),
('\u{11357}', '\u{11357}'),
('𑍝', '𑍣'),
('\u{11366}', '\u{1136c}'),
('\u{11370}', '\u{11374}'),
('𑎀', '𑎉'),
('𑎋', '𑎋'),
('𑎎', '𑎎'),
('𑎐', '𑎵'),
('𑎷', '\u{113c0}'),
('\u{113c2}', '\u{113c2}'),
('\u{113c5}', '\u{113c5}'),
('\u{113c7}', '𑏊'),
('𑏌', '𑏓'),
('\u{113e1}', '\u{113e2}'),
('𑐀', '𑑊'),
('𑑐', '𑑙'),
('\u{1145e}', '𑑡'),
('𑒀', '𑓅'),
('𑓇', '𑓇'),
('𑓐', '𑓙'),
('𑖀', '\u{115b5}'),
('𑖸', '\u{115c0}'),
('𑗘', '\u{115dd}'),
('𑘀', '\u{11640}'),
('𑙄', '𑙄'),
('𑙐', '𑙙'),
('𑚀', '𑚸'),
('𑛀', '𑛉'),
('𑛐', '𑛣'),
('𑜀', '𑜚'),
('\u{1171d}', '\u{1172b}'),
('𑜰', '𑜹'),
('𑝀', '𑝆'),
('𑠀', '\u{1183a}'),
('𑢠', '𑣩'),
('𑣿', '𑤆'),
('𑤉', '𑤉'),
('𑤌', '𑤓'),
('𑤕', '𑤖'),
('𑤘', '𑤵'),
('𑤷', '𑤸'),
('\u{1193b}', '\u{11943}'),
('𑥐', '𑥙'),
('𑦠', '𑦧'),
('𑦪', '\u{119d7}'),
('\u{119da}', '𑧡'),
('𑧣', '𑧤'),
('𑨀', '\u{11a3e}'),
('\u{11a47}', '\u{11a47}'),
('𑩐', '\u{11a99}'),
('𑪝', '𑪝'),
('𑪰', '𑫸'),
('𑯀', '𑯠'),
('𑯰', '𑯹'),
('𑰀', '𑰈'),
('𑰊', '\u{11c36}'),
('\u{11c38}', '𑱀'),
('𑱐', '𑱙'),
('𑱲', '𑲏'),
('\u{11c92}', '\u{11ca7}'),
('𑲩', '\u{11cb6}'),
('𑴀', '𑴆'),
('𑴈', '𑴉'),
('𑴋', '\u{11d36}'),
('\u{11d3a}', '\u{11d3a}'),
('\u{11d3c}', '\u{11d3d}'),
('\u{11d3f}', '\u{11d47}'),
('𑵐', '𑵙'),
('𑵠', '𑵥'),
('𑵧', '𑵨'),
('𑵪', '𑶎'),
('\u{11d90}', '\u{11d91}'),
('𑶓', '𑶘'),
('𑶠', '𑶩'),
('𑻠', '𑻶'),
('\u{11f00}', '𑼐'),
('𑼒', '\u{11f3a}'),
('𑼾', '\u{11f42}'),
('𑽐', '\u{11f5a}'),
('𑾰', '𑾰'),
('𒀀', '𒎙'),
('𒐀', '𒑮'),
('𒒀', '𒕃'),
('𒾐', '𒿰'),
('𓀀', '𓐯'),
('\u{13440}', '\u{13455}'),
('𓑠', '𔏺'),
('𔐀', '𔙆'),
('𖄀', '𖄹'),
('𖠀', '𖨸'),
('𖩀', '𖩞'),
('𖩠', '𖩩'),
('𖩰', '𖪾'),
('𖫀', '𖫉'),
('𖫐', '𖫭'),
('\u{16af0}', '\u{16af4}'),
('𖬀', '\u{16b36}'),
('𖭀', '𖭃'),
('𖭐', '𖭙'),
('𖭣', '𖭷'),
('𖭽', '𖮏'),
('𖵀', '𖵬'),
('𖵰', '𖵹'),
('𖹀', '𖹿'),
('𖼀', '𖽊'),
('\u{16f4f}', '𖾇'),
('\u{16f8f}', '𖾟'),
('𖿠', '𖿡'),
('𖿣', '\u{16fe4}'),
('\u{16ff0}', '\u{16ff1}'),
('𗀀', '𘟷'),
('𘠀', '𘳕'),
('𘳿', '𘴈'),
('𚿰', '𚿳'),
('𚿵', '𚿻'),
('𚿽', '𚿾'),
('𛀀', '𛄢'),
('𛄲', '𛄲'),
('𛅐', '𛅒'),
('𛅕', '𛅕'),
('𛅤', '𛅧'),
('𛅰', '𛋻'),
('𛰀', '𛱪'),
('𛱰', '𛱼'),
('𛲀', '𛲈'),
('𛲐', '𛲙'),
('\u{1bc9d}', '\u{1bc9e}'),
('𜳰', '𜳹'),
('\u{1cf00}', '\u{1cf2d}'),
('\u{1cf30}', '\u{1cf46}'),
('\u{1d165}', '\u{1d169}'),
('\u{1d16d}', '\u{1d172}'),
('\u{1d17b}', '\u{1d182}'),
('\u{1d185}', '\u{1d18b}'),
('\u{1d1aa}', '\u{1d1ad}'),
('\u{1d242}', '\u{1d244}'),
('𝐀', '𝑔'),
('𝑖', '𝒜'),
('𝒞', '𝒟'),
('𝒢', '𝒢'),
('𝒥', '𝒦'),
('𝒩', '𝒬'),
('𝒮', '𝒹'),
('𝒻', '𝒻'),
('𝒽', '𝓃'),
('𝓅', '𝔅'),
('𝔇', '𝔊'),
('𝔍', '𝔔'),
('𝔖', '𝔜'),
('𝔞', '𝔹'),
('𝔻', '𝔾'),
('𝕀', '𝕄'),
('𝕆', '𝕆'),
('𝕊', '𝕐'),
('𝕒', '𝚥'),
('𝚨', '𝛀'),
('𝛂', '𝛚'),
('𝛜', '𝛺'),
('𝛼', '𝜔'),
('𝜖', '𝜴'),
('𝜶', '𝝎'),
('𝝐', '𝝮'),
('𝝰', '𝞈'),
('𝞊', '𝞨'),
('𝞪', '𝟂'),
('𝟄', '𝟋'),
('𝟎', '𝟿'),
('\u{1da00}', '\u{1da36}'),
('\u{1da3b}', '\u{1da6c}'),
('\u{1da75}', '\u{1da75}'),
('\u{1da84}', '\u{1da84}'),
('\u{1da9b}', '\u{1da9f}'),
('\u{1daa1}', '\u{1daaf}'),
('𝼀', '𝼞'),
('𝼥', '𝼪'),
('\u{1e000}', '\u{1e006}'),
('\u{1e008}', '\u{1e018}'),
('\u{1e01b}', '\u{1e021}'),
('\u{1e023}', '\u{1e024}'),
('\u{1e026}', '\u{1e02a}'),
('𞀰', '𞁭'),
('\u{1e08f}', '\u{1e08f}'),
('𞄀', '𞄬'),
('\u{1e130}', '𞄽'),
('𞅀', '𞅉'),
('𞅎', '𞅎'),
('𞊐', '\u{1e2ae}'),
('𞋀', '𞋹'),
('𞓐', '𞓹'),
('𞗐', '𞗺'),
('𞟠', '𞟦'),
('𞟨', '𞟫'),
('𞟭', '𞟮'),
('𞟰', '𞟾'),
('𞠀', '𞣄'),
('\u{1e8d0}', '\u{1e8d6}'),
('𞤀', '𞥋'),
('𞥐', '𞥙'),
('𞸀', '𞸃'),
('𞸅', '𞸟'),
('𞸡', '𞸢'),
('𞸤', '𞸤'),
('𞸧', '𞸧'),
('𞸩', '𞸲'),
('𞸴', '𞸷'),
('𞸹', '𞸹'),
('𞸻', '𞸻'),
('𞹂', '𞹂'),
('𞹇', '𞹇'),
('𞹉', '𞹉'),
('𞹋', '𞹋'),
('𞹍', '𞹏'),
('𞹑', '𞹒'),
('𞹔', '𞹔'),
('𞹗', '𞹗'),
('𞹙', '𞹙'),
('𞹛', '𞹛'),
('𞹝', '𞹝'),
('𞹟', '𞹟'),
('𞹡', '𞹢'),
('𞹤', '𞹤'),
('𞹧', '𞹪'),
('𞹬', '𞹲'),
('𞹴', '𞹷'),
('𞹹', '𞹼'),
('𞹾', '𞹾'),
('𞺀', '𞺉'),
('𞺋', '𞺛'),
('𞺡', '𞺣'),
('𞺥', '𞺩'),
('𞺫', '𞺻'),
('🄰', '🅉'),
('🅐', '🅩'),
('🅰', '🆉'),
('🯰', '🯹'),
('𠀀', '𪛟'),
('𪜀', '𫜹'),
('𫝀', '𫠝'),
('𫠠', '𬺡'),
('𬺰', '𮯠'),
('𮯰', '𮹝'),
('丽', '𪘀'),
('𰀀', '𱍊'),
('𱍐', '𲎯'),
('\u{e0100}', '\u{e01ef}'),
];

191
vendor/regex-automata/src/util/utf8.rs vendored Normal file
View File

@@ -0,0 +1,191 @@
/*!
Utilities for dealing with UTF-8.
This module provides some UTF-8 related helper routines, including an
incremental decoder.
*/
/// Returns true if and only if the given byte is considered a word character.
/// This only applies to ASCII.
///
/// This was copied from regex-syntax so that we can use it to determine the
/// starting DFA state while searching without depending on regex-syntax. The
/// definition is never going to change, so there's no maintenance/bit-rot
/// hazard here.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_word_byte(b: u8) -> bool {
const fn mkwordset() -> [bool; 256] {
// FIXME: Use as_usize() once const functions in traits are stable.
let mut set = [false; 256];
set[b'_' as usize] = true;
let mut byte = b'0';
while byte <= b'9' {
set[byte as usize] = true;
byte += 1;
}
byte = b'A';
while byte <= b'Z' {
set[byte as usize] = true;
byte += 1;
}
byte = b'a';
while byte <= b'z' {
set[byte as usize] = true;
byte += 1;
}
set
}
const WORD: [bool; 256] = mkwordset();
WORD[b as usize]
}
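// Illustrative sketch (an addition for exposition, not part of the upstream
// regex-automata source): what `is_word_byte` classifies as a word byte. The
// set is exactly the ASCII class `[0-9A-Za-z_]`; every other byte, including
// all non-ASCII bytes, is rejected.
#[cfg(test)]
mod is_word_byte_sketch {
    use super::*;

    #[test]
    fn ascii_word_bytes() {
        assert!(is_word_byte(b'a'));
        assert!(is_word_byte(b'Z'));
        assert!(is_word_byte(b'7'));
        assert!(is_word_byte(b'_'));
        assert!(!is_word_byte(b'-'));
        assert!(!is_word_byte(b' '));
        // Non-ASCII bytes are never word bytes under this definition.
        assert!(!is_word_byte(0xE2));
    }
}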
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the given
/// byte slice, then the first byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
///
/// This never panics.
///
/// *WARNING*: This is not designed for performance. If you're looking for a
/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
/// crate, then please file an issue and discuss your use case.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
if bytes.is_empty() {
return None;
}
let len = match len(bytes[0]) {
None => return Some(Err(bytes[0])),
Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
Some(1) => return Some(Ok(char::from(bytes[0]))),
Some(len) => len,
};
match core::str::from_utf8(&bytes[..len]) {
Ok(s) => Some(Ok(s.chars().next().unwrap())),
Err(_) => Some(Err(bytes[0])),
}
}
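// Illustrative sketch (an addition for exposition, not part of the upstream
// regex-automata source): the three possible shapes of `decode`'s return
// value. An empty slice yields `None`, a valid prefix yields `Some(Ok(ch))`,
// and an invalid or truncated prefix yields `Some(Err(byte))` carrying the
// first byte.
#[cfg(test)]
mod decode_sketch {
    use super::*;

    #[test]
    fn decode_prefix() {
        assert_eq!(decode(b""), None);
        assert_eq!(decode(b"abc"), Some(Ok('a')));
        // '☃' is encoded as the three bytes E2 98 83.
        assert_eq!(decode("☃!".as_bytes()), Some(Ok('☃')));
        // A lone continuation byte is not a valid leading byte.
        assert_eq!(decode(&[0x98, 0x83]), Some(Err(0x98)));
        // A truncated multi-byte sequence also reports the first byte.
        assert_eq!(decode(&[0xE2, 0x98]), Some(Err(0xE2)));
    }
}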
/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the end of the given byte
/// slice, then the last byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> {
if bytes.is_empty() {
return None;
}
let mut start = bytes.len() - 1;
let limit = bytes.len().saturating_sub(4);
while start > limit && !is_leading_or_invalid_byte(bytes[start]) {
start -= 1;
}
match decode(&bytes[start..]) {
None => None,
Some(Ok(ch)) => Some(Ok(ch)),
Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
}
}
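// Illustrative sketch (an addition for exposition, not part of the upstream
// regex-automata source): `decode_last` scans backwards for a leading byte
// (at most 3 bytes back) and decodes from there. On failure it reports the
// *last* byte of the slice rather than the first.
#[cfg(test)]
mod decode_last_sketch {
    use super::*;

    #[test]
    fn decode_suffix() {
        assert_eq!(decode_last(b""), None);
        assert_eq!(decode_last(b"abc"), Some(Ok('c')));
        assert_eq!(decode_last("!☃".as_bytes()), Some(Ok('☃')));
        // A dangling leading byte at the end cannot be decoded, so the last
        // byte of the slice is reported as-is.
        assert_eq!(decode_last(&[b'a', 0xE2]), Some(Err(0xE2)));
    }
}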
/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn len(byte: u8) -> Option<usize> {
match byte {
0b0000_0000..=0b0111_1111 => Some(1),
0b1000_0000..=0b1011_1111 => None,
0b1100_0000..=0b1101_1111 => Some(2),
0b1110_0000..=0b1110_1111 => Some(3),
0b1111_0000..=0b1111_0111 => Some(4),
_ => None,
}
}
/// Returns true if and only if the given offset in the given bytes falls on a
/// valid UTF-8 encoded codepoint boundary.
///
/// If `bytes` is not valid UTF-8, then the behavior of this routine is
/// unspecified.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
match bytes.get(i) {
// The position at the end of the bytes always represents an empty
// string, which is a valid boundary. But anything after that doesn't
// make much sense to call a valid boundary.
None => i == bytes.len(),
// Other than ASCII (where the most significant bit is never set),
// valid starting bytes always have their most significant two bits
// set, whereas continuation bytes never have their second most
// significant bit set. Therefore, this only returns true when bytes[i]
// corresponds to a byte that begins a valid UTF-8 encoding of a
// Unicode scalar value.
Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000,
}
}
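// Illustrative sketch (an addition for exposition, not part of the upstream
// regex-automata source): boundary positions for a mixed ASCII/multi-byte
// string. '☃' occupies bytes 1..4, so offsets inside it are not boundaries,
// while the offset at the end of the slice is.
#[cfg(test)]
mod is_boundary_sketch {
    use super::*;

    #[test]
    fn snowman_boundaries() {
        let bytes = "a☃b".as_bytes(); // 61 E2 98 83 62
        assert!(is_boundary(bytes, 0));
        assert!(is_boundary(bytes, 1));
        assert!(!is_boundary(bytes, 2));
        assert!(!is_boundary(bytes, 3));
        assert!(is_boundary(bytes, 4));
        assert!(is_boundary(bytes, 5)); // end of slice
        assert!(!is_boundary(bytes, 6)); // past the end
    }
}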
/// Returns true if and only if the given byte is either a valid leading UTF-8
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
/// valid UTF-8 sequence.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_leading_or_invalid_byte(b: u8) -> bool {
// In the ASCII case, the most significant bit is never set. The leading
// byte of a 2/3/4-byte sequence always has the top two most significant
// bits set. For bytes that can never appear anywhere in valid UTF-8, this
// also returns true, since every such byte has its two most significant
// bits set:
//
// \xC0 :: 11000000
// \xC1 :: 11000001
// \xF5 :: 11110101
// \xF6 :: 11110110
// \xF7 :: 11110111
// \xF8 :: 11111000
// \xF9 :: 11111001
// \xFA :: 11111010
// \xFB :: 11111011
// \xFC :: 11111100
// \xFD :: 11111101
// \xFE :: 11111110
// \xFF :: 11111111
(b & 0b1100_0000) != 0b1000_0000
}
/*
/// Returns the smallest possible index of the next valid UTF-8 sequence
/// starting after `i`.
///
/// For all inputs, including invalid UTF-8 and any value of `i`, the return
/// value is guaranteed to be greater than `i`. (If there is no value greater
/// than `i` that fits in `usize`, then this panics.)
///
/// Generally speaking, this should only be called on `text` when it is
/// permitted to assume that it is valid UTF-8 and where either `i >=
/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
///
/// NOTE: This method was used in a previous conception of iterators where we
/// specifically tried to skip over empty matches that split a codepoint by
/// simply requiring that our next search begin at the beginning of codepoint.
/// But we ended up changing that technique to always advance by 1 byte and
/// then filter out matches that split a codepoint after-the-fact. Thus, we no
/// longer use this method. But I've kept it around in case we want to switch
/// back to this approach. Its guarantees are a little subtle, so I'd prefer
/// not to rebuild it from whole cloth.
pub(crate) fn next(text: &[u8], i: usize) -> usize {
let b = match text.get(i) {
None => return i.checked_add(1).unwrap(),
Some(&b) => b,
};
// For cases where we see an invalid UTF-8 byte, there isn't much we can do
// other than just start at the next byte.
let inc = len(b).unwrap_or(1);
i.checked_add(inc).unwrap()
}
*/

947
vendor/regex-automata/src/util/wire.rs vendored Normal file

@@ -0,0 +1,947 @@
/*!
Types and routines that support the wire format of finite automata.
Currently, this module just exports a few error types and some small helpers
for deserializing [dense DFAs](crate::dfa::dense::DFA) using correct alignment.
*/
/*
A collection of helper functions, types and traits for serializing automata.
This crate defines its own bespoke serialization mechanism for some structures
provided in the public API, namely, DFAs. A bespoke mechanism was developed
primarily because structures like automata demand a specific binary format.
Attempting to encode their rich structure in an existing serialization
format is just not feasible. Moreover, the format for each structure is
generally designed such that deserialization is cheap. More specifically, that
deserialization can be done in constant time. (The idea being that you can
embed it into your binary or mmap it, and then use it immediately.)
In order to achieve this, the dense and sparse DFAs in this crate use an
in-memory representation that very closely corresponds to its binary serialized
form. This pervades and complicates everything, and in some cases, requires
dealing with alignment and reasoning about safety.
This technique does have major advantages. In particular, it permits doing
the potentially costly work of compiling a finite state machine in an offline
manner, and then loading it at runtime not only without having to re-compile
the regex, but even without the code required to do the compilation. This, for
example, permits one to use a pre-compiled DFA not only in environments without
Rust's standard library, but also in environments without a heap.
In the code below, whenever we insert some kind of padding, it's to enforce a
4-byte alignment, unless otherwise noted. Namely, u32 is the only state ID type
supported. (In a previous version of this library, DFAs were generic over the
state ID representation.)
Also, serialization generally requires the caller to specify endianness,
whereas deserialization always assumes native endianness (otherwise cheap
deserialization would be impossible). This implies that serializing a structure
generally requires serializing both its big-endian and little-endian variants,
and then loading the correct one based on the target's endianness.
*/
use core::{cmp, mem::size_of};
#[cfg(feature = "alloc")]
use alloc::{vec, vec::Vec};
use crate::util::{
int::Pointer,
primitives::{PatternID, PatternIDError, StateID, StateIDError},
};
/// A hack to align a smaller type `B` with a bigger type `T`.
///
/// The usual use of this is with `B = [u8]` and `T = u32`. That is,
/// it permits aligning a sequence of bytes on a 4-byte boundary. This
/// is useful in contexts where one wants to embed a serialized [dense
/// DFA](crate::dfa::dense::DFA) into a Rust program while guaranteeing the
/// alignment required for the DFA.
///
/// See [`dense::DFA::from_bytes`](crate::dfa::dense::DFA::from_bytes) for an
/// example of how to use this type.
#[repr(C)]
#[derive(Debug)]
pub struct AlignAs<B: ?Sized, T> {
/// A zero-sized field indicating the alignment we want.
pub _align: [T; 0],
/// A possibly non-sized field containing a sequence of bytes.
pub bytes: B,
}
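// Illustrative sketch (an addition for exposition, not part of the upstream
// regex-automata source): `AlignAs` forces a byte buffer to the alignment of
// `T` (here, `u32`), which is what a dense DFA requires before
// deserialization. In real use the bytes would typically come from
// `include_bytes!` on a pre-serialized DFA (see dense::DFA::from_bytes); here
// a stand-in buffer just checks the alignment guarantee.
#[cfg(test)]
mod align_as_sketch {
    use super::*;

    #[test]
    fn bytes_are_u32_aligned() {
        static BUF: AlignAs<[u8; 8], u32> =
            AlignAs { _align: [], bytes: [0; 8] };
        let addr = BUF.bytes.as_ptr() as usize;
        assert_eq!(0, addr % core::mem::align_of::<u32>());
    }
}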
/// An error that occurs when serializing an object from this crate.
///
/// Serialization, as used in this crate, universally refers to the process
/// of transforming a structure (like a DFA) into a custom binary format
/// represented by `&[u8]`. To this end, serialization is generally infallible.
/// However, it can fail when caller-provided buffer sizes are too small. When
/// that occurs, a serialization error is reported.
///
/// A `SerializeError` provides no introspection capabilities. Its only
/// supported operation is conversion to a human readable error message.
///
/// This error type implements the `std::error::Error` trait only when the
/// `std` feature is enabled. Otherwise, this type is defined in all
/// configurations.
#[derive(Debug)]
pub struct SerializeError {
/// The name of the thing that a buffer is too small for.
///
/// Currently, the only kind of serialization error is one that is
/// committed by a caller: providing a destination buffer that is too
/// small to fit the serialized object. This makes sense conceptually,
/// since every valid inhabitant of a type should be serializable.
///
/// This is somewhat exposed in the public API of this crate. For example,
/// the `to_bytes_{big,little}_endian` APIs return a `Vec<u8>` and are
/// guaranteed to never panic or error. This is only possible because the
/// implementation guarantees that it will allocate a `Vec<u8>` that is
/// big enough.
///
/// In summary, if a new serialization error kind needs to be added, then
/// it will need careful consideration.
what: &'static str,
}
impl SerializeError {
pub(crate) fn buffer_too_small(what: &'static str) -> SerializeError {
SerializeError { what }
}
}
impl core::fmt::Display for SerializeError {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(f, "destination buffer is too small to write {}", self.what)
}
}
#[cfg(feature = "std")]
impl std::error::Error for SerializeError {}
/// An error that occurs when deserializing an object defined in this crate.
///
/// Serialization, as used in this crate, universally refers to the process
/// of transforming a structure (like a DFA) into a custom binary format
/// represented by `&[u8]`. Deserialization, then, refers to the process of
/// cheaply converting this binary format back to the object's in-memory
/// representation as defined in this crate. To the extent possible,
/// deserialization will report this error whenever this process fails.
///
/// A `DeserializeError` provides no introspection capabilities. Its only
/// supported operation is conversion to a human readable error message.
///
/// This error type implements the `std::error::Error` trait only when the
/// `std` feature is enabled. Otherwise, this type is defined in all
/// configurations.
#[derive(Debug)]
pub struct DeserializeError(DeserializeErrorKind);
#[derive(Debug)]
enum DeserializeErrorKind {
Generic { msg: &'static str },
BufferTooSmall { what: &'static str },
InvalidUsize { what: &'static str },
VersionMismatch { expected: u32, found: u32 },
EndianMismatch { expected: u32, found: u32 },
AlignmentMismatch { alignment: usize, address: usize },
LabelMismatch { expected: &'static str },
ArithmeticOverflow { what: &'static str },
PatternID { err: PatternIDError, what: &'static str },
StateID { err: StateIDError, what: &'static str },
}
impl DeserializeError {
pub(crate) fn generic(msg: &'static str) -> DeserializeError {
DeserializeError(DeserializeErrorKind::Generic { msg })
}
pub(crate) fn buffer_too_small(what: &'static str) -> DeserializeError {
DeserializeError(DeserializeErrorKind::BufferTooSmall { what })
}
fn invalid_usize(what: &'static str) -> DeserializeError {
DeserializeError(DeserializeErrorKind::InvalidUsize { what })
}
fn version_mismatch(expected: u32, found: u32) -> DeserializeError {
DeserializeError(DeserializeErrorKind::VersionMismatch {
expected,
found,
})
}
fn endian_mismatch(expected: u32, found: u32) -> DeserializeError {
DeserializeError(DeserializeErrorKind::EndianMismatch {
expected,
found,
})
}
fn alignment_mismatch(
alignment: usize,
address: usize,
) -> DeserializeError {
DeserializeError(DeserializeErrorKind::AlignmentMismatch {
alignment,
address,
})
}
fn label_mismatch(expected: &'static str) -> DeserializeError {
DeserializeError(DeserializeErrorKind::LabelMismatch { expected })
}
fn arithmetic_overflow(what: &'static str) -> DeserializeError {
DeserializeError(DeserializeErrorKind::ArithmeticOverflow { what })
}
fn pattern_id_error(
err: PatternIDError,
what: &'static str,
) -> DeserializeError {
DeserializeError(DeserializeErrorKind::PatternID { err, what })
}
pub(crate) fn state_id_error(
err: StateIDError,
what: &'static str,
) -> DeserializeError {
DeserializeError(DeserializeErrorKind::StateID { err, what })
}
}
#[cfg(feature = "std")]
impl std::error::Error for DeserializeError {}
impl core::fmt::Display for DeserializeError {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
use self::DeserializeErrorKind::*;
match self.0 {
Generic { msg } => write!(f, "{msg}"),
BufferTooSmall { what } => {
write!(f, "buffer is too small to read {what}")
}
InvalidUsize { what } => {
write!(f, "{what} is too big to fit in a usize")
}
VersionMismatch { expected, found } => write!(
f,
"unsupported version: \
expected version {expected} but found version {found}",
),
EndianMismatch { expected, found } => write!(
f,
"endianness mismatch: expected 0x{expected:X} but \
got 0x{found:X}. (Are you trying to load an object \
serialized with a different endianness?)",
),
AlignmentMismatch { alignment, address } => write!(
f,
"alignment mismatch: slice starts at address 0x{address:X}, \
which is not aligned to a {alignment} byte boundary",
),
LabelMismatch { expected } => write!(
f,
"label mismatch: start of serialized object should \
contain a NUL terminated {expected:?} label, but a different \
label was found",
),
ArithmeticOverflow { what } => {
write!(f, "arithmetic overflow for {what}")
}
PatternID { ref err, what } => {
write!(f, "failed to read pattern ID for {what}: {err}")
}
StateID { ref err, what } => {
write!(f, "failed to read state ID for {what}: {err}")
}
}
}
}
/// Safely converts a `&[u32]` to `&[StateID]` with zero cost.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn u32s_to_state_ids(slice: &[u32]) -> &[StateID] {
// SAFETY: This is safe because StateID is defined to have the same memory
// representation as a u32 (it is repr(transparent)). While not every u32
// is a "valid" StateID, callers are not permitted to rely on the validity
// of StateIDs for memory safety. It can only lead to logical errors. (This
// is why StateID::new_unchecked is safe.)
unsafe {
core::slice::from_raw_parts(
slice.as_ptr().cast::<StateID>(),
slice.len(),
)
}
}
/// Safely converts a `&mut [u32]` to `&mut [StateID]` with zero cost.
pub(crate) fn u32s_to_state_ids_mut(slice: &mut [u32]) -> &mut [StateID] {
// SAFETY: This is safe because StateID is defined to have the same memory
// representation as a u32 (it is repr(transparent)). While not every u32
// is a "valid" StateID, callers are not permitted to rely on the validity
// of StateIDs for memory safety. It can only lead to logical errors. (This
// is why StateID::new_unchecked is safe.)
unsafe {
core::slice::from_raw_parts_mut(
slice.as_mut_ptr().cast::<StateID>(),
slice.len(),
)
}
}
/// Safely converts a `&[u32]` to `&[PatternID]` with zero cost.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn u32s_to_pattern_ids(slice: &[u32]) -> &[PatternID] {
// SAFETY: This is safe because PatternID is defined to have the same
// memory representation as a u32 (it is repr(transparent)). While not
// every u32 is a "valid" PatternID, callers are not permitted to rely
// on the validity of PatternIDs for memory safety. It can only lead to
// logical errors. (This is why PatternID::new_unchecked is safe.)
unsafe {
core::slice::from_raw_parts(
slice.as_ptr().cast::<PatternID>(),
slice.len(),
)
}
}
/// Checks that the given slice has an alignment that matches `T`.
///
/// This is useful for checking that a slice has an appropriate alignment
/// before casting it to a &[T]. Note though that alignment is not itself
/// sufficient to perform the cast for any `T`.
pub(crate) fn check_alignment<T>(
slice: &[u8],
) -> Result<(), DeserializeError> {
let alignment = core::mem::align_of::<T>();
let address = slice.as_ptr().as_usize();
if address % alignment == 0 {
return Ok(());
}
Err(DeserializeError::alignment_mismatch(alignment, address))
}
/// Reads a possibly empty amount of padding, up to 7 bytes, from the beginning
/// of the given slice. All padding bytes must be NUL bytes.
///
/// This is useful because it can be theoretically necessary to pad the
/// beginning of a serialized object with NUL bytes to ensure that it starts
/// at a correctly aligned address. These padding bytes should come immediately
/// before the label.
///
/// This returns the number of bytes read from the given slice.
pub(crate) fn skip_initial_padding(slice: &[u8]) -> usize {
let mut nread = 0;
while nread < 7 && nread < slice.len() && slice[nread] == 0 {
nread += 1;
}
nread
}
/// Allocate a byte buffer of the given size, along with some initial padding
/// such that `buf[padding..]` has the same alignment as `T`, where the
/// alignment of `T` must be at most `8`. In particular, callers should treat
/// the first N bytes (second return value) as padding bytes that must not be
/// overwritten. In all cases, the following identity holds:
///
/// ```ignore
/// let (buf, padding) = alloc_aligned_buffer::<StateID>(SIZE);
/// assert_eq!(SIZE, buf[padding..].len());
/// ```
///
/// In practice, padding is often zero.
///
/// The requirement for `8` as a maximum here is somewhat arbitrary. In
/// practice, we never need anything bigger in this crate, and so this function
/// does some sanity asserts under the assumption of a max alignment of `8`.
#[cfg(feature = "alloc")]
pub(crate) fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) {
// NOTE: This is a kludge because there's no easy way to allocate a Vec<u8>
// with an alignment guaranteed to be greater than 1. We could create a
// Vec<u32>, but this cannot be safely transmuted to a Vec<u8> without
// concern, since reallocing or dropping the Vec<u8> is UB (different
// alignment than the initial allocation). We could define a wrapper type
// to manage this for us, but it seems like more machinery than it's worth.
let buf = vec![0; size];
let align = core::mem::align_of::<T>();
let address = buf.as_ptr().as_usize();
if address % align == 0 {
return (buf, 0);
}
// Let's try this again. We have to create a totally new alloc with
// the maximum amount of bytes we might need. We can't just extend our
// pre-existing 'buf' because that might create a new alloc with a
// different alignment.
let extra = align - 1;
let mut buf = vec![0; size + extra];
let address = buf.as_ptr().as_usize();
// The code below handles the case where 'address' is aligned to T, so if
// we got lucky and 'address' is now aligned to T (when it previously
// wasn't), then we're done.
if address % align == 0 {
buf.truncate(size);
return (buf, 0);
}
let padding = ((address & !(align - 1)).checked_add(align).unwrap())
.checked_sub(address)
.unwrap();
assert!(padding <= 7, "padding of {padding} is bigger than 7");
assert!(
padding <= extra,
"padding of {padding} is bigger than extra {extra} bytes",
);
buf.truncate(size + padding);
assert_eq!(size + padding, buf.len());
assert_eq!(
0,
buf[padding..].as_ptr().as_usize() % align,
"expected end of initial padding to be aligned to {align}",
);
(buf, padding)
}
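// Illustrative sketch (an addition for exposition, not part of the upstream
// regex-automata source): regardless of where the allocator happens to place
// the buffer, `buf[padding..]` is aligned for `T` and has exactly the
// requested size, per the identity documented above.
#[cfg(all(test, feature = "alloc"))]
mod alloc_aligned_sketch {
    use super::*;

    #[test]
    fn aligned_for_u32() {
        let (buf, padding) = alloc_aligned_buffer::<u32>(64);
        assert_eq!(64, buf[padding..].len());
        assert_eq!(0, buf[padding..].as_ptr() as usize % 4);
    }
}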
/// Reads a NUL terminated label starting at the beginning of the given slice.
///
/// If a NUL terminated label could not be found, then an error is returned.
/// Similarly, if a label is found but doesn't match the expected label, then
/// an error is returned.
///
/// Upon success, the total number of bytes read (including padding bytes) is
/// returned.
pub(crate) fn read_label(
slice: &[u8],
expected_label: &'static str,
) -> Result<usize, DeserializeError> {
// Set an upper bound on how many bytes we scan for a NUL. Since no label
// in this crate is longer than 256 bytes, if we can't find one within that
// range, then we have corrupted data.
let first_nul =
slice[..cmp::min(slice.len(), 256)].iter().position(|&b| b == 0);
let first_nul = match first_nul {
Some(first_nul) => first_nul,
None => {
return Err(DeserializeError::generic(
"could not find NUL terminated label \
at start of serialized object",
));
}
};
let len = first_nul + padding_len(first_nul);
if slice.len() < len {
return Err(DeserializeError::generic(
"could not find properly sized label at start of serialized object"
));
}
if expected_label.as_bytes() != &slice[..first_nul] {
return Err(DeserializeError::label_mismatch(expected_label));
}
Ok(len)
}
/// Writes the given label to the buffer as a NUL terminated string. The label
/// given must not contain NUL, otherwise this will panic. Similarly, the label
/// must not be longer than 255 bytes, otherwise this will panic.
///
/// Additional NUL bytes are written as necessary to ensure that the number of
/// bytes written is always a multiple of 4.
///
/// Upon success, the total number of bytes written (including padding) is
/// returned.
pub(crate) fn write_label(
label: &str,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
let nwrite = write_label_len(label);
if dst.len() < nwrite {
return Err(SerializeError::buffer_too_small("label"));
}
dst[..label.len()].copy_from_slice(label.as_bytes());
for i in 0..(nwrite - label.len()) {
dst[label.len() + i] = 0;
}
assert_eq!(nwrite % 4, 0);
Ok(nwrite)
}
/// Returns the total number of bytes (including padding) that would be written
/// for the given label. This panics if the given label contains a NUL byte or
/// is longer than 255 bytes. (The size restriction exists so that searching
/// for a label during deserialization can be done in small bounded space.)
pub(crate) fn write_label_len(label: &str) -> usize {
assert!(label.len() <= 255, "label must not be longer than 255 bytes");
assert!(label.bytes().all(|b| b != 0), "label must not contain NUL bytes");
let label_len = label.len() + 1; // +1 for the NUL terminator
label_len + padding_len(label_len)
}
/// Reads the endianness check from the beginning of the given slice and
/// confirms that the endianness of the serialized object matches the expected
/// endianness. If the slice is too small or if the endianness check fails,
/// this returns an error.
///
/// Upon success, the total number of bytes read is returned.
pub(crate) fn read_endianness_check(
slice: &[u8],
) -> Result<usize, DeserializeError> {
let (n, nr) = try_read_u32(slice, "endianness check")?;
assert_eq!(nr, write_endianness_check_len());
if n != 0xFEFF {
return Err(DeserializeError::endian_mismatch(0xFEFF, n));
}
Ok(nr)
}
/// Writes 0xFEFF as an integer using the given endianness.
///
/// This is useful for writing into the header of a serialized object. It can
/// be read during deserialization as a sanity check to ensure the proper
/// endianness is used.
///
/// Upon success, the total number of bytes written is returned.
pub(crate) fn write_endianness_check<E: Endian>(
dst: &mut [u8],
) -> Result<usize, SerializeError> {
let nwrite = write_endianness_check_len();
if dst.len() < nwrite {
return Err(SerializeError::buffer_too_small("endianness check"));
}
E::write_u32(0xFEFF, dst);
Ok(nwrite)
}
/// Returns the number of bytes written by the endianness check.
pub(crate) fn write_endianness_check_len() -> usize {
size_of::<u32>()
}
/// Reads a version number from the beginning of the given slice and confirms
/// that it matches the expected version number given. If the slice is too
/// small or if the version numbers aren't equivalent, this returns an error.
///
/// Upon success, the total number of bytes read is returned.
///
/// N.B. Currently, we require that the version number is exactly equivalent.
/// In the future, if we bump the version number without a semver bump, then
/// we'll need to relax this a bit and support older versions.
pub(crate) fn read_version(
slice: &[u8],
expected_version: u32,
) -> Result<usize, DeserializeError> {
let (n, nr) = try_read_u32(slice, "version")?;
assert_eq!(nr, write_version_len());
if n != expected_version {
return Err(DeserializeError::version_mismatch(expected_version, n));
}
Ok(nr)
}
/// Writes the given version number to the beginning of the given slice.
///
/// This is useful for writing into the header of a serialized object. It can
/// be read during deserialization as a sanity check to ensure that the library
/// code supports the format of the serialized object.
///
/// Upon success, the total number of bytes written is returned.
pub(crate) fn write_version<E: Endian>(
version: u32,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
let nwrite = write_version_len();
if dst.len() < nwrite {
return Err(SerializeError::buffer_too_small("version number"));
}
E::write_u32(version, dst);
Ok(nwrite)
}
/// Returns the number of bytes written by writing the version number.
pub(crate) fn write_version_len() -> usize {
size_of::<u32>()
}
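// Illustrative sketch (an addition for exposition, not part of the upstream
// regex-automata source): the header layout these helpers cooperate on. A
// serialized object starts with a NUL terminated label, an endianness check
// word and a version word, each padded to a multiple of 4 bytes, and
// deserialization re-reads them in the same order. The label "sketch" and
// version 2 are arbitrary values for this example.
#[cfg(test)]
mod header_sketch {
    use super::*;

    #[test]
    fn header_round_trip() {
        let mut buf = [0u8; 64];
        let mut i = 0;
        i += write_label("sketch", &mut buf[i..]).unwrap();
        i += write_endianness_check::<NE>(&mut buf[i..]).unwrap();
        i += write_version::<NE>(2, &mut buf[i..]).unwrap();

        let mut j = 0;
        j += read_label(&buf[j..], "sketch").unwrap();
        j += read_endianness_check(&buf[j..]).unwrap();
        j += read_version(&buf[j..], 2).unwrap();
        assert_eq!(i, j);
    }
}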
/// Reads a pattern ID from the given slice. If the slice has insufficient
/// length, then this panics. If the deserialized integer exceeds the pattern
/// ID limit for the current target, then this returns an error.
///
/// Upon success, this also returns the number of bytes read.
pub(crate) fn read_pattern_id(
slice: &[u8],
what: &'static str,
) -> Result<(PatternID, usize), DeserializeError> {
let bytes: [u8; PatternID::SIZE] =
slice[..PatternID::SIZE].try_into().unwrap();
let pid = PatternID::from_ne_bytes(bytes)
.map_err(|err| DeserializeError::pattern_id_error(err, what))?;
Ok((pid, PatternID::SIZE))
}
/// Reads a pattern ID from the given slice. If the slice has insufficient
/// length, then this panics. Otherwise, the deserialized integer is assumed
/// to be a valid pattern ID.
///
/// This also returns the number of bytes read.
pub(crate) fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) {
let pid = PatternID::from_ne_bytes_unchecked(
slice[..PatternID::SIZE].try_into().unwrap(),
);
(pid, PatternID::SIZE)
}
/// Write the given pattern ID to the beginning of the given slice of bytes
/// using the specified endianness. The given slice must have length at least
/// `PatternID::SIZE`, or else this panics. Upon success, the total number of
/// bytes written is returned.
pub(crate) fn write_pattern_id<E: Endian>(
pid: PatternID,
dst: &mut [u8],
) -> usize {
E::write_u32(pid.as_u32(), dst);
PatternID::SIZE
}
/// Attempts to read a state ID from the given slice. If the slice has an
/// insufficient number of bytes or if the state ID exceeds the limit for
/// the current target, then this returns an error.
///
/// Upon success, this also returns the number of bytes read.
pub(crate) fn try_read_state_id(
slice: &[u8],
what: &'static str,
) -> Result<(StateID, usize), DeserializeError> {
if slice.len() < StateID::SIZE {
return Err(DeserializeError::buffer_too_small(what));
}
read_state_id(slice, what)
}
/// Reads a state ID from the given slice. If the slice has insufficient
/// length, then this panics. If the deserialized integer exceeds the state ID
/// limit for the current target, then this returns an error.
///
/// Upon success, this also returns the number of bytes read.
pub(crate) fn read_state_id(
slice: &[u8],
what: &'static str,
) -> Result<(StateID, usize), DeserializeError> {
let bytes: [u8; StateID::SIZE] =
slice[..StateID::SIZE].try_into().unwrap();
let sid = StateID::from_ne_bytes(bytes)
.map_err(|err| DeserializeError::state_id_error(err, what))?;
Ok((sid, StateID::SIZE))
}
/// Reads a state ID from the given slice. If the slice has insufficient
/// length, then this panics. Otherwise, the deserialized integer is assumed
/// to be a valid state ID.
///
/// This also returns the number of bytes read.
pub(crate) fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) {
let sid = StateID::from_ne_bytes_unchecked(
slice[..StateID::SIZE].try_into().unwrap(),
);
(sid, StateID::SIZE)
}
/// Write the given state ID to the beginning of the given slice of bytes
/// using the specified endianness. The given slice must have length at least
/// `StateID::SIZE`, or else this panics. Upon success, the total number of
/// bytes written is returned.
pub(crate) fn write_state_id<E: Endian>(
sid: StateID,
dst: &mut [u8],
) -> usize {
E::write_u32(sid.as_u32(), dst);
StateID::SIZE
}
/// Try to read a u16 as a usize from the beginning of the given slice in
/// native endian format. If the slice has fewer than 2 bytes or if the
/// deserialized number cannot be represented by usize, then this returns an
/// error. The error message will include the `what` description of what is
/// being deserialized, for better error messages. `what` should be a noun in
/// singular form.
///
/// Upon success, this also returns the number of bytes read.
pub(crate) fn try_read_u16_as_usize(
slice: &[u8],
what: &'static str,
) -> Result<(usize, usize), DeserializeError> {
try_read_u16(slice, what).and_then(|(n, nr)| {
usize::try_from(n)
.map(|n| (n, nr))
.map_err(|_| DeserializeError::invalid_usize(what))
})
}
/// Try to read a u32 as a usize from the beginning of the given slice in
/// native endian format. If the slice has fewer than 4 bytes or if the
/// deserialized number cannot be represented by usize, then this returns an
/// error. The error message will include the `what` description of what is
/// being deserialized, for better error messages. `what` should be a noun in
/// singular form.
///
/// Upon success, this also returns the number of bytes read.
pub(crate) fn try_read_u32_as_usize(
slice: &[u8],
what: &'static str,
) -> Result<(usize, usize), DeserializeError> {
try_read_u32(slice, what).and_then(|(n, nr)| {
usize::try_from(n)
.map(|n| (n, nr))
.map_err(|_| DeserializeError::invalid_usize(what))
})
}
/// Try to read a u16 from the beginning of the given slice in native endian
/// format. If the slice has fewer than 2 bytes, then this returns an error.
/// The error message will include the `what` description of what is being
/// deserialized, for better error messages. `what` should be a noun in
/// singular form.
///
/// Upon success, this also returns the number of bytes read.
pub(crate) fn try_read_u16(
slice: &[u8],
what: &'static str,
) -> Result<(u16, usize), DeserializeError> {
check_slice_len(slice, size_of::<u16>(), what)?;
Ok((read_u16(slice), size_of::<u16>()))
}
/// Try to read a u32 from the beginning of the given slice in native endian
/// format. If the slice has fewer than 4 bytes, then this returns an error.
/// The error message will include the `what` description of what is being
/// deserialized, for better error messages. `what` should be a noun in
/// singular form.
///
/// Upon success, this also returns the number of bytes read.
pub(crate) fn try_read_u32(
slice: &[u8],
what: &'static str,
) -> Result<(u32, usize), DeserializeError> {
check_slice_len(slice, size_of::<u32>(), what)?;
Ok((read_u32(slice), size_of::<u32>()))
}
/// Try to read a u128 from the beginning of the given slice in native endian
/// format. If the slice has fewer than 16 bytes, then this returns an error.
/// The error message will include the `what` description of what is being
/// deserialized, for better error messages. `what` should be a noun in
/// singular form.
///
/// Upon success, this also returns the number of bytes read.
pub(crate) fn try_read_u128(
slice: &[u8],
what: &'static str,
) -> Result<(u128, usize), DeserializeError> {
check_slice_len(slice, size_of::<u128>(), what)?;
Ok((read_u128(slice), size_of::<u128>()))
}
/// Read a u16 from the beginning of the given slice in native endian format.
/// If the slice has fewer than 2 bytes, then this panics.
///
/// Marked as inline to speed up sparse searching which decodes integers from
/// its automaton at search time.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn read_u16(slice: &[u8]) -> u16 {
let bytes: [u8; 2] = slice[..size_of::<u16>()].try_into().unwrap();
u16::from_ne_bytes(bytes)
}
/// Read a u32 from the beginning of the given slice in native endian format.
/// If the slice has fewer than 4 bytes, then this panics.
///
/// Marked as inline to speed up sparse searching which decodes integers from
/// its automaton at search time.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn read_u32(slice: &[u8]) -> u32 {
let bytes: [u8; 4] = slice[..size_of::<u32>()].try_into().unwrap();
u32::from_ne_bytes(bytes)
}
/// Read a u128 from the beginning of the given slice in native endian format.
/// If the slice has fewer than 16 bytes, then this panics.
pub(crate) fn read_u128(slice: &[u8]) -> u128 {
let bytes: [u8; 16] = slice[..size_of::<u128>()].try_into().unwrap();
u128::from_ne_bytes(bytes)
}
/// Checks that the given slice has some minimal length. If it's smaller than
/// the bound given, then a "buffer too small" error is returned with `what`
/// describing what the buffer represents.
pub(crate) fn check_slice_len<T>(
slice: &[T],
at_least_len: usize,
what: &'static str,
) -> Result<(), DeserializeError> {
if slice.len() < at_least_len {
return Err(DeserializeError::buffer_too_small(what));
}
Ok(())
}
/// Multiply the given numbers, and on overflow, return an error that includes
/// 'what' in the error message.
///
/// This is useful when doing arithmetic with untrusted data.
pub(crate) fn mul(
a: usize,
b: usize,
what: &'static str,
) -> Result<usize, DeserializeError> {
match a.checked_mul(b) {
Some(c) => Ok(c),
None => Err(DeserializeError::arithmetic_overflow(what)),
}
}
/// Add the given numbers, and on overflow, return an error that includes
/// 'what' in the error message.
///
/// This is useful when doing arithmetic with untrusted data.
pub(crate) fn add(
a: usize,
b: usize,
what: &'static str,
) -> Result<usize, DeserializeError> {
match a.checked_add(b) {
Some(c) => Ok(c),
None => Err(DeserializeError::arithmetic_overflow(what)),
}
}
/// Shift `a` left by `b`, and on overflow, return an error that includes
/// 'what' in the error message.
///
/// This is useful when doing arithmetic with untrusted data.
pub(crate) fn shl(
a: usize,
b: usize,
what: &'static str,
) -> Result<usize, DeserializeError> {
let amount = u32::try_from(b)
.map_err(|_| DeserializeError::arithmetic_overflow(what))?;
match a.checked_shl(amount) {
Some(c) => Ok(c),
None => Err(DeserializeError::arithmetic_overflow(what)),
}
}
/// Returns the number of additional bytes required to add to the given length
/// in order to make the total length a multiple of 4. The return value is
/// always less than 4.
pub(crate) fn padding_len(non_padding_len: usize) -> usize {
(4 - (non_padding_len & 0b11)) & 0b11
}
/// A simple trait for writing code generic over endianness.
///
/// This is similar to what byteorder provides, but we only need a very small
/// subset.
pub(crate) trait Endian {
/// Writes a u16 to the given destination buffer in a particular
/// endianness. If the destination buffer has a length smaller than 2, then
/// this panics.
fn write_u16(n: u16, dst: &mut [u8]);
/// Writes a u32 to the given destination buffer in a particular
/// endianness. If the destination buffer has a length smaller than 4, then
/// this panics.
fn write_u32(n: u32, dst: &mut [u8]);
/// Writes a u128 to the given destination buffer in a particular
/// endianness. If the destination buffer has a length smaller than 16,
/// then this panics.
fn write_u128(n: u128, dst: &mut [u8]);
}
/// Little endian writing.
pub(crate) enum LE {}
/// Big endian writing.
pub(crate) enum BE {}
#[cfg(target_endian = "little")]
pub(crate) type NE = LE;
#[cfg(target_endian = "big")]
pub(crate) type NE = BE;
impl Endian for LE {
fn write_u16(n: u16, dst: &mut [u8]) {
dst[..2].copy_from_slice(&n.to_le_bytes());
}
fn write_u32(n: u32, dst: &mut [u8]) {
dst[..4].copy_from_slice(&n.to_le_bytes());
}
fn write_u128(n: u128, dst: &mut [u8]) {
dst[..16].copy_from_slice(&n.to_le_bytes());
}
}
impl Endian for BE {
fn write_u16(n: u16, dst: &mut [u8]) {
dst[..2].copy_from_slice(&n.to_be_bytes());
}
fn write_u32(n: u32, dst: &mut [u8]) {
dst[..4].copy_from_slice(&n.to_be_bytes());
}
fn write_u128(n: u128, dst: &mut [u8]) {
dst[..16].copy_from_slice(&n.to_be_bytes());
}
}
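// Illustrative sketch (an addition for exposition, not part of the upstream
// regex-automata source): how the `Endian` trait is used generically. `LE`
// and `BE` write the same value with different byte orders; `NE` is whichever
// matches the compilation target.
#[cfg(test)]
mod endian_sketch {
    use super::*;

    #[test]
    fn write_u32_le_vs_be() {
        let mut le = [0u8; 4];
        let mut be = [0u8; 4];
        LE::write_u32(0x11223344, &mut le);
        BE::write_u32(0x11223344, &mut be);
        assert_eq!(le, [0x44, 0x33, 0x22, 0x11]);
        assert_eq!(be, [0x11, 0x22, 0x33, 0x44]);
    }
}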
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::*;
#[test]
fn labels() {
let mut buf = [0; 1024];
let nwrite = write_label("fooba", &mut buf).unwrap();
assert_eq!(nwrite, 8);
assert_eq!(&buf[..nwrite], b"fooba\x00\x00\x00");
let nread = read_label(&buf, "fooba").unwrap();
assert_eq!(nread, 8);
}
#[test]
#[should_panic]
fn bad_label_interior_nul() {
// interior NULs are not allowed
write_label("foo\x00bar", &mut [0; 1024]).unwrap();
}
#[test]
fn bad_label_almost_too_long() {
// ok
write_label(&"z".repeat(255), &mut [0; 1024]).unwrap();
}
#[test]
#[should_panic]
fn bad_label_too_long() {
// labels longer than 255 bytes are banned
write_label(&"z".repeat(256), &mut [0; 1024]).unwrap();
}
#[test]
fn padding() {
assert_eq!(0, padding_len(8));
assert_eq!(3, padding_len(9));
assert_eq!(2, padding_len(10));
assert_eq!(1, padding_len(11));
assert_eq!(0, padding_len(12));
assert_eq!(3, padding_len(13));
assert_eq!(2, padding_len(14));
assert_eq!(1, padding_len(15));
assert_eq!(0, padding_len(16));
}
}