Vendor dependencies for 0.3.0 release

This commit is contained in:
2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

241
vendor/regex-automata/src/meta/error.rs vendored Normal file
View File

@@ -0,0 +1,241 @@
use regex_syntax::{ast, hir};
use crate::{nfa, util::search::MatchError, PatternID};
/// An error that occurs when construction of a `Regex` fails.
///
/// A build error is generally a result of one of two possible failure
/// modes. First is a parse or syntax error in the concrete syntax of a
/// pattern. Second is that the construction of the underlying regex matcher
/// fails, usually because it gets too big with respect to limits like
/// [`Config::nfa_size_limit`](crate::meta::Config::nfa_size_limit).
///
/// This error provides very little introspection capabilities. You can:
///
/// * Ask for the [`PatternID`] of the pattern that caused an error, if one
/// is available. This is available for things like syntax errors, but not for
/// cases where build limits are exceeded.
/// * Ask for the underlying syntax error, but only if the error is a syntax
/// error.
/// * Ask for the configured size limit, but only if the error came from
/// building the NFA and that error reports one.
/// * Ask for a human readable message corresponding to the underlying error.
/// * The `BuildError::source` method (from the `std::error::Error`
/// trait implementation) may be used to query for an underlying error if one
/// exists. There are no API guarantees about which error is returned.
///
/// When the `std` feature is enabled, this implements `std::error::Error`.
#[derive(Clone, Debug)]
pub struct BuildError {
// The concrete failure. This field is private, so it is only observable
// through the accessor methods and trait impls on `BuildError`.
kind: BuildErrorKind,
}
/// The internal classification of a [`BuildError`].
#[derive(Clone, Debug)]
enum BuildErrorKind {
/// A syntax error, created from either an AST error or an HIR error.
/// `pid` identifies the offending pattern and `err` is the underlying
/// `regex_syntax` error.
Syntax { pid: PatternID, err: regex_syntax::Error },
/// An error reported while building the Thompson NFA.
NFA(nfa::thompson::BuildError),
}
impl BuildError {
/// If it is known which pattern ID caused this build error to occur, then
/// this method returns it.
///
/// Some errors are not associated with a particular pattern. However, any
/// errors that occur as part of parsing a pattern are guaranteed to be
/// associated with a pattern ID.
///
/// # Example
///
/// ```
/// use regex_automata::{meta::Regex, PatternID};
///
/// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err();
/// assert_eq!(Some(PatternID::must(2)), err.pattern());
/// ```
pub fn pattern(&self) -> Option<PatternID> {
match self.kind {
BuildErrorKind::Syntax { pid, .. } => Some(pid),
_ => None,
}
}
/// If this error occurred because the regex exceeded the configured size
/// limit before being built, then this returns the configured size limit.
///
/// The limit returned is what was configured, and corresponds to the
/// maximum amount of heap usage in bytes.
pub fn size_limit(&self) -> Option<usize> {
match self.kind {
BuildErrorKind::NFA(ref err) => err.size_limit(),
_ => None,
}
}
/// If this error corresponds to a syntax error, then a reference to it is
/// returned by this method.
pub fn syntax_error(&self) -> Option<&regex_syntax::Error> {
match self.kind {
BuildErrorKind::Syntax { ref err, .. } => Some(err),
_ => None,
}
}
pub(crate) fn ast(pid: PatternID, err: ast::Error) -> BuildError {
let err = regex_syntax::Error::from(err);
BuildError { kind: BuildErrorKind::Syntax { pid, err } }
}
pub(crate) fn hir(pid: PatternID, err: hir::Error) -> BuildError {
let err = regex_syntax::Error::from(err);
BuildError { kind: BuildErrorKind::Syntax { pid, err } }
}
pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError {
BuildError { kind: BuildErrorKind::NFA(err) }
}
}
#[cfg(feature = "std")]
impl std::error::Error for BuildError {
    /// Exposes the wrapped syntax or NFA error as the source of this error.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        let cause: &(dyn std::error::Error + 'static) = match &self.kind {
            BuildErrorKind::NFA(nfa_err) => nfa_err,
            BuildErrorKind::Syntax { err, .. } => err,
        };
        Some(cause)
    }
}
impl core::fmt::Display for BuildError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self.kind {
BuildErrorKind::Syntax { pid, .. } => {
write!(f, "error parsing pattern {}", pid.as_usize())
}
BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
}
}
}
/// An error that occurs when a search should be retried.
///
/// This retry error distinguishes between two different failure modes.
///
/// The first is one where potential quadratic behavior has been detected.
/// In this case, whatever optimization that led to this behavior should be
/// stopped, and the next best strategy should be used.
///
/// The second indicates that the underlying regex engine has failed for some
/// reason. This usually occurs because either a lazy DFA's cache has become
/// ineffective or because a non-ASCII byte has been seen *and* a Unicode word
/// boundary was used in one of the patterns. In this failure case, a different
/// regex engine that won't fail in these ways (PikeVM, backtracker or the
/// one-pass DFA) should be used.
///
/// This is an internal error only and should never bleed into the public
/// API.
#[derive(Debug)]
pub(crate) enum RetryError {
/// Potential quadratic behavior was detected; the optimization in use
/// should be abandoned in favor of the next best strategy.
Quadratic(RetryQuadraticError),
/// The underlying regex engine failed; a different regex engine should be
/// used instead.
Fail(RetryFailError),
}
// Only available with the `std` feature. The impl body is empty, so the
// trait's default method implementations are used.
#[cfg(feature = "std")]
impl std::error::Error for RetryError {}
impl core::fmt::Display for RetryError {
    /// Delegates formatting to whichever error this variant wraps.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            RetryError::Fail(inner) => inner.fmt(f),
            RetryError::Quadratic(inner) => inner.fmt(f),
        }
    }
}
impl From<MatchError> for RetryError {
    /// Converts a match error into the "fail" flavor of a retry error, going
    /// through `RetryFailError`'s own conversion from `MatchError`.
    fn from(merr: MatchError) -> RetryError {
        let failure = RetryFailError::from(merr);
        RetryError::Fail(failure)
    }
}
/// An error that occurs when potential quadratic behavior has been detected
/// when applying either the "reverse suffix" or "reverse inner" optimizations.
///
/// When this error occurs, callers should abandon the "reverse" optimization
/// and use a normal forward search.
///
/// The single `()` field is private, so code outside of this module creates
/// values via `RetryQuadraticError::new`.
#[derive(Debug)]
pub(crate) struct RetryQuadraticError(());
impl RetryQuadraticError {
pub(crate) fn new() -> RetryQuadraticError {
RetryQuadraticError(())
}
}
// Only available with the `std` feature. The impl body is empty, so the
// trait's default method implementations are used.
#[cfg(feature = "std")]
impl std::error::Error for RetryQuadraticError {}
impl core::fmt::Display for RetryQuadraticError {
    /// Writes a fixed message; this error has no fields to report.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.write_str("regex engine gave up to avoid quadratic behavior")
    }
}
impl From<RetryQuadraticError> for RetryError {
fn from(err: RetryQuadraticError) -> RetryError {
RetryError::Quadratic(err)
}
}
/// An error that occurs when a regex engine "gives up" for some reason before
/// finishing a search. Usually this occurs because of heuristic Unicode word
/// boundary support or because of ineffective cache usage in the lazy DFA.
///
/// When this error occurs, callers should retry the regex search with a
/// different regex engine.
///
/// Note that this has convenient `From` impls that will automatically
/// convert a `MatchError` into this error. This works because the meta
/// regex engine internals guarantee that errors like `HaystackTooLong` and
/// `UnsupportedAnchored` will never occur. The only errors left are `Quit` and
/// `GaveUp`, which both correspond to this "failure" error.
#[derive(Debug)]
pub(crate) struct RetryFailError {
// The offset given to `from_offset`. When converting from a `MatchError`,
// this is the offset carried by its `Quit` or `GaveUp` kind.
offset: usize,
}
impl RetryFailError {
pub(crate) fn from_offset(offset: usize) -> RetryFailError {
RetryFailError { offset }
}
}
// Only available with the `std` feature. The impl body is empty, so the
// trait's default method implementations are used.
#[cfg(feature = "std")]
impl std::error::Error for RetryFailError {}
impl core::fmt::Display for RetryFailError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(f, "regex engine failed at offset {:?}", self.offset)
}
}
impl From<RetryFailError> for RetryError {
fn from(err: RetryFailError) -> RetryError {
RetryError::Fail(err)
}
}
impl From<MatchError> for RetryFailError {
    /// Converts a `Quit` or `GaveUp` match error into a failure error that
    /// records the same offset.
    ///
    /// # Panics
    ///
    /// Panics if the match error is `HaystackTooLong` or
    /// `UnsupportedAnchored`.
    fn from(merr: MatchError) -> RetryFailError {
        use crate::util::search::MatchErrorKind::*;

        match merr.kind() {
            Quit { offset, .. } | GaveUp { offset } => {
                RetryFailError::from_offset(*offset)
            }
            // The meta engine rules these kinds out, either by construction
            // or through higher level control flow. For instance, the
            // wrapper around the backtracker never hands out a backtracker
            // engine when the haystack would be too long for it.
            HaystackTooLong { .. } | UnsupportedAnchored { .. } => {
                unreachable!("found impossible error in meta engine: {merr}")
            }
        }
    }
}

View File

@@ -0,0 +1,251 @@
/*!
This module defines two bespoke reverse DFA searching routines. (One for the
lazy DFA and one for the fully compiled DFA.) These routines differ from the
usual ones by permitting the caller to specify a minimum starting position.
That is, the search will begin at `input.end()` and will usually stop at
`input.start()`, unless `min_start > input.start()`, in which case, the search
will stop at `min_start`.
In other words, this lets you say, "no, the search must not extend past this
point, even if it's within the bounds of the given `Input`." And if the search
*does* want to go past that point, it stops and returns a "may be quadratic"
error, which indicates that the caller should retry using some other technique.
These routines specifically exist to protect against quadratic behavior when
employing the "reverse suffix" and "reverse inner" optimizations. Without the
backstop these routines provide, it is possible for parts of the haystack to
get re-scanned over and over again. The backstop not only prevents this, but
*tells you when it is happening* so that you can change the strategy.
Why can't we just use the normal search routines? We could use the normal
search routines and just set the start bound on the provided `Input` to our
`min_start` position. The problem here is that it's impossible to distinguish
between "no match because we reached the end of input" and "determined there
was no match well before the end of input." The former case is what we care
about with respect to quadratic behavior. The latter case is totally fine.
Why don't we modify the normal search routines to report the position at which
the search stops? I considered this, and I still wonder if it is indeed the
right thing to do. However, I think the straight-forward thing to do there
would be to complicate the return type signature of almost every search routine
in this crate, which I really do not want to do. It therefore might make more
sense to provide a richer way for search routines to report meta data, but that
was beyond my bandwidth to work on at the time of writing.
See the 'opt/reverse-inner' and 'opt/reverse-suffix' benchmarks in rebar for a
real demonstration of how quadratic behavior is mitigated.
*/
use crate::{
meta::error::{RetryError, RetryQuadraticError},
HalfMatch, Input, MatchError,
};
/// Runs a reverse half-match search using a fully compiled dense DFA,
/// scanning from `input.end()` toward `input.start()`, but refusing to step
/// to a position before `min_start`.
///
/// Returns `Ok(mat)` with the last match recorded (if any) when the search
/// hits a dead state, when the span is empty, or when the start of the span
/// is reached and the false-positive check at the bottom does not trigger.
///
/// # Errors
///
/// * `RetryError::Quadratic` if the scan would move to a position before
/// `min_start`, or if the start of the span was reached with a recorded
/// match whose offset is greater than `input.start()` while the DFA was not
/// in a dead state.
/// * A quit state is reported via `MatchError::quit` converted into a
/// `RetryError`.
/// * Errors from computing the start state and from the end-of-input
/// transition are propagated with `?`.
#[cfg(feature = "dfa-build")]
pub(crate) fn dfa_try_search_half_rev(
dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
input: &Input<'_>,
min_start: usize,
) -> Result<Option<HalfMatch>, RetryError> {
use crate::dfa::Automaton;
let mut mat = None;
let mut sid = dfa.start_state_reverse(input)?;
// An empty span has no bytes to scan, so only the final transition is
// applied.
if input.start() == input.end() {
dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?;
return Ok(mat);
}
// The span is non-empty here, so `input.end() - 1` is the last byte in it.
let mut at = input.end() - 1;
loop {
sid = dfa.next_state(sid, input.haystack()[at]);
if dfa.is_special_state(sid) {
if dfa.is_match_state(sid) {
let pattern = dfa.match_pattern(sid, 0);
// Since reverse searches report the beginning of a
// match and the beginning is inclusive (not exclusive
// like the end of a match), we add 1 to make it
// inclusive.
mat = Some(HalfMatch::new(pattern, at + 1));
} else if dfa.is_dead_state(sid) {
return Ok(mat);
} else if dfa.is_quit_state(sid) {
return Err(MatchError::quit(input.haystack()[at], at).into());
}
}
// Check for the start of the span *before* decrementing so that `at`
// never goes below `input.start()`.
if at == input.start() {
break;
}
at -= 1;
if at < min_start {
trace!(
"reached position {at} which is before the previous literal \
match, quitting to avoid quadratic behavior",
);
return Err(RetryError::Quadratic(RetryQuadraticError::new()));
}
}
// Record whether the DFA was dead *before* the final transition below,
// since that transition overwrites `sid`.
let was_dead = dfa.is_dead_state(sid);
dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?;
// If we reach the beginning of the search and we could otherwise still
// potentially keep matching if there was more to match, then we actually
// return an error to indicate giving up on this optimization. Why? Because
// we can't prove that the real match begins at where we would report it.
//
// This only happens when all of the following are true:
//
// 1) We reach the starting point of our search span.
// 2) The match we found has an offset strictly greater than the starting
// point of our search span.
// 3) The FSM reports we could possibly find a longer match.
//
// We need (1) because otherwise the search stopped before the starting
// point and there is no possible way to find a more leftmost position.
//
// We need (2) because if the match found has an offset equal to the minimum
// possible offset, then there is no possible more leftmost match.
//
// We need (3) because if the FSM couldn't continue anyway (i.e., it's in
// a dead state), then we know we couldn't find anything more leftmost
// than what we have. (We have to check the state we were in prior to the
// EOI transition since the EOI transition will usually bring us to a dead
// state by virtue of it representing the end-of-input.)
if at == input.start()
&& mat.map_or(false, |m| m.offset() > input.start())
&& !was_dead
{
trace!(
"reached beginning of search at offset {at} without hitting \
a dead state, quitting to avoid potential false positive match",
);
return Err(RetryError::Quadratic(RetryQuadraticError::new()));
}
Ok(mat)
}
/// Runs a reverse half-match search using a lazy (hybrid) DFA and its
/// `cache`, scanning from `input.end()` toward `input.start()`, but refusing
/// to step to a position before `min_start`.
///
/// Returns `Ok(mat)` with the last match recorded (if any) when the search
/// hits a dead state, when the span is empty, or when the start of the span
/// is reached and the false-positive check at the bottom does not trigger.
///
/// # Errors
///
/// * `RetryError::Quadratic` if the scan would move to a position before
/// `min_start`, or if the start of the span was reached with a recorded
/// match whose offset is greater than `input.start()` while the DFA was not
/// in a dead state.
/// * A failed state transition is reported as `MatchError::gave_up` at the
/// current position, and a quit state via `MatchError::quit`; both are
/// converted into a `RetryError`.
/// * Errors from computing the start state and from the end-of-input
/// transition are propagated with `?`.
#[cfg(feature = "hybrid")]
pub(crate) fn hybrid_try_search_half_rev(
dfa: &crate::hybrid::dfa::DFA,
cache: &mut crate::hybrid::dfa::Cache,
input: &Input<'_>,
min_start: usize,
) -> Result<Option<HalfMatch>, RetryError> {
let mut mat = None;
let mut sid = dfa.start_state_reverse(cache, input)?;
// An empty span has no bytes to scan, so only the final transition is
// applied.
if input.start() == input.end() {
hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
return Ok(mat);
}
// The span is non-empty here, so `input.end() - 1` is the last byte in it.
let mut at = input.end() - 1;
loop {
sid = dfa
.next_state(cache, sid, input.haystack()[at])
.map_err(|_| MatchError::gave_up(at))?;
if sid.is_tagged() {
if sid.is_match() {
let pattern = dfa.match_pattern(cache, sid, 0);
// Since reverse searches report the beginning of a
// match and the beginning is inclusive (not exclusive
// like the end of a match), we add 1 to make it
// inclusive.
mat = Some(HalfMatch::new(pattern, at + 1));
} else if sid.is_dead() {
return Ok(mat);
} else if sid.is_quit() {
return Err(MatchError::quit(input.haystack()[at], at).into());
}
}
// Check for the start of the span *before* decrementing so that `at`
// never goes below `input.start()`.
if at == input.start() {
break;
}
at -= 1;
if at < min_start {
trace!(
"reached position {at} which is before the previous literal \
match, quitting to avoid quadratic behavior",
);
return Err(RetryError::Quadratic(RetryQuadraticError::new()));
}
}
// Record whether the DFA was dead *before* the final transition below,
// since that transition overwrites `sid`.
let was_dead = sid.is_dead();
hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
// See the comments in the full DFA routine above for why we need this.
if at == input.start()
&& mat.map_or(false, |m| m.offset() > input.start())
&& !was_dead
{
trace!(
"reached beginning of search at offset {at} without hitting \
a dead state, quitting to avoid potential false positive match",
);
return Err(RetryError::Quadratic(RetryQuadraticError::new()));
}
Ok(mat)
}
/// Applies the final transition of a reverse search for a dense DFA.
///
/// When the span starts after the beginning of the haystack, the byte
/// immediately preceding the span is fed to the DFA. Otherwise, the special
/// end-of-input transition is taken. In both cases `sid` is updated in place
/// and `mat` is overwritten if the resulting state is a match state.
///
/// # Errors
///
/// Returns a quit error if feeding the preceding byte leads to a quit state.
#[cfg(feature = "dfa-build")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn dfa_eoi_rev(
    dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
    input: &Input<'_>,
    sid: &mut crate::util::primitives::StateID,
    mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
    use crate::dfa::Automaton;

    let start = input.get_span().start;
    if start == 0 {
        // Nothing precedes the span, so take the end-of-input transition.
        *sid = dfa.next_eoi_state(*sid);
        if dfa.is_match_state(*sid) {
            let pattern = dfa.match_pattern(*sid, 0);
            *mat = Some(HalfMatch::new(pattern, 0));
        }
        // N.B. There is no 'is_quit' check here because the EOI transition
        // can never lead to a quit state.
        debug_assert!(!dfa.is_quit_state(*sid));
        return Ok(());
    }

    // Otherwise, look at the byte just before the span.
    let before = start - 1;
    let byte = input.haystack()[before];
    *sid = dfa.next_state(*sid, byte);
    if dfa.is_match_state(*sid) {
        let pattern = dfa.match_pattern(*sid, 0);
        *mat = Some(HalfMatch::new(pattern, start));
    } else if dfa.is_quit_state(*sid) {
        return Err(MatchError::quit(byte, before));
    }
    Ok(())
}
/// Applies the final transition of a reverse search for a lazy DFA.
///
/// When the span starts after the beginning of the haystack, the byte
/// immediately preceding the span is fed to the DFA. Otherwise, the special
/// end-of-input transition is taken. In both cases `sid` is updated in place
/// and `mat` is overwritten if the resulting state is a match state.
///
/// # Errors
///
/// Returns a "gave up" error (at the span's start offset) if computing the
/// transition fails, and a quit error if feeding the preceding byte leads to
/// a quit state.
#[cfg(feature = "hybrid")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn hybrid_eoi_rev(
    dfa: &crate::hybrid::dfa::DFA,
    cache: &mut crate::hybrid::dfa::Cache,
    input: &Input<'_>,
    sid: &mut crate::hybrid::LazyStateID,
    mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
    let start = input.get_span().start;
    if start == 0 {
        // Nothing precedes the span, so take the end-of-input transition.
        *sid = dfa
            .next_eoi_state(cache, *sid)
            .map_err(|_| MatchError::gave_up(start))?;
        if sid.is_match() {
            let pattern = dfa.match_pattern(cache, *sid, 0);
            *mat = Some(HalfMatch::new(pattern, 0));
        }
        // N.B. There is no 'is_quit' check here because the EOI transition
        // can never lead to a quit state.
        debug_assert!(!sid.is_quit());
        return Ok(());
    }

    // Otherwise, look at the byte just before the span.
    let before = start - 1;
    let byte = input.haystack()[before];
    *sid = dfa
        .next_state(cache, *sid, byte)
        .map_err(|_| MatchError::gave_up(start))?;
    if sid.is_match() {
        let pattern = dfa.match_pattern(cache, *sid, 0);
        *mat = Some(HalfMatch::new(pattern, start));
    } else if sid.is_quit() {
        return Err(MatchError::quit(byte, before));
    }
    Ok(())
}

View File

@@ -0,0 +1,81 @@
use alloc::{vec, vec::Vec};
use regex_syntax::hir::Hir;
use crate::{meta::regex::RegexInfo, util::search::MatchKind};
/// Extracts the literals making up a pure alternation of literals, e.g.,
/// a regex of the form 'foo|bar|baz|...|quux'.
///
/// `None` is returned in many cases: when the
/// 'perf-literal-multisubstring' feature is disabled, when more than one
/// pattern is given (this currently requires 'hirs.len() == 1'), when the
/// pattern has look-around assertions or explicit capture groups, when it
/// isn't an alternation of literals, when the match kind isn't leftmost-first,
/// or when there are "too few" alternates, in which case the regex engine is
/// likely faster.
pub(crate) fn alternation_literals(
    info: &RegexInfo,
    hirs: &[&Hir],
) -> Option<Vec<Vec<u8>>> {
    use regex_syntax::hir::{HirKind, Literal};

    // There's no point in doing any of the work below if an Aho-Corasick
    // searcher can't be built in the first place.
    if !cfg!(feature = "perf-literal-multisubstring") {
        return None;
    }
    if hirs.len() != 1 {
        return None;
    }
    // This is a bit of a hack: once `is_alternation_literal` is known to be
    // true, several assumptions about the shape of the HIR hold. Those
    // assumptions are what justify the `unreachable!` calls below.
    let props = &info.props()[0];
    if !props.look_set().is_empty()
        || props.explicit_captures_len() > 0
        || !props.is_alternation_literal()
        || info.config().get_match_kind() != MatchKind::LeftmostFirst
    {
        return None;
    }
    let alts = match hirs[0].kind() {
        HirKind::Alternation(alts) => alts,
        // A single literal isn't worth the trouble.
        _ => return None,
    };

    // Each alternate is either a single literal or a concatenation of
    // literals. Either way, it is collapsed into one byte string.
    let lits: Vec<Vec<u8>> = alts
        .iter()
        .map(|alt| match alt.kind() {
            HirKind::Literal(Literal(bytes)) => bytes.to_vec(),
            HirKind::Concat(exprs) => {
                let mut lit = vec![];
                for e in exprs {
                    match e.kind() {
                        HirKind::Literal(Literal(bytes)) => {
                            lit.extend_from_slice(bytes)
                        }
                        _ => unreachable!("expected literal, got {e:?}"),
                    }
                }
                lit
            }
            _ => unreachable!("expected literal or concat, got {alt:?}"),
        })
        .collect();

    // With only a small number of literals, the lazy DFA will probably be
    // used, and it is probably faster than Aho-Corasick in that case. That's
    // mostly because Aho-Corasick has no "lazy DFA" of its own, only a
    // contiguous NFA or a full DFA. The full DFA is rarely used since it is
    // so expensive (in time and space), while the contiguous NFA is decently
    // fast but not as fast as a well oiled lazy DFA.
    //
    // As the number of literals grows large, though, the lazy DFA is likely
    // to start thrashing due to its modest default cache size. Exactly where
    // that happens is unknown. The threshold below is a guess based on ad hoc
    // benchmarking. Past it, Aho-Corasick is preferred, where even the
    // contiguous NFA is likely to do much better.
    if lits.len() < 3000 {
        debug!("skipping Aho-Corasick because there are too few literals");
        return None;
    }
    Some(lits)
}

62
vendor/regex-automata/src/meta/mod.rs vendored Normal file
View File

@@ -0,0 +1,62 @@
/*!
Provides a regex matcher that composes several other regex matchers
automatically.
This module is home to a meta [`Regex`], which provides a convenient high
level API for executing regular expressions in linear time.
# Comparison with the `regex` crate
A meta `Regex` is the implementation used directly by the `regex` crate.
Indeed, the `regex` crate API is essentially just a light wrapper over a meta
`Regex`. This means that if you need the full flexibility offered by this
API, then you should be able to switch to using this API directly without
any changes in match semantics or syntax. However, there are some API level
differences:
* The `regex` crate API returns match objects that include references to the
haystack itself, which in turn makes it easy to access the matching strings
without having to slice the haystack yourself. In contrast, a meta `Regex`
returns match objects that only have offsets in them.
* At time of writing, a meta `Regex` doesn't have some of the convenience
routines that the `regex` crate has, such as replacements. Note though that
[`Captures::interpolate_string`](crate::util::captures::Captures::interpolate_string)
will handle the replacement string interpolation for you.
* A meta `Regex` supports the [`Input`](crate::Input) abstraction, which
provides a way to configure a search in more ways than is supported by the
`regex` crate. For example, [`Input::anchored`](crate::Input::anchored) can
be used to run an anchored search, regardless of whether the pattern is itself
anchored with a `^`.
* A meta `Regex` supports multi-pattern searching everywhere.
Indeed, every [`Match`](crate::Match) returned by the search APIs
includes a [`PatternID`](crate::PatternID) indicating which pattern
matched. In the single pattern case, all matches correspond to
[`PatternID::ZERO`](crate::PatternID::ZERO). In contrast, the `regex` crate
has distinct `Regex` and `RegexSet` APIs. The former only supports a single
pattern, while the latter supports multiple patterns but cannot report the
offsets of a match.
* A meta `Regex` provides the explicit capability of bypassing its internal
memory pool for automatically acquiring mutable scratch space required by its
internal regex engines. Namely, a [`Cache`] can be explicitly provided to lower
level routines such as [`Regex::search_with`].
*/
pub use self::{
error::BuildError,
regex::{
Builder, Cache, CapturesMatches, Config, FindMatches, Regex, Split,
SplitN,
},
};
mod error;
#[cfg(any(feature = "dfa-build", feature = "hybrid"))]
mod limited;
mod literal;
mod regex;
mod reverse_inner;
#[cfg(any(feature = "dfa-build", feature = "hybrid"))]
mod stopat;
mod strategy;
mod wrappers;

3649
vendor/regex-automata/src/meta/regex.rs vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,220 @@
/*!
A module dedicated to plucking inner literals out of a regex pattern, and
then constructing a prefilter for them. We also include a regex pattern
"prefix" that corresponds to the bits of the regex that need to match before
the literals do. The reverse inner optimization then proceeds by looking for
matches of the inner literal(s), and then doing a reverse search of the prefix
from the start of the literal match to find the overall start position of the
match.
The essential invariant we want to uphold here is that the literals we return
reflect a set where *at least* one of them must match in order for the overall
regex to match. We also need to maintain the invariant that the regex prefix
returned corresponds to the entirety of the regex up until the literals we
return.
This somewhat limits what we can do. That is, if we have a regex like
`\w+(@!|%%)\w+`, then we can pluck the `{@!, %%}` out and build a prefilter
from it. Then we just need to compile `\w+` in reverse. No fuss no muss. But if
we have a regex like `\d+@!|\w+%%`, then we get kind of stymied. Technically,
we could still extract `{@!, %%}`, and it is true that at least one of them must
match. But then, what is our regex prefix? Again, in theory, that could be
`\d+|\w+`, but that's not quite right, because the `\d+` only matches when `@!`
matches, and `\w+` only matches when `%%` matches.
All of that is technically possible to do, but it seemingly requires a lot of
sophistication and machinery. Probably the way to tackle that is with some kind
of formalism and approach this problem more generally.
For now, the code below basically just looks for a top-level concatenation.
And if it can find one, it looks for literals in each of the direct child
sub-expressions of that concatenation. If some good ones are found, we return
those and a concatenation of the Hir expressions seen up to that point.
*/
use alloc::vec::Vec;
use regex_syntax::hir::{self, literal, Hir, HirKind};
use crate::{util::prefilter::Prefilter, MatchKind};
/// Tries to pull an "inner" prefilter out of the given HIR expressions. On
/// success, the prefilter is returned along with a concatenation of the HIR
/// expressions that come before it.
///
/// The returned prefilter is meant for finding candidate matches. The
/// returned HIR is meant for building a reverse regex matcher that locates
/// the start of a candidate match. After that, a normal anchored forward scan
/// is still required to confirm the match and find where it ends.
///
/// Note that this assumes leftmost-first match semantics, so callers must
/// not call this otherwise.
pub(crate) fn extract(hirs: &[&Hir]) -> Option<(Hir, Prefilter)> {
    if hirs.len() != 1 {
        debug!(
            "skipping reverse inner optimization since it only \
		 	 supports 1 pattern, {} were given",
            hirs.len(),
        );
        return None;
    }
    let mut subs = if let Some(subs) = top_concat(hirs[0]) {
        subs
    } else {
        debug!(
            "skipping reverse inner optimization because a top-level \
		 	 concatenation could not found",
        );
        return None;
    };

    // Find the first sub-expression, other than the very first one, that
    // yields a fast prefilter. The first sub-expression is skipped since a
    // prefix prefilter in it would probably have kept us from looking for an
    // inner prefilter at all.
    let mut found = None;
    for (i, hir) in subs.iter().enumerate().skip(1) {
        match prefilter(hir) {
            None => {}
            Some(pre) if pre.is_fast() => {
                found = Some((i, pre));
                break;
            }
            // A prefilter that isn't considered "fast" is probably not
            // worth it. The reverse inner optimization has some overhead, so
            // it likely only pays off when the prefilter scan is (believed
            // to be) much faster than the regex engine.
            Some(_) => {
                debug!(
                    "skipping extracted inner prefilter because \
				 it probably isn't fast"
                );
            }
        }
    }
    let (i, pre) = match found {
        Some(hit) => hit,
        None => {
            debug!(
                "skipping reverse inner optimization because a top-level \
		 sub-expression with a fast prefilter could not be found"
            );
            return None;
        }
    };

    let concat_suffix = Hir::concat(subs.split_off(i));
    let concat_prefix = Hir::concat(subs);
    // Try once more, this time on the whole suffix. The search above only
    // considered individual sub-expressions, but the entire suffix might
    // give something better and more discriminatory. This isn't done inside
    // the loop above because that would make it worst case quadratic in the
    // number of sub-expressions.
    let chosen = match prefilter(&concat_suffix) {
        Some(better) if better.is_fast() => better,
        _ => pre,
    };
    Some((concat_prefix, chosen))
}
/// Tries to build a prefilter from the prefix literals of an HIR expression.
///
/// Some massaging is done so that the resulting prefilter is *probably*
/// fast. The false positive rate matters much more for things like the
/// reverse inner optimization, since each candidate match potentially
/// requires more work.
///
/// Note that this assumes leftmost-first match semantics, so callers must
/// not call this otherwise.
fn prefilter(hir: &Hir) -> Option<Prefilter> {
    let mut prefixes = literal::Extractor::new()
        .kind(literal::ExtractKind::Prefix)
        .extract(hir);
    debug!(
        "inner prefixes (len={:?}) extracted before optimization: {:?}",
        prefixes.len(),
        prefixes
    );
    // These are inner literals, so they can never be exact, but the
    // extractor has no way of knowing that. They are marked inexact here
    // because exactness can affect literal optimization: it weights "all
    // literals are exact" very highly, on the presumption that any literal
    // match is an overall match. That isn't true for inner literals.
    //
    // In practice, this avoids plucking out an ASCII-only \s as an
    // alternation of single-byte whitespace characters.
    prefixes.make_inexact();
    prefixes.optimize_for_prefix_by_preference();
    debug!(
        "inner prefixes (len={:?}) extracted after optimization: {:?}",
        prefixes.len(),
        prefixes
    );
    let lits = prefixes.literals()?;
    Prefilter::new(MatchKind::LeftmostFirst, lits)
}
/// Looks for a "top level" HirKind::Concat item in the given HIR. This will
/// try to return one even if it's embedded in a capturing group, but is
/// otherwise pretty conservative in what is returned.
///
/// The HIR returned is a complete copy of the concat with all capturing
/// groups removed. In effect, the concat returned is "flattened" with respect
/// to capturing groups. This makes the detection logic above for prefixes
/// a bit simpler, and it works because 1) capturing groups never influence
/// whether a match occurs or not and 2) capturing groups are not used when
/// doing the reverse inner search to find the start of the match.
fn top_concat(mut hir: &Hir) -> Option<Vec<Hir>> {
loop {
hir = match hir.kind() {
HirKind::Empty
| HirKind::Literal(_)
| HirKind::Class(_)
| HirKind::Look(_)
| HirKind::Repetition(_)
| HirKind::Alternation(_) => return None,
HirKind::Capture(hir::Capture { ref sub, .. }) => sub,
HirKind::Concat(ref subs) => {
// We are careful to only do the flattening/copy when we know
// we have a "top level" concat we can inspect. This avoids
// doing extra work in cases where we definitely won't use it.
// (This might still be wasted work if we can't go on to find
// some literals to extract.)
let concat =
Hir::concat(subs.iter().map(|h| flatten(h)).collect());
return match concat.into_kind() {
HirKind::Concat(xs) => Some(xs),
// It is actually possible for this case to occur, because
// 'Hir::concat' might simplify the expression to the point
// that concatenations are actually removed. One wonders
// whether this leads to other cases where we should be
// extracting literals, but in theory, I believe if we do
// get here, then it means that a "real" prefilter failed
// to be extracted and we should probably leave well enough
// alone. (A "real" prefilter is unbothered by "top-level
// concats" and "capturing groups.")
_ => return None,
};
}
};
}
}
/// Recursively copies the given HIR, dropping every capturing group along
/// the way and keeping only what each group wraps.
fn flatten(hir: &Hir) -> Hir {
    match hir.kind() {
        // The one interesting case: the group itself is discarded and the
        // (flattened) child expression takes its place.
        HirKind::Capture(hir::Capture { sub, .. }) => flatten(sub),
        HirKind::Concat(subs) => {
            Hir::concat(subs.iter().map(|sub| flatten(sub)).collect())
        }
        HirKind::Alternation(alts) => {
            Hir::alternation(alts.iter().map(|alt| flatten(alt)).collect())
        }
        HirKind::Repetition(rep) => {
            Hir::repetition(rep.with(flatten(&rep.sub)))
        }
        HirKind::Look(look) => Hir::look(look.clone()),
        HirKind::Class(class) => Hir::class(class.clone()),
        HirKind::Literal(hir::Literal(bytes)) => Hir::literal(bytes.clone()),
        HirKind::Empty => Hir::empty(),
    }
}

212
vendor/regex-automata/src/meta/stopat.rs vendored Normal file
View File

@@ -0,0 +1,212 @@
/*!
This module defines two bespoke forward DFA search routines. One for the lazy
DFA and one for the fully compiled DFA. These routines differ from the normal
ones by reporting the position at which the search terminates when a match
*isn't* found.
This position at which a search terminates is useful in contexts where the meta
regex engine runs optimizations that could go quadratic if we aren't careful.
Namely, a regex search *could* scan to the end of the haystack only to report a
non-match. If the caller doesn't know that the search scanned to the end of the
haystack, it might restart the search at the next literal candidate it finds
and repeat the process.
Providing the caller with the position at which the search stopped provides a
way for the caller to determine the point at which subsequent scans should not
pass. This is principally used in the "reverse inner" optimization, which works
like this:
1. Look for a match of an inner literal. Say, 'Z' in '\w+Z\d+'.
2. At the spot where 'Z' matches, do a reverse anchored search from there for
'\w+'.
3. If the reverse search matches, it corresponds to the start position of a
(possible) match. At this point, do a forward anchored search to find the end
position. If an end position is found, then we have a match and we know its
bounds.
If the forward anchored search in (3) searches the entire rest of the haystack
but reports a non-match, then a naive implementation of the above will continue
back at step 1 looking for more candidates. There might still be a match to be
found! It's possible. But we already scanned the whole haystack. So if we keep
repeating the process, then we might wind up taking quadratic time in the size
of the haystack, which is not great.
So if the forward anchored search in (3) reports the position at which it
stops, then we can detect whether quadratic behavior might be occurring in
steps (1) and (2). For (1), it occurs if the literal candidate found occurs
*before* the end of the previous search in (3), since that means we're now
going to look for another match in a place where the forward search has already
scanned. It is *correct* to do so, but our technique has become inefficient.
For (2), quadratic behavior occurs similarly when its reverse search extends
past the point where the previous forward search in (3) terminated. Indeed, to
implement (2), we use the sibling 'limited' module for ensuring our reverse
scan doesn't go further than we want.
See the 'opt/reverse-inner' benchmarks in rebar for a real demonstration of
how quadratic behavior is mitigated.
*/
use crate::{meta::error::RetryFailError, HalfMatch, Input, MatchError};
/// Runs a forward search with a fully compiled dense DFA over the span
/// `input.start()..input.end()`, reporting where the search stopped when no
/// match is found.
///
/// The return value has three shapes:
///
/// * `Ok(Ok(m))` when a match state was seen; `m` is the most recently
/// recorded `HalfMatch` (or the first one when `input.get_earliest()` is
/// true).
/// * `Ok(Err(at))` when no match was recorded; `at` is the haystack position
/// at which the search stopped.
/// * `Err(_)` when computing the start state fails, when a quit state is
/// entered, or when the end-of-input step in `dfa_eoi_fwd` fails.
#[cfg(feature = "dfa-build")]
pub(crate) fn dfa_try_search_half_fwd(
dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
input: &Input<'_>,
) -> Result<Result<HalfMatch, usize>, RetryFailError> {
use crate::dfa::{accel, Automaton};
// 'mat' holds the most recent match seen so far, if any.
let mut mat = None;
let mut sid = dfa.start_state_forward(input)?;
let mut at = input.start();
while at < input.end() {
sid = dfa.next_state(sid, input.haystack()[at]);
// Only "special" states need any inspection; for every other state we
// just advance to the next byte.
if dfa.is_special_state(sid) {
if dfa.is_match_state(sid) {
let pattern = dfa.match_pattern(sid, 0);
mat = Some(HalfMatch::new(pattern, at));
// The caller asked for the earliest match, so stop at the first
// match state instead of continuing the scan.
if input.get_earliest() {
return Ok(mat.ok_or(at));
}
// A match state may also be accelerated. If so, jump 'at' to the
// position reported by the accelerator (or to the end of the span
// if it reports nothing) and skip the 'at += 1' below.
if dfa.is_accel_state(sid) {
let needs = dfa.accelerator(sid);
at = accel::find_fwd(needs, input.haystack(), at)
.unwrap_or(input.end());
continue;
}
} else if dfa.is_accel_state(sid) {
// Same jump as above, but for a non-matching accelerated state.
let needs = dfa.accelerator(sid);
at = accel::find_fwd(needs, input.haystack(), at)
.unwrap_or(input.end());
continue;
} else if dfa.is_dead_state(sid) {
// Stop here and report whatever match (if any) was recorded,
// otherwise the position at which the search stopped.
return Ok(mat.ok_or(at));
} else if dfa.is_quit_state(sid) {
// Report the byte and offset at which the quit state was entered.
return Err(MatchError::quit(input.haystack()[at], at).into());
} else {
// Ideally we wouldn't use a DFA that specialized start states
// and thus 'is_start_state()' could never be true here, but in
// practice we reuse the DFA created for the full regex which
// will specialize start states whenever there is a prefilter.
debug_assert!(dfa.is_start_state(sid));
}
}
at += 1;
}
// Every byte in the span has been consumed. Take one final transition
// (either on the byte just past the span or on end-of-input), which may
// update both 'sid' and 'mat'.
dfa_eoi_fwd(dfa, input, &mut sid, &mut mat)?;
Ok(mat.ok_or(at))
}
/// Runs a forward search with a lazy (hybrid) DFA over the span
/// `input.start()..input.end()`, reporting where the search stopped when no
/// match is found.
///
/// `cache` is the mutable lazy DFA cache passed to every state lookup.
///
/// The return value has three shapes:
///
/// * `Ok(Ok(m))` when a match state was seen; `m` is the most recently
/// recorded `HalfMatch` (or the first one when `input.get_earliest()` is
/// true).
/// * `Ok(Err(at))` when no match was recorded; `at` is the haystack position
/// at which the search stopped.
/// * `Err(_)` when computing the start state fails, when a transition cannot
/// be computed (reported as "gave up" at the current position), when a quit
/// state is entered, or when the end-of-input step in `hybrid_eoi_fwd` fails.
#[cfg(feature = "hybrid")]
pub(crate) fn hybrid_try_search_half_fwd(
dfa: &crate::hybrid::dfa::DFA,
cache: &mut crate::hybrid::dfa::Cache,
input: &Input<'_>,
) -> Result<Result<HalfMatch, usize>, RetryFailError> {
// 'mat' holds the most recent match seen so far, if any.
let mut mat = None;
let mut sid = dfa.start_state_forward(cache, input)?;
let mut at = input.start();
while at < input.end() {
// Any failure to compute the next state is reported as the search
// giving up at the current position, whatever the underlying error was.
sid = dfa
.next_state(cache, sid, input.haystack()[at])
.map_err(|_| MatchError::gave_up(at))?;
// Only tagged state IDs need any inspection; for untagged ones we just
// advance to the next byte.
if sid.is_tagged() {
if sid.is_match() {
let pattern = dfa.match_pattern(cache, sid, 0);
mat = Some(HalfMatch::new(pattern, at));
// The caller asked for the earliest match, so stop at the first
// match state instead of continuing the scan.
if input.get_earliest() {
return Ok(mat.ok_or(at));
}
} else if sid.is_dead() {
// Stop here and report whatever match (if any) was recorded,
// otherwise the position at which the search stopped.
return Ok(mat.ok_or(at));
} else if sid.is_quit() {
// Report the byte and offset at which the quit state was entered.
return Err(MatchError::quit(input.haystack()[at], at).into());
} else {
// We should NEVER get an unknown state ID back from
// dfa.next_state().
debug_assert!(!sid.is_unknown());
// Ideally we wouldn't use a lazy DFA that specialized start
// states and thus 'sid.is_start()' could never be true here,
// but in practice we reuse the lazy DFA created for the full
// regex which will specialize start states whenever there is
// a prefilter.
debug_assert!(sid.is_start());
}
}
at += 1;
}
// Every byte in the span has been consumed. Take one final transition
// (either on the byte just past the span or on end-of-input), which may
// update both 'sid' and 'mat'.
hybrid_eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?;
Ok(mat.ok_or(at))
}
/// Takes the final forward transition of a dense DFA search once every byte
/// in the input's span has been consumed.
///
/// When the haystack has a byte at the span's end offset, the DFA is stepped
/// on that byte. Otherwise, the DFA's end-of-input transition is taken. In
/// both cases `sid` is overwritten with the resulting state, and `mat` is
/// overwritten only if that state is a match state.
///
/// An error is returned only when stepping on the trailing byte enters a
/// quit state.
#[cfg(feature = "dfa-build")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn dfa_eoi_fwd(
    dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
    input: &Input<'_>,
    sid: &mut crate::util::primitives::StateID,
    mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
    use crate::dfa::Automaton;

    let end = input.get_span().end;
    let haystack = input.haystack();

    // The span stops short of the haystack's end: step on the byte that
    // immediately follows the span.
    if let Some(&byte) = haystack.get(end) {
        let next = dfa.next_state(*sid, byte);
        *sid = next;
        if dfa.is_match_state(next) {
            *mat = Some(HalfMatch::new(dfa.match_pattern(next, 0), end));
        } else if dfa.is_quit_state(next) {
            return Err(MatchError::quit(byte, end));
        }
        return Ok(());
    }

    // The span runs to the end of the haystack: take the dedicated
    // end-of-input transition instead.
    let next = dfa.next_eoi_state(*sid);
    *sid = next;
    if dfa.is_match_state(next) {
        *mat = Some(HalfMatch::new(dfa.match_pattern(next, 0), haystack.len()));
    }
    // N.B. There is no 'is_quit' check here because the EOI transition can
    // never lead to a quit state.
    debug_assert!(!dfa.is_quit_state(next));
    Ok(())
}
/// Takes the final forward transition of a lazy DFA search once every byte
/// in the input's span has been consumed.
///
/// When the haystack has a byte at the span's end offset, the lazy DFA is
/// stepped on that byte. Otherwise, its end-of-input transition is taken. On
/// success `sid` is overwritten with the resulting state, and `mat` is
/// overwritten only if that state is a match state.
///
/// An error is returned when either transition cannot be computed (reported
/// as "gave up" at the corresponding offset) or when stepping on the trailing
/// byte enters a quit state.
#[cfg(feature = "hybrid")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn hybrid_eoi_fwd(
    dfa: &crate::hybrid::dfa::DFA,
    cache: &mut crate::hybrid::dfa::Cache,
    input: &Input<'_>,
    sid: &mut crate::hybrid::LazyStateID,
    mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
    let end = input.get_span().end;
    let haystack = input.haystack();

    // The span stops short of the haystack's end: step on the byte that
    // immediately follows the span.
    if let Some(&byte) = haystack.get(end) {
        let next = dfa
            .next_state(cache, *sid, byte)
            .map_err(|_| MatchError::gave_up(end))?;
        *sid = next;
        if next.is_match() {
            let pattern = dfa.match_pattern(cache, next, 0);
            *mat = Some(HalfMatch::new(pattern, end));
        } else if next.is_quit() {
            return Err(MatchError::quit(byte, end));
        }
        return Ok(());
    }

    // The span runs to the end of the haystack: take the dedicated
    // end-of-input transition instead.
    let next = dfa
        .next_eoi_state(cache, *sid)
        .map_err(|_| MatchError::gave_up(haystack.len()))?;
    *sid = next;
    if next.is_match() {
        let pattern = dfa.match_pattern(cache, next, 0);
        *mat = Some(HalfMatch::new(pattern, haystack.len()));
    }
    // N.B. There is no 'is_quit' check here because the EOI transition can
    // never lead to a quit state.
    debug_assert!(!next.is_quit());
    Ok(())
}

1905
vendor/regex-automata/src/meta/strategy.rs vendored Normal file

File diff suppressed because it is too large Load Diff

1351
vendor/regex-automata/src/meta/wrappers.rs vendored Normal file

File diff suppressed because it is too large Load Diff