Vendor dependencies for 0.3.0 release

2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

vendor/regex-automata/tests/dfa/api.rs

@@ -0,0 +1,69 @@
use std::error::Error;
use regex_automata::{
dfa::{dense, Automaton, OverlappingState},
nfa::thompson,
HalfMatch, Input, MatchError,
};
// Tests that quit bytes in the forward direction work correctly.
#[test]
fn quit_fwd() -> Result<(), Box<dyn Error>> {
let dfa = dense::Builder::new()
.configure(dense::Config::new().quit(b'x', true))
.build("[[:word:]]+$")?;
assert_eq!(
Err(MatchError::quit(b'x', 3)),
dfa.try_search_fwd(&Input::new(b"abcxyz"))
);
assert_eq!(
dfa.try_search_overlapping_fwd(
&Input::new(b"abcxyz"),
&mut OverlappingState::start()
),
Err(MatchError::quit(b'x', 3)),
);
Ok(())
}
// Tests that quit bytes in the reverse direction work correctly.
#[test]
fn quit_rev() -> Result<(), Box<dyn Error>> {
let dfa = dense::Builder::new()
.configure(dense::Config::new().quit(b'x', true))
.thompson(thompson::Config::new().reverse(true))
.build("^[[:word:]]+")?;
assert_eq!(
Err(MatchError::quit(b'x', 3)),
dfa.try_search_rev(&Input::new(b"abcxyz"))
);
Ok(())
}
// Tests that if we heuristically enable Unicode word boundaries but then
// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
// will panic.
#[test]
#[should_panic]
fn quit_panics() {
dense::Config::new().unicode_word_boundary(true).quit(b'\xFF', false);
}
// This tests an interesting case where even if the Unicode word boundary option
// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
// word boundaries to be enabled.
#[test]
fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
let mut config = dense::Config::new();
for b in 0x80..=0xFF {
config = config.quit(b, true);
}
let dfa = dense::Builder::new().configure(config).build(r"\b")?;
let expected = HalfMatch::must(0, 1);
assert_eq!(Ok(Some(expected)), dfa.try_search_fwd(&Input::new(b" a")));
Ok(())
}


@@ -0,0 +1,8 @@
#[cfg(all(feature = "dfa-build", feature = "dfa-search"))]
mod api;
#[cfg(feature = "dfa-onepass")]
mod onepass;
#[cfg(all(feature = "dfa-build", feature = "dfa-search"))]
mod regression;
#[cfg(all(not(miri), feature = "dfa-build", feature = "dfa-search"))]
mod suite;


@@ -0,0 +1,2 @@
#[cfg(not(miri))]
mod suite;


@@ -0,0 +1,197 @@
use {
anyhow::Result,
regex_automata::{
dfa::onepass::{self, DFA},
nfa::thompson,
util::{iter, syntax},
},
regex_test::{
CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
TestRunner,
},
};
use crate::{create_input, suite, testify_captures, untestify_kind};
const EXPANSIONS: &[&str] = &["is_match", "find", "captures"];
/// Tests the default configuration of the one-pass DFA.
#[test]
fn default() -> Result<()> {
let builder = DFA::builder();
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests the one-pass DFA when 'starts_for_each_pattern' is enabled for all
/// tests.
#[test]
fn starts_for_each_pattern() -> Result<()> {
let mut builder = DFA::builder();
builder.configure(DFA::config().starts_for_each_pattern(true));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests the one-pass DFA when byte classes are disabled.
///
/// N.B. Disabling byte classes doesn't avoid any indirection at search time.
/// All it does is cause every byte value to be its own distinct equivalence
/// class.
#[test]
fn no_byte_classes() -> Result<()> {
let mut builder = DFA::builder();
builder.configure(DFA::config().byte_classes(false));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
fn compiler(
mut builder: onepass::Builder,
) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
move |test, regexes| {
// Check if the test has a setting that the one-pass DFA doesn't support.
// For example, an unanchored search or an unsupported match kind.
if !configure_onepass_builder(test, &mut builder) {
return Ok(CompiledRegex::skip());
}
let re = match builder.build_many(&regexes) {
Ok(re) => re,
Err(err) => {
let msg = err.to_string();
// This is pretty gross, but when a regex fails to compile as
// a one-pass regex, then we want to be OK with that and just
// skip the test. But we have to be careful to only skip it
// when the expected result is that the regex compiles. If
// the test is specifically checking that the regex does not
// compile, then we should bubble up that error and allow the
// test to pass.
//
// Since our error types are all generally opaque, we just
// look for an error string. Not great, but not the end of the
// world.
if test.compiles() && msg.contains("not one-pass") {
return Ok(CompiledRegex::skip());
}
return Err(err.into());
}
};
let mut cache = re.create_cache();
Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, &mut cache, test)
}))
}
}
fn run_test(
re: &DFA,
cache: &mut onepass::Cache,
test: &RegexTest,
) -> TestResult {
let input = create_input(test);
match test.additional_name() {
"is_match" => {
TestResult::matched(re.is_match(cache, input.earliest(true)))
}
"find" => match test.search_kind() {
SearchKind::Earliest | SearchKind::Leftmost => {
let input =
input.earliest(test.search_kind() == SearchKind::Earliest);
let mut caps = re.create_captures();
let it = iter::Searcher::new(input)
.into_matches_iter(|input| {
re.try_search(cache, input, &mut caps)?;
Ok(caps.get_match())
})
.infallible()
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|m| Match {
id: m.pattern().as_usize(),
span: Span { start: m.start(), end: m.end() },
});
TestResult::matches(it)
}
SearchKind::Overlapping => {
// The one-pass DFA does not support any kind of overlapping
// search. This is not just a matter of not having the API.
// It's fundamentally incompatible with the one-pass concept.
// If overlapping matches were possible, then the one-pass DFA
// would fail to build.
TestResult::skip()
}
},
"captures" => match test.search_kind() {
SearchKind::Earliest | SearchKind::Leftmost => {
let input =
input.earliest(test.search_kind() == SearchKind::Earliest);
let it = iter::Searcher::new(input)
.into_captures_iter(re.create_captures(), |input, caps| {
re.try_search(cache, input, caps)
})
.infallible()
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|caps| testify_captures(&caps));
TestResult::captures(it)
}
SearchKind::Overlapping => {
// The one-pass DFA does not support any kind of overlapping
// search. This is not just a matter of not having the API.
// It's fundamentally incompatible with the one-pass concept.
// If overlapping matches were possible, then the one-pass DFA
// would fail to build.
TestResult::skip()
}
},
name => TestResult::fail(&format!("unrecognized test name: {name}")),
}
}
/// Configures the given regex builder with all relevant settings on the given
/// regex test.
///
/// If the regex test has a setting that is unsupported, then this returns
/// false (implying the test should be skipped).
fn configure_onepass_builder(
test: &RegexTest,
builder: &mut onepass::Builder,
) -> bool {
if !test.anchored() {
return false;
}
let match_kind = match untestify_kind(test.match_kind()) {
None => return false,
Some(k) => k,
};
let config = DFA::config().match_kind(match_kind);
builder
.configure(config)
.syntax(config_syntax(test))
.thompson(config_thompson(test));
true
}
/// Configuration of a Thompson NFA compiler from a regex test.
fn config_thompson(test: &RegexTest) -> thompson::Config {
let mut lookm = regex_automata::util::look::LookMatcher::new();
lookm.set_line_terminator(test.line_terminator());
thompson::Config::new().utf8(test.utf8()).look_matcher(lookm)
}
/// Configuration of the regex parser from a regex test.
fn config_syntax(test: &RegexTest) -> syntax::Config {
syntax::Config::new()
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
.line_terminator(test.line_terminator())
}


@@ -0,0 +1,48 @@
// A regression test for checking that minimization correctly translates
// whether a state is a match state or not. Previously, it was possible for
// minimization to mark a non-matching state as matching.
#[test]
#[cfg(not(miri))]
fn minimize_sets_correct_match_states() {
use regex_automata::{
dfa::{dense::DFA, Automaton, StartKind},
Anchored, Input,
};
let pattern =
// This is a subset of the grapheme matching regex. I couldn't seem
// to get a repro any smaller than this unfortunately.
r"(?x)
(?:
\p{gcb=Prepend}*
(?:
(?:
(?:
\p{gcb=L}*
(?:\p{gcb=V}+|\p{gcb=LV}\p{gcb=V}*|\p{gcb=LVT})
\p{gcb=T}*
)
|
\p{gcb=L}+
|
\p{gcb=T}+
)
|
\p{Extended_Pictographic}
(?:\p{gcb=Extend}*\p{gcb=ZWJ}\p{Extended_Pictographic})*
|
[^\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}]
)
[\p{gcb=Extend}\p{gcb=ZWJ}\p{gcb=SpacingMark}]*
)
";
let dfa = DFA::builder()
.configure(
DFA::config().start_kind(StartKind::Anchored).minimize(true),
)
.build(pattern)
.unwrap();
let input = Input::new(b"\xE2").anchored(Anchored::Yes);
assert_eq!(Ok(None), dfa.try_search_fwd(&input));
}

vendor/regex-automata/tests/dfa/suite.rs

@@ -0,0 +1,443 @@
use {
anyhow::Result,
regex_automata::{
dfa::{
self, dense, regex::Regex, sparse, Automaton, OverlappingState,
StartKind,
},
nfa::thompson,
util::{prefilter::Prefilter, syntax},
Anchored, Input, PatternSet,
},
regex_test::{
CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
TestRunner,
},
};
use crate::{create_input, suite, untestify_kind};
const EXPANSIONS: &[&str] = &["is_match", "find", "which"];
/// Runs the test suite with the default configuration.
#[test]
fn unminimized_default() -> Result<()> {
let builder = Regex::builder();
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), dense_compiler(builder))
.assert();
Ok(())
}
/// Runs the test suite with the default configuration and a prefilter enabled,
/// if one can be built.
#[test]
fn unminimized_prefilter() -> Result<()> {
let my_compiler = |test: &RegexTest, regexes: &[String]| {
// Parse regexes as HIRs so we can get literals to build a prefilter.
let mut hirs = vec![];
for pattern in regexes.iter() {
hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
}
let kind = match untestify_kind(test.match_kind()) {
None => return Ok(CompiledRegex::skip()),
Some(kind) => kind,
};
let pre = Prefilter::from_hirs_prefix(kind, &hirs);
let mut builder = Regex::builder();
builder.dense(dense::DFA::config().prefilter(pre));
compiler(builder, |_, _, re| {
Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, test)
}))
})(test, regexes)
};
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), my_compiler)
.assert();
Ok(())
}
/// Runs the test suite with start states specialized.
#[test]
fn unminimized_specialized_start_states() -> Result<()> {
let mut builder = Regex::builder();
builder.dense(dense::Config::new().specialize_start_states(true));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), dense_compiler(builder))
.assert();
Ok(())
}
/// Runs the test suite with byte classes disabled.
#[test]
fn unminimized_no_byte_class() -> Result<()> {
let mut builder = Regex::builder();
builder.dense(dense::Config::new().byte_classes(false));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), dense_compiler(builder))
.assert();
Ok(())
}
/// Runs the test suite with NFA shrinking enabled.
#[test]
fn unminimized_nfa_shrink() -> Result<()> {
let mut builder = Regex::builder();
builder.thompson(thompson::Config::new().shrink(true));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), dense_compiler(builder))
.assert();
Ok(())
}
/// Runs the test suite on a minimized DFA with an otherwise default
/// configuration.
#[test]
fn minimized_default() -> Result<()> {
let mut builder = Regex::builder();
builder.dense(dense::Config::new().minimize(true));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), dense_compiler(builder))
.assert();
Ok(())
}
/// Runs the test suite on a minimized DFA with byte classes disabled.
#[test]
fn minimized_no_byte_class() -> Result<()> {
let mut builder = Regex::builder();
builder.dense(dense::Config::new().minimize(true).byte_classes(false));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), dense_compiler(builder))
.assert();
Ok(())
}
/// Runs the test suite on a sparse unminimized DFA.
#[test]
fn sparse_unminimized_default() -> Result<()> {
let builder = Regex::builder();
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), sparse_compiler(builder))
.assert();
Ok(())
}
/// Runs the test suite on a sparse unminimized DFA with prefilters enabled.
#[test]
fn sparse_unminimized_prefilter() -> Result<()> {
let my_compiler = |test: &RegexTest, regexes: &[String]| {
// Parse regexes as HIRs so we can get literals to build a prefilter.
let mut hirs = vec![];
for pattern in regexes.iter() {
hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
}
let kind = match untestify_kind(test.match_kind()) {
None => return Ok(CompiledRegex::skip()),
Some(kind) => kind,
};
let pre = Prefilter::from_hirs_prefix(kind, &hirs);
let mut builder = Regex::builder();
builder.dense(dense::DFA::config().prefilter(pre));
compiler(builder, |builder, _, re| {
let fwd = re.forward().to_sparse()?;
let rev = re.reverse().to_sparse()?;
let re = builder.build_from_dfas(fwd, rev);
Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, test)
}))
})(test, regexes)
};
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), my_compiler)
.assert();
Ok(())
}
/// Another basic sanity test that checks we can serialize and then deserialize
/// a regex, and that the resulting regex can be used for searching correctly.
#[test]
fn serialization_unminimized_default() -> Result<()> {
let builder = Regex::builder();
let my_compiler = |builder| {
compiler(builder, |builder, _, re| {
let builder = builder.clone();
let (fwd_bytes, _) = re.forward().to_bytes_native_endian();
let (rev_bytes, _) = re.reverse().to_bytes_native_endian();
Ok(CompiledRegex::compiled(move |test| -> TestResult {
let fwd: dense::DFA<&[u32]> =
dense::DFA::from_bytes(&fwd_bytes).unwrap().0;
let rev: dense::DFA<&[u32]> =
dense::DFA::from_bytes(&rev_bytes).unwrap().0;
let re = builder.build_from_dfas(fwd, rev);
run_test(&re, test)
}))
})
};
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), my_compiler(builder))
.assert();
Ok(())
}
/// A basic sanity test that checks we can serialize and then deserialize a
/// regex using sparse DFAs, and that the resulting regex can be used for
/// searching correctly.
#[test]
fn sparse_serialization_unminimized_default() -> Result<()> {
let builder = Regex::builder();
let my_compiler = |builder| {
compiler(builder, |builder, _, re| {
let builder = builder.clone();
let fwd_bytes = re.forward().to_sparse()?.to_bytes_native_endian();
let rev_bytes = re.reverse().to_sparse()?.to_bytes_native_endian();
Ok(CompiledRegex::compiled(move |test| -> TestResult {
let fwd: sparse::DFA<&[u8]> =
sparse::DFA::from_bytes(&fwd_bytes).unwrap().0;
let rev: sparse::DFA<&[u8]> =
sparse::DFA::from_bytes(&rev_bytes).unwrap().0;
let re = builder.build_from_dfas(fwd, rev);
run_test(&re, test)
}))
})
};
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), my_compiler(builder))
.assert();
Ok(())
}
fn dense_compiler(
builder: dfa::regex::Builder,
) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
compiler(builder, |_, _, re| {
Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, test)
}))
})
}
fn sparse_compiler(
builder: dfa::regex::Builder,
) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
compiler(builder, |builder, _, re| {
let fwd = re.forward().to_sparse()?;
let rev = re.reverse().to_sparse()?;
let re = builder.build_from_dfas(fwd, rev);
Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, test)
}))
})
}
fn compiler(
mut builder: dfa::regex::Builder,
mut create_matcher: impl FnMut(
&dfa::regex::Builder,
Option<Prefilter>,
Regex,
) -> Result<CompiledRegex>,
) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
move |test, regexes| {
// Parse regexes as HIRs for some analysis below.
let mut hirs = vec![];
for pattern in regexes.iter() {
hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
}
// Get a prefilter in case the test wants it.
let kind = match untestify_kind(test.match_kind()) {
None => return Ok(CompiledRegex::skip()),
Some(kind) => kind,
};
let pre = Prefilter::from_hirs_prefix(kind, &hirs);
// Check if our regex contains things that aren't supported by DFAs.
// That is, Unicode word boundaries when searching non-ASCII text.
if !test.haystack().is_ascii() {
for hir in hirs.iter() {
if hir.properties().look_set().contains_word_unicode() {
return Ok(CompiledRegex::skip());
}
}
}
if !configure_regex_builder(test, &mut builder) {
return Ok(CompiledRegex::skip());
}
create_matcher(&builder, pre, builder.build_many(&regexes)?)
}
}
fn run_test<A: Automaton>(re: &Regex<A>, test: &RegexTest) -> TestResult {
let input = create_input(test);
match test.additional_name() {
"is_match" => TestResult::matched(re.is_match(input.earliest(true))),
"find" => match test.search_kind() {
SearchKind::Earliest | SearchKind::Leftmost => {
let input =
input.earliest(test.search_kind() == SearchKind::Earliest);
TestResult::matches(
re.find_iter(input)
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|m| Match {
id: m.pattern().as_usize(),
span: Span { start: m.start(), end: m.end() },
}),
)
}
SearchKind::Overlapping => {
try_search_overlapping(re, &input).unwrap()
}
},
"which" => match test.search_kind() {
SearchKind::Earliest | SearchKind::Leftmost => {
// There are no "which" APIs for standard searches.
TestResult::skip()
}
SearchKind::Overlapping => {
let dfa = re.forward();
let mut patset = PatternSet::new(dfa.pattern_len());
dfa.try_which_overlapping_matches(&input, &mut patset)
.unwrap();
TestResult::which(patset.iter().map(|p| p.as_usize()))
}
},
name => TestResult::fail(&format!("unrecognized test name: {name}")),
}
}
/// Configures the given regex builder with all relevant settings on the given
/// regex test.
///
/// If the regex test has a setting that is unsupported, then this returns
/// false (implying the test should be skipped).
fn configure_regex_builder(
test: &RegexTest,
builder: &mut dfa::regex::Builder,
) -> bool {
let match_kind = match untestify_kind(test.match_kind()) {
None => return false,
Some(k) => k,
};
let starts = if test.anchored() {
StartKind::Anchored
} else {
StartKind::Unanchored
};
let mut dense_config = dense::Config::new()
.start_kind(starts)
.match_kind(match_kind)
.unicode_word_boundary(true);
// When doing an overlapping search, we might try to find the start of each
// match with a custom search routine. In that case, we need to tell the
// reverse search (for the start offset) which pattern to look for. The
// only way that API works is when anchored starting states are compiled
// for each pattern. This does technically also enable it for the forward
// DFA, but we're okay with that.
if test.search_kind() == SearchKind::Overlapping {
dense_config = dense_config.starts_for_each_pattern(true);
}
builder
.syntax(config_syntax(test))
.thompson(config_thompson(test))
.dense(dense_config);
true
}
/// Configuration of a Thompson NFA compiler from a regex test.
fn config_thompson(test: &RegexTest) -> thompson::Config {
let mut lookm = regex_automata::util::look::LookMatcher::new();
lookm.set_line_terminator(test.line_terminator());
thompson::Config::new().utf8(test.utf8()).look_matcher(lookm)
}
/// Configuration of the regex syntax from a regex test.
fn config_syntax(test: &RegexTest) -> syntax::Config {
syntax::Config::new()
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
.line_terminator(test.line_terminator())
}
/// Execute an overlapping search, and for each match found, also find its
/// overlapping starting positions.
///
/// N.B. This routine used to be part of the crate API, but 1) it wasn't clear
/// to me how useful it was and 2) it wasn't clear to me what its semantics
/// should be. In particular, a potentially surprising footgun of this routine
/// is that it is worst case *quadratic* in the size of the haystack. Namely, it's
/// possible to report a match at every position, and for every such position,
/// scan all the way to the beginning of the haystack to find the starting
/// position. Typical leftmost non-overlapping searches don't suffer from this
/// because, well, matches can't overlap. So subsequent searches after a match
/// is found don't revisit previously scanned parts of the haystack.
///
/// Its semantics can be strange for other reasons too. For example, given
/// the regex '.*' and the haystack 'zz', the full set of overlapping matches
/// is: [0, 0], [1, 1], [0, 1], [2, 2], [1, 2], [0, 2]. The ordering of
/// those matches is quite strange, but makes sense when you think about the
/// implementation: an end offset is found left-to-right, and then one or more
/// starting offsets are found right-to-left.
///
/// Nevertheless, we provide this routine in our test suite because it's
/// useful to test the low level DFA overlapping search and our test suite
/// is written in a way that requires starting offsets.
fn try_search_overlapping<A: Automaton>(
re: &Regex<A>,
input: &Input<'_>,
) -> Result<TestResult> {
let mut matches = vec![];
let mut fwd_state = OverlappingState::start();
let (fwd_dfa, rev_dfa) = (re.forward(), re.reverse());
while let Some(end) = {
fwd_dfa.try_search_overlapping_fwd(input, &mut fwd_state)?;
fwd_state.get_match()
} {
let revsearch = input
.clone()
.range(input.start()..end.offset())
.anchored(Anchored::Pattern(end.pattern()))
.earliest(false);
let mut rev_state = OverlappingState::start();
while let Some(start) = {
rev_dfa.try_search_overlapping_rev(&revsearch, &mut rev_state)?;
rev_state.get_match()
} {
let span = Span { start: start.offset(), end: end.offset() };
let mat = Match { id: end.pattern().as_usize(), span };
matches.push(mat);
}
}
Ok(TestResult::matches(matches))
}


@@ -0,0 +1,52 @@
// This test was found by a fuzzer input that crafted a way to provide
// an invalid serialization of ByteClasses that passed our verification.
// Specifically, the verification step in the deserialization of ByteClasses
// used an iterator that depends on part of the serialized bytes being correct.
// (Namely, the encoding of the number of classes.)
#[test]
fn invalid_byte_classes() {
let data = include_bytes!(
"testdata/deserialize_dense_crash-9486fb7c8a93b12c12a62166b43d31640c0208a9",
);
let _ = fuzz_run(data);
}
#[test]
fn invalid_byte_classes_min() {
let data = include_bytes!(
"testdata/deserialize_dense_minimized-from-9486fb7c8a93b12c12a62166b43d31640c0208a9",
);
let _ = fuzz_run(data);
}
// This is the code from the fuzz target. Kind of sucks to duplicate it here,
// but this is fundamentally how we interpret the data.
fn fuzz_run(given_data: &[u8]) -> Option<()> {
use regex_automata::dfa::Automaton;
if given_data.len() < 2 {
return None;
}
let haystack_len = usize::from(given_data[0]);
let haystack = given_data.get(1..1 + haystack_len)?;
let given_dfa_bytes = given_data.get(1 + haystack_len..)?;
// We help the fuzzer along by adding a preamble to the bytes that should
// at least make these first parts valid. The preamble expects a very
// specific sequence of bytes, so it makes sense to just force this.
let label = "rust-regex-automata-dfa-dense\x00\x00\x00";
assert_eq!(0, label.len() % 4);
let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec();
let version_check = 2u32.to_ne_bytes().to_vec();
let mut dfa_bytes: Vec<u8> = vec![];
dfa_bytes.extend(label.as_bytes());
dfa_bytes.extend(&endianness_check);
dfa_bytes.extend(&version_check);
dfa_bytes.extend(given_dfa_bytes);
// This is the real test: checking that any input we give to
// DFA::from_bytes will never result in a panic.
let (dfa, _) =
regex_automata::dfa::dense::DFA::from_bytes(&dfa_bytes).ok()?;
let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack));
Some(())
}


@@ -0,0 +1,2 @@
mod dense;
mod sparse;


@@ -0,0 +1,132 @@
// This is a regression test for a bug in how special states are handled. The
// fuzzer found a case where a state returned true for 'is_special_state' but
// *didn't* return true for 'is_dead_state', 'is_quit_state', 'is_match_state',
// 'is_start_state' or 'is_accel_state'. This in turn tripped a debug assertion
// in the core matching loop that requires 'is_special_state' being true to
// imply that one of the other routines returns true.
//
// We fixed this by adding some validation to both dense and sparse DFAs that
// checks that this property is true for every state ID in the DFA.
#[test]
fn invalid_special_state() {
let data = include_bytes!(
"testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421505838",
);
let _ = fuzz_run(data);
}
// This is an interesting case where a fuzzer generated a DFA with
// a transition to a state ID that decoded as a valid state, but
// where the ID itself did not point to one of the two existing
// states for this particular DFA. This combined with marking this
// transition's state ID as special but without actually making one of the
// 'is_{dead,quit,match,start,accel}_state' predicates return true ended up
// tripping the 'debug_assert(dfa.is_quit_state(sid))' code in the search
// routine.
//
// We fixed this in alloc mode by checking that every transition points to a
// valid state ID. Technically this bug still exists in core-only mode, but
// it's not clear how to fix it. And it's worth pointing out that the search
// routine won't panic in production. It will just provide invalid results. And
// that's acceptable within the contract of DFA::from_bytes.
#[test]
fn transition_to_invalid_but_valid_state() {
let data = include_bytes!(
"testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd1460eff9",
);
let _ = fuzz_run(data);
}
// Another one caught by the fuzzer where it generated a DFA that reported a
// start state as a match state. Since matches are always delayed by one byte,
// start states specifically cannot be match states. And indeed, the search
// code relies on this.
#[test]
fn start_state_is_not_match_state() {
let data = include_bytes!(
"testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c72b000",
);
let _ = fuzz_run(data);
}
// This is a variation on 'transition_to_invalid_but_valid_state', but happens
// to a start state. Namely, the fuzz data here builds a DFA with a start
// state ID that is incorrect but points to a sequence of bytes that satisfies
// state decoding validation. This errant state in turn has a non-zero number
// of transitions, and it's those transitions that point to a state that does
// *not* satisfy state decoding validation. But we never checked those. So the
// fix here was to add validation of the transitions off of the start state.
#[test]
fn start_state_has_valid_transitions() {
let data = include_bytes!(
"testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd234b98",
);
let _ = fuzz_run(data);
}
// This fuzz input generated a DFA with a state whose ID was in the match state
// ID range, but where the state itself was encoded with zero pattern IDs. We
// added validation code to check this case.
#[test]
fn match_state_inconsistency() {
let data = include_bytes!(
"testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011816570",
);
let _ = fuzz_run(data);
}
// This fuzz input generated a DFA with a state whose ID was in the accelerator
// range, but which didn't have any accelerators. This violated an invariant that
// assumes that if 'dfa.is_accel_state(sid)' returns true, then the state must
// have some accelerators.
#[test]
fn invalid_accelerators() {
let data = include_bytes!(
"testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420449e2b",
);
let _ = fuzz_run(data);
}
// This fuzz input generated a DFA with a state whose EOI transition led to
// a quit state, which is generally considered illegal. Why? Because the EOI
// transition is defined over a special sentinel alphabet element and one
// cannot configure a DFA to "quit" on that sentinel.
#[test]
fn eoi_transition_to_quit_state() {
let data = include_bytes!(
"testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf65efa9",
);
let _ = fuzz_run(data);
}
// This is the code from the fuzz target. Kind of sucks to duplicate it here,
// but this is fundamentally how we interpret the data.
fn fuzz_run(given_data: &[u8]) -> Option<()> {
use regex_automata::dfa::Automaton;
if given_data.len() < 2 {
return None;
}
let haystack_len = usize::from(given_data[0]);
let haystack = given_data.get(1..1 + haystack_len)?;
let given_dfa_bytes = given_data.get(1 + haystack_len..)?;
// We help the fuzzer along by adding a preamble to the bytes that should
// at least make these first parts valid. The preamble expects a very
// specific sequence of bytes, so it makes sense to just force this.
let label = "rust-regex-automata-dfa-sparse\x00\x00";
assert_eq!(0, label.len() % 4);
let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec();
let version_check = 2u32.to_ne_bytes().to_vec();
let mut dfa_bytes: Vec<u8> = vec![];
dfa_bytes.extend(label.as_bytes());
dfa_bytes.extend(&endianness_check);
dfa_bytes.extend(&version_check);
dfa_bytes.extend(given_dfa_bytes);
// This is the real test: checking that any input we give to
// DFA::from_bytes will never result in a panic.
let (dfa, _) =
regex_automata::dfa::sparse::DFA::from_bytes(&dfa_bytes).ok()?;
let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack));
Some(())
}


@@ -0,0 +1,65 @@
This directory contains tests for serialized objects from the regex-automata
crate. Currently, only two such objects are supported: dense and sparse DFAs.

The idea behind these tests is to commit some serialized objects and run some
basic tests: deserialize them, run searches and check that the results are
correct. We also make sure these are run under Miri, since deserialization is
one of the biggest places where undefined behavior might occur in this crate
(at the time of writing).

The main thing we're testing is that the *current* code can still deserialize
*old* objects correctly. Generally speaking, compatibility extends to
semver-compatible releases of this crate. Beyond that, no promises are made,
although in practice callers can at least depend on errors occurring. (The
serialized format always includes a version number, and incompatible changes
increment that version number such that an error will occur if an unsupported
version is detected.)
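
As a quick illustration of that promise, handing `DFA::from_bytes` bytes that
lack a valid label, endianness check or version number should produce an error
rather than a panic. A minimal sketch (not part of this suite; it assumes a
build of regex-automata with the DFA APIs enabled):

```
use regex_automata::dfa::dense;

fn main() {
    // An empty slice can't contain the required label, endianness check or
    // version number, so deserialization fails with an error instead of
    // panicking or invoking undefined behavior.
    assert!(dense::DFA::from_bytes(&[]).is_err());
}
```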
To generate the dense DFAs, I used this command:
```
$ regex-cli generate serialize dense regex \
MULTI_PATTERN_V2 \
tests/gen/dense/ \
--rustfmt \
--safe \
--starts-for-each-pattern \
--specialize-start-states \
--start-kind both \
--unicode-word-boundary \
--minimize \
'\b[a-zA-Z]+\b' \
'(?m)^\S+$' \
'(?Rm)^\S+$'
```
And to generate the sparse DFAs, I used this command, which is the same as
above, but with `s/dense/sparse/g`.
```
$ regex-cli generate serialize sparse regex \
MULTI_PATTERN_V2 \
tests/gen/sparse/ \
--rustfmt \
--safe \
--starts-for-each-pattern \
--specialize-start-states \
--start-kind both \
--unicode-word-boundary \
--minimize \
'\b[a-zA-Z]+\b' \
'(?m)^\S+$' \
'(?Rm)^\S+$'
```
The idea is to try to enable as many of the DFA's options as possible in order
to test that serialization works for all of them.

Arguably we should increase test coverage here, but this is a start. Note
that, in particular, this does not need to test that serialization and
deserialization correctly round-trip on their own. Indeed, the normal regex
test suite has a test that does a serialization round trip for every test
supported by DFAs. So that has very good coverage. What we're interested in
testing here is our compatibility promise: do DFAs generated with an older
revision of the code still deserialize correctly?
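
For reference, the round trip exercised by the normal regex test suite looks
roughly like the following sketch (illustrative only, assuming the `dfa-build`
feature; it is not the actual test code):

```
use regex_automata::{
    dfa::{dense, Automaton},
    Input,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = dense::DFA::new("foo[0-9]+")?;
    // 'pad' is the number of padding bytes at the start of the buffer that
    // were inserted so that the serialized DFA is properly aligned.
    let (bytes, pad) = dfa.to_bytes_native_endian();
    let dfa2: dense::DFA<&[u32]> = dense::DFA::from_bytes(&bytes[pad..])?.0;
    // Both the original and the deserialized DFA must agree on searches.
    let input = Input::new("foo123");
    assert_eq!(dfa.try_search_fwd(&input)?, dfa2.try_search_fwd(&input)?);
    Ok(())
}
```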


@@ -0,0 +1,22 @@
use regex_automata::{Input, Match};
mod multi_pattern_v2;
#[test]
fn multi_pattern_v2() {
use multi_pattern_v2::MULTI_PATTERN_V2 as RE;
assert_eq!(Some(Match::must(0, 0..4)), RE.find("abcd"));
assert_eq!(Some(Match::must(0, 2..6)), RE.find("@ abcd @"));
assert_eq!(Some(Match::must(1, 0..6)), RE.find("@abcd@"));
assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd\n"));
assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd wxyz\n"));
assert_eq!(Some(Match::must(1, 1..7)), RE.find("\n@abcd@\n"));
assert_eq!(Some(Match::must(2, 0..6)), RE.find("@abcd@\r\n"));
assert_eq!(Some(Match::must(1, 2..8)), RE.find("\r\n@abcd@"));
assert_eq!(Some(Match::must(2, 2..8)), RE.find("\r\n@abcd@\r\n"));
// Fails because we have heuristic support for Unicode word boundaries
// enabled.
assert!(RE.try_search(&Input::new(b"\xFF@abcd@\xFF")).is_err());
}


@@ -0,0 +1,43 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// regex-cli generate serialize dense regex MULTI_PATTERN_V2 tests/gen/dense/ --rustfmt --safe --starts-for-each-pattern --specialize-start-states --start-kind both --unicode-word-boundary --minimize \b[a-zA-Z]+\b (?m)^\S+$ (?Rm)^\S+$
//
// regex-cli 0.0.1 is available on crates.io.
use regex_automata::{
dfa::{dense::DFA, regex::Regex},
util::{lazy::Lazy, wire::AlignAs},
};
pub static MULTI_PATTERN_V2: Lazy<Regex<DFA<&'static [u32]>>> =
Lazy::new(|| {
let dfafwd = {
static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
_align: [],
#[cfg(target_endian = "big")]
bytes: *include_bytes!("multi_pattern_v2_fwd.bigendian.dfa"),
#[cfg(target_endian = "little")]
bytes: *include_bytes!(
"multi_pattern_v2_fwd.littleendian.dfa"
),
};
DFA::from_bytes(&ALIGNED.bytes)
.expect("serialized forward DFA should be valid")
.0
};
let dfarev = {
static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
_align: [],
#[cfg(target_endian = "big")]
bytes: *include_bytes!("multi_pattern_v2_rev.bigendian.dfa"),
#[cfg(target_endian = "little")]
bytes: *include_bytes!(
"multi_pattern_v2_rev.littleendian.dfa"
),
};
DFA::from_bytes(&ALIGNED.bytes)
.expect("serialized reverse DFA should be valid")
.0
};
Regex::builder().build_from_dfas(dfafwd, dfarev)
});


@@ -0,0 +1,2 @@
mod dense;
mod sparse;


@@ -0,0 +1,22 @@
use regex_automata::{Input, Match};
mod multi_pattern_v2;
#[test]
fn multi_pattern_v2() {
use multi_pattern_v2::MULTI_PATTERN_V2 as RE;
assert_eq!(Some(Match::must(0, 0..4)), RE.find("abcd"));
assert_eq!(Some(Match::must(0, 2..6)), RE.find("@ abcd @"));
assert_eq!(Some(Match::must(1, 0..6)), RE.find("@abcd@"));
assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd\n"));
assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd wxyz\n"));
assert_eq!(Some(Match::must(1, 1..7)), RE.find("\n@abcd@\n"));
assert_eq!(Some(Match::must(2, 0..6)), RE.find("@abcd@\r\n"));
assert_eq!(Some(Match::must(1, 2..8)), RE.find("\r\n@abcd@"));
assert_eq!(Some(Match::must(2, 2..8)), RE.find("\r\n@abcd@\r\n"));
// Fails because we have heuristic support for Unicode word boundaries
// enabled.
assert!(RE.try_search(&Input::new(b"\xFF@abcd@\xFF")).is_err());
}


@@ -0,0 +1,37 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// regex-cli generate serialize sparse regex MULTI_PATTERN_V2 regex-automata/tests/gen/sparse/ --rustfmt --safe --starts-for-each-pattern --specialize-start-states --start-kind both --unicode-word-boundary --minimize \b[a-zA-Z]+\b (?m)^\S+$ (?Rm)^\S+$
//
// regex-cli 0.0.1 is available on crates.io.
use regex_automata::{
dfa::{regex::Regex, sparse::DFA},
util::lazy::Lazy,
};
pub static MULTI_PATTERN_V2: Lazy<Regex<DFA<&'static [u8]>>> =
Lazy::new(|| {
let dfafwd = {
#[cfg(target_endian = "big")]
static BYTES: &'static [u8] =
include_bytes!("multi_pattern_v2_fwd.bigendian.dfa");
#[cfg(target_endian = "little")]
static BYTES: &'static [u8] =
include_bytes!("multi_pattern_v2_fwd.littleendian.dfa");
DFA::from_bytes(BYTES)
.expect("serialized forward DFA should be valid")
.0
};
let dfarev = {
#[cfg(target_endian = "big")]
static BYTES: &'static [u8] =
include_bytes!("multi_pattern_v2_rev.bigendian.dfa");
#[cfg(target_endian = "little")]
static BYTES: &'static [u8] =
include_bytes!("multi_pattern_v2_rev.littleendian.dfa");
DFA::from_bytes(BYTES)
.expect("serialized reverse DFA should be valid")
.0
};
Regex::builder().build_from_dfas(dfafwd, dfarev)
});


@@ -0,0 +1,171 @@
use std::error::Error;
use regex_automata::{
hybrid::dfa::{OverlappingState, DFA},
nfa::thompson,
HalfMatch, Input, MatchError,
};
// Tests that too many cache resets cause the lazy DFA to quit.
//
// We only test this on 64-bit because the test is gingerly crafted based on
// implementation details of cache sizes. It's not a great test because of
// that, but it does check some interesting properties around how positions are
// reported when a search "gives up."
//
// NOTE: If you change something in lazy DFA implementation that causes this
// test to fail by reporting different "gave up" positions, then it's generally
// okay to update the positions in the test below as long as you're sure your
// changes are correct. Namely, it is expected that if there are changes in the
// cache size (or changes in how big things are inside the cache), then its
// utilization may change slightly and thus impact where a search gives up.
// Precisely where a search gives up is not an API guarantee, so changing the
// offsets here is OK.
#[test]
#[cfg(target_pointer_width = "64")]
#[cfg(not(miri))]
fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
// This is a carefully chosen regex. The idea is to pick one that requires
// some decent number of states (hence the bounded repetition). But we
// specifically choose to create a class with an ASCII letter and a
// non-ASCII letter so that we can check that no new states are created
// once the cache is full. Namely, if we fill up the cache on a haystack
// of 'a's, then in order to match one 'β', a new state will need to be
// created since a 'β' is encoded with multiple bytes.
//
// So we proceed by "filling" up the cache by searching a haystack of just
// 'a's. The cache won't have enough room to add enough states to find the
// match (because of the bounded repetition), which should result in it
// giving up before it finds a match.
//
// Since there's now no more room to create states, we search a haystack
// of 'β' and confirm that it gives up immediately.
let pattern = r"[aβ]{99}";
let dfa = DFA::builder()
.configure(
// Configure it so that we have the minimum cache capacity
// possible. And that if any resets occur, the search quits.
DFA::config()
.skip_cache_capacity_check(true)
.cache_capacity(0)
.minimum_cache_clear_count(Some(0)),
)
.thompson(thompson::NFA::config())
.build(pattern)?;
let mut cache = dfa.create_cache();
let haystack = "a".repeat(101).into_bytes();
let err = MatchError::gave_up(24);
// Notice that we make the same amount of progress in each search! That's
// because the cache is reused and already has states to handle the first
// N bytes.
assert_eq!(
Err(err.clone()),
dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
);
assert_eq!(
Err(err.clone()),
dfa.try_search_overlapping_fwd(
&mut cache,
&Input::new(&haystack),
&mut OverlappingState::start()
),
);
let haystack = "β".repeat(101).into_bytes();
let err = MatchError::gave_up(2);
assert_eq!(
Err(err),
dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
);
// no need to test that other find routines quit, since we did that above
// OK, if we reset the cache, then we should be able to create more states
// and make more progress with searching for betas.
cache.reset(&dfa);
let err = MatchError::gave_up(26);
assert_eq!(
Err(err),
dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
);
// ... switching back to ASCII still makes progress since it just needs to
// set transitions on existing states!
let haystack = "a".repeat(101).into_bytes();
let err = MatchError::gave_up(13);
assert_eq!(
Err(err),
dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
);
Ok(())
}
// Tests that quit bytes in the forward direction work correctly.
#[test]
fn quit_fwd() -> Result<(), Box<dyn Error>> {
let dfa = DFA::builder()
.configure(DFA::config().quit(b'x', true))
.build("[[:word:]]+$")?;
let mut cache = dfa.create_cache();
assert_eq!(
dfa.try_search_fwd(&mut cache, &Input::new("abcxyz")),
Err(MatchError::quit(b'x', 3)),
);
assert_eq!(
dfa.try_search_overlapping_fwd(
&mut cache,
&Input::new(b"abcxyz"),
&mut OverlappingState::start()
),
Err(MatchError::quit(b'x', 3)),
);
Ok(())
}
// Tests that quit bytes in the reverse direction work correctly.
#[test]
fn quit_rev() -> Result<(), Box<dyn Error>> {
let dfa = DFA::builder()
.configure(DFA::config().quit(b'x', true))
.thompson(thompson::Config::new().reverse(true))
.build("^[[:word:]]+")?;
let mut cache = dfa.create_cache();
assert_eq!(
dfa.try_search_rev(&mut cache, &Input::new("abcxyz")),
Err(MatchError::quit(b'x', 3)),
);
Ok(())
}
// Tests that if we heuristically enable Unicode word boundaries but then
// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
// will panic.
#[test]
#[should_panic]
fn quit_panics() {
DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
}
// This tests an interesting case where even if the Unicode word boundary option
// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
// word boundaries to be enabled.
#[test]
fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
let mut config = DFA::config();
for b in 0x80..=0xFF {
config = config.quit(b, true);
}
let dfa = DFA::builder().configure(config).build(r"\b")?;
let mut cache = dfa.create_cache();
let expected = HalfMatch::must(0, 1);
assert_eq!(
Ok(Some(expected)),
dfa.try_search_fwd(&mut cache, &Input::new(" a")),
);
Ok(())
}


@@ -0,0 +1,3 @@
mod api;
#[cfg(not(miri))]
mod suite;


@@ -0,0 +1,347 @@
use {
anyhow::Result,
regex_automata::{
hybrid::{
dfa::{OverlappingState, DFA},
regex::{self, Regex},
},
nfa::thompson,
util::{prefilter::Prefilter, syntax},
Anchored, Input, PatternSet,
},
regex_test::{
CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
TestRunner,
},
};
use crate::{create_input, suite, untestify_kind};
const EXPANSIONS: &[&str] = &["is_match", "find", "which"];
/// Tests the default configuration of the hybrid NFA/DFA.
#[test]
fn default() -> Result<()> {
let builder = Regex::builder();
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
// Without NFA shrinking, this test blows the default cache capacity.
.blacklist("expensive/regression-many-repeat-no-stack-overflow")
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests the hybrid NFA/DFA with prefilters enabled.
#[test]
fn prefilter() -> Result<()> {
let my_compiler = |test: &RegexTest, regexes: &[String]| {
// Parse regexes as HIRs so we can get literals to build a prefilter.
let mut hirs = vec![];
for pattern in regexes.iter() {
hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
}
let kind = match untestify_kind(test.match_kind()) {
None => return Ok(CompiledRegex::skip()),
Some(kind) => kind,
};
let pre = Prefilter::from_hirs_prefix(kind, &hirs);
let mut builder = Regex::builder();
builder.dfa(DFA::config().prefilter(pre));
compiler(builder)(test, regexes)
};
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
// Without NFA shrinking, this test blows the default cache capacity.
.blacklist("expensive/regression-many-repeat-no-stack-overflow")
.test_iter(suite()?.iter(), my_compiler)
.assert();
Ok(())
}
/// Tests the hybrid NFA/DFA with NFA shrinking enabled.
///
/// This is *usually* not the configuration one wants for a lazy DFA. NFA
/// shrinking is mostly only advantageous when building a full DFA since it
/// can sharply decrease the amount of time determinization takes. But NFA
/// shrinking is itself otherwise fairly expensive currently. Since a lazy DFA
/// has no compilation time (other than for building the NFA of course) before
/// executing a search, it's usually worth it to forgo NFA shrinking.
///
/// Nevertheless, we test to make sure everything is OK with NFA shrinking. As
/// a bonus, there are some tests we don't need to skip because they now fit in
/// the default cache capacity.
#[test]
fn nfa_shrink() -> Result<()> {
let mut builder = Regex::builder();
builder.thompson(thompson::Config::new().shrink(true));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests the hybrid NFA/DFA when 'starts_for_each_pattern' is enabled for all
/// tests.
#[test]
fn starts_for_each_pattern() -> Result<()> {
let mut builder = Regex::builder();
builder.dfa(DFA::config().starts_for_each_pattern(true));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
// Without NFA shrinking, this test blows the default cache capacity.
.blacklist("expensive/regression-many-repeat-no-stack-overflow")
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests the hybrid NFA/DFA when 'specialize_start_states' is enabled.
#[test]
fn specialize_start_states() -> Result<()> {
let mut builder = Regex::builder();
builder.dfa(DFA::config().specialize_start_states(true));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
// Without NFA shrinking, this test blows the default cache capacity.
.blacklist("expensive/regression-many-repeat-no-stack-overflow")
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests the hybrid NFA/DFA when byte classes are disabled.
///
/// N.B. Disabling byte classes doesn't avoid any indirection at search time.
/// All it does is cause every byte value to be its own distinct equivalence
/// class.
#[test]
fn no_byte_classes() -> Result<()> {
let mut builder = Regex::builder();
builder.dfa(DFA::config().byte_classes(false));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
// Without NFA shrinking, this test blows the default cache capacity.
.blacklist("expensive/regression-many-repeat-no-stack-overflow")
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests that the hybrid NFA/DFA never clears its cache for any test with the
/// default capacity.
///
/// N.B. If a regex suite test is added that causes the cache to be cleared,
/// then this should just skip that test. (Which can be done by calling the
/// 'blacklist' method on 'TestRunner'.)
#[test]
fn no_cache_clearing() -> Result<()> {
let mut builder = Regex::builder();
builder.dfa(DFA::config().minimum_cache_clear_count(Some(0)));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
// Without NFA shrinking, this test blows the default cache capacity.
.blacklist("expensive/regression-many-repeat-no-stack-overflow")
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests the hybrid NFA/DFA when the minimum cache capacity is set.
#[test]
fn min_cache_capacity() -> Result<()> {
let mut builder = Regex::builder();
builder
.dfa(DFA::config().cache_capacity(0).skip_cache_capacity_check(true));
TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
fn compiler(
mut builder: regex::Builder,
) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
move |test, regexes| {
// Parse regexes as HIRs for some analysis below.
let mut hirs = vec![];
for pattern in regexes.iter() {
hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
}
// Check if our regex contains things that aren't supported by DFAs.
// That is, Unicode word boundaries when searching non-ASCII text.
if !test.haystack().is_ascii() {
for hir in hirs.iter() {
if hir.properties().look_set().contains_word_unicode() {
return Ok(CompiledRegex::skip());
}
}
}
if !configure_regex_builder(test, &mut builder) {
return Ok(CompiledRegex::skip());
}
let re = builder.build_many(&regexes)?;
let mut cache = re.create_cache();
Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, &mut cache, test)
}))
}
}
fn run_test(
re: &Regex,
cache: &mut regex::Cache,
test: &RegexTest,
) -> TestResult {
let input = create_input(test);
match test.additional_name() {
"is_match" => {
TestResult::matched(re.is_match(cache, input.earliest(true)))
}
"find" => match test.search_kind() {
SearchKind::Earliest | SearchKind::Leftmost => {
let input =
input.earliest(test.search_kind() == SearchKind::Earliest);
TestResult::matches(
re.find_iter(cache, input)
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|m| Match {
id: m.pattern().as_usize(),
span: Span { start: m.start(), end: m.end() },
}),
)
}
SearchKind::Overlapping => {
try_search_overlapping(re, cache, &input).unwrap()
}
},
"which" => match test.search_kind() {
SearchKind::Earliest | SearchKind::Leftmost => {
// There are no "which" APIs for standard searches.
TestResult::skip()
}
SearchKind::Overlapping => {
let dfa = re.forward();
let cache = cache.as_parts_mut().0;
let mut patset = PatternSet::new(dfa.pattern_len());
dfa.try_which_overlapping_matches(cache, &input, &mut patset)
.unwrap();
TestResult::which(patset.iter().map(|p| p.as_usize()))
}
},
name => TestResult::fail(&format!("unrecognized test name: {name}")),
}
}
/// Configures the given regex builder with all relevant settings on the given
/// regex test.
///
/// If the regex test has a setting that is unsupported, then this returns
/// false (implying the test should be skipped).
fn configure_regex_builder(
test: &RegexTest,
builder: &mut regex::Builder,
) -> bool {
let match_kind = match untestify_kind(test.match_kind()) {
None => return false,
Some(k) => k,
};
let mut dfa_config =
DFA::config().match_kind(match_kind).unicode_word_boundary(true);
// When doing an overlapping search, we might try to find the start of each
// match with a custom search routine. In that case, we need to tell the
// reverse search (for the start offset) which pattern to look for. The
// only way that API works is when anchored starting states are compiled
// for each pattern. This does technically also enable it for the forward
// DFA, but we're okay with that.
if test.search_kind() == SearchKind::Overlapping {
dfa_config = dfa_config.starts_for_each_pattern(true);
}
builder
.syntax(config_syntax(test))
.thompson(config_thompson(test))
.dfa(dfa_config);
true
}
/// Configuration of a Thompson NFA compiler from a regex test.
fn config_thompson(test: &RegexTest) -> thompson::Config {
let mut lookm = regex_automata::util::look::LookMatcher::new();
lookm.set_line_terminator(test.line_terminator());
thompson::Config::new().utf8(test.utf8()).look_matcher(lookm)
}
/// Configuration of the regex parser from a regex test.
fn config_syntax(test: &RegexTest) -> syntax::Config {
syntax::Config::new()
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
.line_terminator(test.line_terminator())
}
/// Execute an overlapping search, and for each match found, also find its
/// overlapping starting positions.
///
/// N.B. This routine used to be part of the crate API, but 1) it wasn't clear
/// to me how useful it was and 2) it wasn't clear to me what its semantics
/// should be. In particular, a potentially surprising footgun of this routine
/// is that it is worst case *quadratic* in the size of the haystack. Namely, it's
/// possible to report a match at every position, and for every such position,
/// scan all the way to the beginning of the haystack to find the starting
/// position. Typical leftmost non-overlapping searches don't suffer from this
/// because, well, matches can't overlap. So subsequent searches after a match
/// is found don't revisit previously scanned parts of the haystack.
///
/// Its semantics can be strange for other reasons too. For example, given
/// the regex '.*' and the haystack 'zz', the full set of overlapping matches
/// is: [0, 0], [1, 1], [0, 1], [2, 2], [1, 2], [0, 2]. The ordering of
/// those matches is quite strange, but makes sense when you think about the
/// implementation: an end offset is found left-to-right, and then one or more
/// starting offsets are found right-to-left.
///
/// Nevertheless, we provide this routine in our test suite because it's
/// useful to test the low level DFA overlapping search and our test suite
/// is written in a way that requires starting offsets.
fn try_search_overlapping(
re: &Regex,
cache: &mut regex::Cache,
input: &Input<'_>,
) -> Result<TestResult> {
let mut matches = vec![];
let mut fwd_state = OverlappingState::start();
let (fwd_dfa, rev_dfa) = (re.forward(), re.reverse());
let (fwd_cache, rev_cache) = cache.as_parts_mut();
while let Some(end) = {
fwd_dfa.try_search_overlapping_fwd(
fwd_cache,
input,
&mut fwd_state,
)?;
fwd_state.get_match()
} {
let revsearch = input
.clone()
.range(input.start()..end.offset())
.anchored(Anchored::Pattern(end.pattern()))
.earliest(false);
let mut rev_state = OverlappingState::start();
while let Some(start) = {
rev_dfa.try_search_overlapping_rev(
rev_cache,
&revsearch,
&mut rev_state,
)?;
rev_state.get_match()
} {
let span = Span { start: start.offset(), end: end.offset() };
let mat = Match { id: end.pattern().as_usize(), span };
matches.push(mat);
}
}
Ok(TestResult::matches(matches))
}

vendor/regex-automata/tests/lib.rs

@@ -0,0 +1,115 @@
// We have a similar config in the regex-automata crate root. Basically, it is
// just too annoying to deal with dead code when a subset of features is
// enabled.
#![cfg_attr(
not(all(
feature = "std",
feature = "nfa",
feature = "dfa",
feature = "hybrid",
feature = "perf-literal-substring",
feature = "perf-literal-multisubstring",
)),
allow(dead_code, unused_imports, unused_variables)
)]
// Similar deal with Miri. Just let dead code warnings be.
#![cfg_attr(miri, allow(dead_code, unused_imports, unused_variables))]
#[cfg(any(feature = "dfa-search", feature = "dfa-onepass"))]
mod dfa;
#[cfg(feature = "dfa-search")]
mod fuzz;
#[cfg(feature = "dfa-search")]
mod gen;
#[cfg(feature = "hybrid")]
mod hybrid;
#[cfg(feature = "meta")]
mod meta;
#[cfg(any(feature = "nfa-backtrack", feature = "nfa-pikevm"))]
mod nfa;
fn suite() -> anyhow::Result<regex_test::RegexTests> {
let _ = env_logger::try_init();
let mut tests = regex_test::RegexTests::new();
macro_rules! load {
($name:expr) => {{
const DATA: &[u8] =
include_bytes!(concat!("../../testdata/", $name, ".toml"));
tests.load_slice($name, DATA)?;
}};
}
load!("anchored");
load!("bytes");
load!("crazy");
load!("crlf");
load!("earliest");
load!("empty");
load!("expensive");
load!("flags");
load!("iter");
load!("leftmost-all");
load!("line-terminator");
load!("misc");
load!("multiline");
load!("no-unicode");
load!("overlapping");
load!("regression");
load!("set");
load!("substring");
load!("unicode");
load!("utf8");
load!("word-boundary");
load!("word-boundary-special");
load!("fowler/basic");
load!("fowler/nullsubexpr");
load!("fowler/repetition");
Ok(tests)
}
/// Configure a regex_automata::Input with the given test configuration.
fn create_input<'h>(
test: &'h regex_test::RegexTest,
) -> regex_automata::Input<'h> {
use regex_automata::Anchored;
let bounds = test.bounds();
let anchored = if test.anchored() { Anchored::Yes } else { Anchored::No };
regex_automata::Input::new(test.haystack())
.range(bounds.start..bounds.end)
.anchored(anchored)
}
/// Convert capture matches into the test suite's capture values.
///
/// The given captures must represent a valid match, where the first capturing
/// group has a non-None span. Otherwise this panics.
fn testify_captures(
caps: &regex_automata::util::captures::Captures,
) -> regex_test::Captures {
assert!(caps.is_match(), "expected captures to represent a match");
let spans = caps.iter().map(|group| {
group.map(|m| regex_test::Span { start: m.start, end: m.end })
});
// These unwraps are OK because we assume our 'caps' represents a match,
// and a match always gives a non-zero number of groups with the first
// group being non-None.
regex_test::Captures::new(caps.pattern().unwrap().as_usize(), spans)
.unwrap()
}
/// Convert a test harness match kind to a regex-automata match kind. If
/// regex-automata doesn't support the harness kind, then `None` is returned.
fn untestify_kind(
kind: regex_test::MatchKind,
) -> Option<regex_automata::MatchKind> {
match kind {
regex_test::MatchKind::All => Some(regex_automata::MatchKind::All),
regex_test::MatchKind::LeftmostFirst => {
Some(regex_automata::MatchKind::LeftmostFirst)
}
regex_test::MatchKind::LeftmostLongest => None,
}
}

vendor/regex-automata/tests/meta/mod.rs vendored Normal file

@@ -0,0 +1,2 @@
#[cfg(not(miri))]
mod suite;

vendor/regex-automata/tests/meta/suite.rs vendored Normal file

@@ -0,0 +1,200 @@
use {
anyhow::Result,
regex_automata::{
meta::{self, Regex},
util::syntax,
MatchKind, PatternSet,
},
regex_test::{
CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
TestRunner,
},
};
use crate::{create_input, suite, testify_captures};
const BLACKLIST: &[&str] = &[
// These 'earliest' tests are blacklisted because the meta searcher doesn't
// give the same offsets that the test expects. This is legal because the
// 'earliest' routines don't guarantee a particular match offset other
// than "the earliest the regex engine can report a match." Some regex
// engines will quit earlier than others. The backtracker, for example,
// can't really quit before finding the full leftmost-first match. Many of
// the literal searchers also don't have the ability to quit fully or it's
// otherwise not worth doing. (A literal searcher not quitting as early as
// possible usually means looking at a few more bytes. That's no biggie.)
"earliest/",
];
/// Tests the default configuration of the meta regex engine.
#[test]
fn default() -> Result<()> {
let builder = Regex::builder();
let mut runner = TestRunner::new()?;
runner
.expand(&["is_match", "find", "captures"], |test| test.compiles())
.blacklist_iter(BLACKLIST)
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests the default configuration minus the full DFA.
#[test]
fn no_dfa() -> Result<()> {
let mut builder = Regex::builder();
builder.configure(Regex::config().dfa(false));
let mut runner = TestRunner::new()?;
runner
.expand(&["is_match", "find", "captures"], |test| test.compiles())
.blacklist_iter(BLACKLIST)
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests the default configuration minus the full DFA and lazy DFA.
#[test]
fn no_dfa_hybrid() -> Result<()> {
let mut builder = Regex::builder();
builder.configure(Regex::config().dfa(false).hybrid(false));
let mut runner = TestRunner::new()?;
runner
.expand(&["is_match", "find", "captures"], |test| test.compiles())
.blacklist_iter(BLACKLIST)
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests the default configuration minus the full DFA, lazy DFA and one-pass
/// DFA.
#[test]
fn no_dfa_hybrid_onepass() -> Result<()> {
let mut builder = Regex::builder();
builder.configure(Regex::config().dfa(false).hybrid(false).onepass(false));
let mut runner = TestRunner::new()?;
runner
.expand(&["is_match", "find", "captures"], |test| test.compiles())
.blacklist_iter(BLACKLIST)
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests the default configuration minus the full DFA, lazy DFA, one-pass
/// DFA and backtracker.
#[test]
fn no_dfa_hybrid_onepass_backtrack() -> Result<()> {
let mut builder = Regex::builder();
builder.configure(
Regex::config()
.dfa(false)
.hybrid(false)
.onepass(false)
.backtrack(false),
);
let mut runner = TestRunner::new()?;
runner
.expand(&["is_match", "find", "captures"], |test| test.compiles())
.blacklist_iter(BLACKLIST)
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
fn compiler(
mut builder: meta::Builder,
) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
move |test, regexes| {
if !configure_meta_builder(test, &mut builder) {
return Ok(CompiledRegex::skip());
}
let re = builder.build_many(&regexes)?;
Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, test)
}))
}
}
fn run_test(re: &Regex, test: &RegexTest) -> TestResult {
let input = create_input(test);
match test.additional_name() {
"is_match" => TestResult::matched(re.is_match(input)),
"find" => match test.search_kind() {
SearchKind::Earliest => TestResult::matches(
re.find_iter(input.earliest(true))
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|m| Match {
id: m.pattern().as_usize(),
span: Span { start: m.start(), end: m.end() },
}),
),
SearchKind::Leftmost => TestResult::matches(
re.find_iter(input)
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|m| Match {
id: m.pattern().as_usize(),
span: Span { start: m.start(), end: m.end() },
}),
),
SearchKind::Overlapping => {
let mut patset = PatternSet::new(re.pattern_len());
re.which_overlapping_matches(&input, &mut patset);
TestResult::which(patset.iter().map(|p| p.as_usize()))
}
},
"captures" => match test.search_kind() {
SearchKind::Earliest => {
let it = re
.captures_iter(input.earliest(true))
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|caps| testify_captures(&caps));
TestResult::captures(it)
}
SearchKind::Leftmost => {
let it = re
.captures_iter(input)
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|caps| testify_captures(&caps));
TestResult::captures(it)
}
SearchKind::Overlapping => {
// There is no overlapping regex API that supports captures.
TestResult::skip()
}
},
name => TestResult::fail(&format!("unrecognized test name: {name}")),
}
}
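The overlapping arm above reports only which patterns matched, via a `PatternSet`. A minimal sketch of that API on its own, with illustrative patterns and haystack:

use regex_automata::{meta::Regex, Input, MatchKind, PatternSet};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Overlapping "which patterns match" searches are typically paired
    // with MatchKind::All, as configure_meta_builder does for such tests.
    let re = Regex::builder()
        .configure(Regex::config().match_kind(MatchKind::All))
        .build_many(&[r"\w+", r"\d+", r"abc"])?;
    let mut patset = PatternSet::new(re.pattern_len());
    re.which_overlapping_matches(&Input::new("abc123"), &mut patset);
    let which: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
    // All three patterns match somewhere in "abc123".
    assert_eq!(vec![0, 1, 2], which);
    Ok(())
}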
/// Configures the given regex builder with all relevant settings on the given
/// regex test.
///
/// If the regex test has a setting that is unsupported, then this returns
/// false (implying the test should be skipped).
fn configure_meta_builder(
test: &RegexTest,
builder: &mut meta::Builder,
) -> bool {
let match_kind = match test.match_kind() {
regex_test::MatchKind::All => MatchKind::All,
regex_test::MatchKind::LeftmostFirst => MatchKind::LeftmostFirst,
regex_test::MatchKind::LeftmostLongest => return false,
};
let meta_config = Regex::config()
.match_kind(match_kind)
.utf8_empty(test.utf8())
.line_terminator(test.line_terminator());
builder.configure(meta_config).syntax(config_syntax(test));
true
}
/// Configuration of the regex parser from a regex test.
fn config_syntax(test: &RegexTest) -> syntax::Config {
syntax::Config::new()
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
.line_terminator(test.line_terminator())
}

vendor/regex-automata/tests/nfa/mod.rs vendored Normal file

@@ -0,0 +1 @@
mod thompson;

vendor/regex-automata/tests/nfa/thompson/backtrack/mod.rs vendored Normal file

@@ -0,0 +1,2 @@
#[cfg(not(miri))]
mod suite;

vendor/regex-automata/tests/nfa/thompson/backtrack/suite.rs vendored Normal file

@@ -0,0 +1,213 @@
use {
anyhow::Result,
regex_automata::{
nfa::thompson::{
self,
backtrack::{self, BoundedBacktracker},
NFA,
},
util::{prefilter::Prefilter, syntax},
Input,
},
regex_test::{
CompiledRegex, Match, MatchKind, RegexTest, SearchKind, Span,
TestResult, TestRunner,
},
};
use crate::{create_input, suite, testify_captures};
/// Tests the default configuration of the bounded backtracker.
#[test]
fn default() -> Result<()> {
let builder = BoundedBacktracker::builder();
let mut runner = TestRunner::new()?;
runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
// At the time of writing, every regex search in the test suite fits
// into the backtracker's default visited capacity (except for the
// blacklisted tests below). If regexes are added that blow that capacity,
// then they should be blacklisted here. A tempting alternative is to
// automatically skip them by checking the haystack length against
// BoundedBacktracker::max_haystack_len, but that could wind up hiding
// interesting failure modes, e.g., if the visited capacity is somehow
// wrong or smaller than it should be.
runner.blacklist("expensive/backtrack-blow-visited-capacity");
runner.test_iter(suite()?.iter(), compiler(builder)).assert();
Ok(())
}
/// Tests the backtracker with prefilters enabled.
#[test]
fn prefilter() -> Result<()> {
let my_compiler = |test: &RegexTest, regexes: &[String]| {
// Parse regexes as HIRs so we can get literals to build a prefilter.
let mut hirs = vec![];
for pattern in regexes.iter() {
hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
}
// We can always select leftmost-first here because the backtracker
// only supports leftmost-first matching.
let pre = Prefilter::from_hirs_prefix(
regex_automata::MatchKind::LeftmostFirst,
&hirs,
);
let mut builder = BoundedBacktracker::builder();
builder.configure(BoundedBacktracker::config().prefilter(pre));
compiler(builder)(test, regexes)
};
let mut runner = TestRunner::new()?;
runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
runner.blacklist("expensive/backtrack-blow-visited-capacity");
runner.test_iter(suite()?.iter(), my_compiler).assert();
Ok(())
}
/// Tests the bounded backtracker when its visited capacity is set to its
/// minimum amount.
#[test]
fn min_visited_capacity() -> Result<()> {
let mut runner = TestRunner::new()?;
runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
runner
.test_iter(suite()?.iter(), move |test, regexes| {
let nfa = NFA::compiler()
.configure(config_thompson(test))
.syntax(config_syntax(test))
.build_many(&regexes)?;
let mut builder = BoundedBacktracker::builder();
if !configure_backtrack_builder(test, &mut builder) {
return Ok(CompiledRegex::skip());
}
// Set up the bounded backtracker so that its visited capacity is
// the absolute minimum required for the test's haystack.
builder.configure(BoundedBacktracker::config().visited_capacity(
backtrack::min_visited_capacity(
&nfa,
&Input::new(test.haystack()),
),
));
let re = builder.build_from_nfa(nfa)?;
let mut cache = re.create_cache();
Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, &mut cache, test)
}))
})
.assert();
Ok(())
}
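Outside the harness, the same wiring looks roughly like the sketch below; the pattern and haystack are hypothetical:

use regex_automata::{
    nfa::thompson::{
        backtrack::{self, BoundedBacktracker},
        NFA,
    },
    Input,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let nfa = NFA::new(r"[a-z]+")?;
    let haystack = "hello world";
    // The visited set tracks (state, haystack position) pairs, so the
    // minimum capacity depends on both the NFA and the haystack length.
    let cap = backtrack::min_visited_capacity(&nfa, &Input::new(haystack));
    let re = BoundedBacktracker::builder()
        .configure(BoundedBacktracker::config().visited_capacity(cap))
        .build_from_nfa(nfa)?;
    let mut cache = re.create_cache();
    assert!(re.try_is_match(&mut cache, haystack)?);
    Ok(())
}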
fn compiler(
mut builder: backtrack::Builder,
) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
move |test, regexes| {
if !configure_backtrack_builder(test, &mut builder) {
return Ok(CompiledRegex::skip());
}
let re = builder.build_many(&regexes)?;
let mut cache = re.create_cache();
Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, &mut cache, test)
}))
}
}
fn run_test(
re: &BoundedBacktracker,
cache: &mut backtrack::Cache,
test: &RegexTest,
) -> TestResult {
let input = create_input(test);
match test.additional_name() {
"is_match" => match test.search_kind() {
SearchKind::Earliest | SearchKind::Overlapping => {
TestResult::skip()
}
SearchKind::Leftmost => {
let input = input.earliest(true);
TestResult::matched(re.try_is_match(cache, input).unwrap())
}
},
"find" => match test.search_kind() {
SearchKind::Earliest | SearchKind::Overlapping => {
TestResult::skip()
}
SearchKind::Leftmost => TestResult::matches(
re.try_find_iter(cache, input)
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|result| result.unwrap())
.map(|m| Match {
id: m.pattern().as_usize(),
span: Span { start: m.start(), end: m.end() },
}),
),
},
"captures" => match test.search_kind() {
SearchKind::Earliest | SearchKind::Overlapping => {
TestResult::skip()
}
SearchKind::Leftmost => TestResult::captures(
re.try_captures_iter(cache, input)
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|result| result.unwrap())
.map(|caps| testify_captures(&caps)),
),
},
name => TestResult::fail(&format!("unrecognized test name: {name}")),
}
}
/// Configures the given regex builder with all relevant settings on the given
/// regex test.
///
/// If the regex test has a setting that is unsupported, then this returns
/// false (implying the test should be skipped).
fn configure_backtrack_builder(
test: &RegexTest,
builder: &mut backtrack::Builder,
) -> bool {
match (test.search_kind(), test.match_kind()) {
// For testing the standard search APIs. This is the only supported
// configuration for the backtracker.
(SearchKind::Leftmost, MatchKind::LeftmostFirst) => {}
// Overlapping APIs not supported at all for backtracker.
(SearchKind::Overlapping, _) => return false,
// Backtracking doesn't really support the notion of 'earliest'.
// Namely, backtracking already works by returning as soon as it knows
// it has found a match. It just so happens that this corresponds to
// the standard 'leftmost' formulation.
//
// The 'earliest' definition in this crate does indeed permit this
// behavior, so this is "fine," but our test suite specifically looks
// for the earliest position at which a match is known, which our
// finite automata-based regex engines have no problem providing. So
// for backtracking, we just skip these tests.
(SearchKind::Earliest, _) => return false,
// For backtracking, 'all' semantics don't really make sense.
(_, MatchKind::All) => return false,
// Not supported at all in regex-automata.
(_, MatchKind::LeftmostLongest) => return false,
};
let backtrack_config = BoundedBacktracker::config();
builder
.configure(backtrack_config)
.syntax(config_syntax(test))
.thompson(config_thompson(test));
true
}
/// Configuration of a Thompson NFA compiler from a regex test.
fn config_thompson(test: &RegexTest) -> thompson::Config {
let mut lookm = regex_automata::util::look::LookMatcher::new();
lookm.set_line_terminator(test.line_terminator());
thompson::Config::new().utf8(test.utf8()).look_matcher(lookm)
}
/// Configuration of the regex parser from a regex test.
fn config_syntax(test: &RegexTest) -> syntax::Config {
syntax::Config::new()
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
.line_terminator(test.line_terminator())
}

vendor/regex-automata/tests/nfa/thompson/mod.rs vendored Normal file

@@ -0,0 +1,4 @@
#[cfg(feature = "nfa-backtrack")]
mod backtrack;
#[cfg(feature = "nfa-pikevm")]
mod pikevm;

vendor/regex-automata/tests/nfa/thompson/pikevm/mod.rs vendored Normal file

@@ -0,0 +1,2 @@
#[cfg(not(miri))]
mod suite;

vendor/regex-automata/tests/nfa/thompson/pikevm/suite.rs vendored Normal file

@@ -0,0 +1,162 @@
use {
anyhow::Result,
regex_automata::{
nfa::thompson::{
self,
pikevm::{self, PikeVM},
},
util::{prefilter::Prefilter, syntax},
PatternSet,
},
regex_test::{
CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
TestRunner,
},
};
use crate::{create_input, suite, testify_captures, untestify_kind};
/// Tests the default configuration of the PikeVM.
#[test]
fn default() -> Result<()> {
let builder = PikeVM::builder();
let mut runner = TestRunner::new()?;
runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
runner.test_iter(suite()?.iter(), compiler(builder)).assert();
Ok(())
}
/// Tests the PikeVM with prefilters enabled.
#[test]
fn prefilter() -> Result<()> {
let my_compiler = |test: &RegexTest, regexes: &[String]| {
// Parse regexes as HIRs so we can get literals to build a prefilter.
let mut hirs = vec![];
for pattern in regexes.iter() {
hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
}
let kind = match untestify_kind(test.match_kind()) {
None => return Ok(CompiledRegex::skip()),
Some(kind) => kind,
};
let pre = Prefilter::from_hirs_prefix(kind, &hirs);
let mut builder = PikeVM::builder();
builder.configure(PikeVM::config().prefilter(pre));
compiler(builder)(test, regexes)
};
let mut runner = TestRunner::new()?;
runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
runner.test_iter(suite()?.iter(), my_compiler).assert();
Ok(())
}
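The HIR-to-prefilter step above also works outside the harness. A condensed sketch with a single hypothetical pattern:

use regex_automata::{
    nfa::thompson::pikevm::PikeVM,
    util::{prefilter::Prefilter, syntax},
    MatchKind,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // The prefilter extractor works off literals in the parsed HIR, not
    // the pattern string, hence the explicit parse step.
    let hirs = vec![syntax::parse(r"foo[a-z]+")?];
    // from_hirs_prefix returns None when no useful literals exist; the
    // config accepts the Option either way.
    let pre = Prefilter::from_hirs_prefix(MatchKind::LeftmostFirst, &hirs);
    let re = PikeVM::builder()
        .configure(PikeVM::config().prefilter(pre))
        .build(r"foo[a-z]+")?;
    let mut cache = re.create_cache();
    assert!(re.is_match(&mut cache, "xx foobar"));
    Ok(())
}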
fn compiler(
mut builder: pikevm::Builder,
) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
move |test, regexes| {
if !configure_pikevm_builder(test, &mut builder) {
return Ok(CompiledRegex::skip());
}
let re = builder.build_many(&regexes)?;
let mut cache = re.create_cache();
Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, &mut cache, test)
}))
}
}
fn run_test(
re: &PikeVM,
cache: &mut pikevm::Cache,
test: &RegexTest,
) -> TestResult {
let input = create_input(test);
match test.additional_name() {
"is_match" => TestResult::matched(re.is_match(cache, input)),
"find" => match test.search_kind() {
SearchKind::Earliest => {
let it = re
.find_iter(cache, input.earliest(true))
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|m| Match {
id: m.pattern().as_usize(),
span: Span { start: m.start(), end: m.end() },
});
TestResult::matches(it)
}
SearchKind::Leftmost => {
let it = re
.find_iter(cache, input)
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|m| Match {
id: m.pattern().as_usize(),
span: Span { start: m.start(), end: m.end() },
});
TestResult::matches(it)
}
SearchKind::Overlapping => {
let mut patset = PatternSet::new(re.get_nfa().pattern_len());
re.which_overlapping_matches(cache, &input, &mut patset);
TestResult::which(patset.iter().map(|p| p.as_usize()))
}
},
"captures" => match test.search_kind() {
SearchKind::Earliest => {
let it = re
.captures_iter(cache, input.earliest(true))
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|caps| testify_captures(&caps));
TestResult::captures(it)
}
SearchKind::Leftmost => {
let it = re
.captures_iter(cache, input)
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|caps| testify_captures(&caps));
TestResult::captures(it)
}
SearchKind::Overlapping => {
// There is no overlapping PikeVM API that supports captures.
TestResult::skip()
}
},
name => TestResult::fail(&format!("unrecognized test name: {name}")),
}
}
/// Configures the given regex builder with all relevant settings on the given
/// regex test.
///
/// If the regex test has a setting that is unsupported, then this returns
/// false (implying the test should be skipped).
fn configure_pikevm_builder(
test: &RegexTest,
builder: &mut pikevm::Builder,
) -> bool {
let match_kind = match untestify_kind(test.match_kind()) {
None => return false,
Some(k) => k,
};
let pikevm_config = PikeVM::config().match_kind(match_kind);
builder
.configure(pikevm_config)
.syntax(config_syntax(test))
.thompson(config_thompson(test));
true
}
/// Configuration of a Thompson NFA compiler from a regex test.
fn config_thompson(test: &RegexTest) -> thompson::Config {
let mut lookm = regex_automata::util::look::LookMatcher::new();
lookm.set_line_terminator(test.line_terminator());
thompson::Config::new().utf8(test.utf8()).look_matcher(lookm)
}
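To see what a non-default line terminator changes in practice, a short sketch follows; the choice of NUL as terminator is illustrative:

use regex_automata::{
    nfa::thompson::{self, pikevm::PikeVM},
    util::look::LookMatcher,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Make the multi-line anchors ^ and $ treat NUL as the line
    // terminator instead of \n.
    let mut lookm = LookMatcher::new();
    lookm.set_line_terminator(b'\x00');
    let re = PikeVM::builder()
        .thompson(thompson::Config::new().look_matcher(lookm))
        .build(r"(?m)^foo$")?;
    let mut cache = re.create_cache();
    assert!(re.is_match(&mut cache, "bar\x00foo\x00baz"));
    // \n is no longer a line terminator, so this haystack has no match.
    assert!(!re.is_match(&mut cache, "bar\nfoo\nbaz"));
    Ok(())
}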
/// Configuration of the regex parser from a regex test.
fn config_syntax(test: &RegexTest) -> syntax::Config {
syntax::Config::new()
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
.line_terminator(test.line_terminator())
}