Vendor dependencies for 0.3.0 release

This commit is contained in:
2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

1809
vendor/regex-syntax/src/ast/mod.rs vendored Normal file

File diff suppressed because it is too large Load Diff

6377
vendor/regex-syntax/src/ast/parse.rs vendored Normal file

File diff suppressed because it is too large Load Diff

577
vendor/regex-syntax/src/ast/print.rs vendored Normal file
View File

@@ -0,0 +1,577 @@
/*!
This module provides a regular expression printer for `Ast`.
*/
use core::fmt;
use crate::ast::{
self,
visitor::{self, Visitor},
Ast,
};
/// A builder for constructing a printer.
///
/// A printer currently has no configuration knobs at all, so this builder
/// carries no state and stays private to this module.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    _priv: (),
}

impl Default for PrinterBuilder {
    fn default() -> PrinterBuilder {
        PrinterBuilder::new()
    }
}

impl PrinterBuilder {
    /// Returns a builder with the default (empty) configuration.
    fn new() -> PrinterBuilder {
        PrinterBuilder { _priv: () }
    }

    /// Builds the printer described by this builder.
    fn build(&self) -> Printer {
        Printer { _priv: () }
    }
}
/// A printer for a regular expression abstract syntax tree.
///
/// A printer converts an abstract syntax tree (AST) back into a regular
/// expression pattern string. It uses constant stack space and heap space
/// proportional to the size of the AST.
///
/// Note that the original formatting of the pattern string is not necessarily
/// preserved. For example, all whitespace and comments are ignored.
#[derive(Debug)]
pub struct Printer {
    _priv: (),
}

impl Printer {
    /// Create a new printer.
    pub fn new() -> Printer {
        PrinterBuilder::new().build()
    }

    /// Print the given `Ast` to the given writer. The writer must implement
    /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
    /// here are a `fmt::Formatter` (which is available in `fmt::Display`
    /// implementations) or a `&mut String`.
    pub fn print<W: fmt::Write>(&mut self, ast: &Ast, wtr: W) -> fmt::Result {
        // All of the actual work happens in the `Writer` visitor below.
        let writer = Writer { wtr };
        visitor::visit(ast, writer)
    }
}
// A visitor that emits the concrete syntax of each AST node it visits to the
// wrapped `fmt::Write` implementation.
#[derive(Debug)]
struct Writer<W> {
    // The destination that the pattern string is written to.
    wtr: W,
}
impl<W: fmt::Write> Visitor for Writer<W> {
    type Output = ();
    type Err = fmt::Error;

    fn finish(self) -> fmt::Result {
        // All output is emitted as nodes are visited, so nothing remains to
        // be done once traversal completes.
        Ok(())
    }

    fn visit_pre(&mut self, ast: &Ast) -> fmt::Result {
        // Groups and bracketed classes are the only nodes that must emit an
        // opening delimiter *before* their children are visited.
        match *ast {
            Ast::Group(ref x) => self.fmt_group_pre(x),
            Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x),
            _ => Ok(()),
        }
    }

    fn visit_post(&mut self, ast: &Ast) -> fmt::Result {
        // Leaf nodes are printed in full here; container nodes only emit
        // their closing delimiter, since their children were already printed.
        match *ast {
            Ast::Empty(_) => Ok(()),
            Ast::Flags(ref x) => self.fmt_set_flags(x),
            Ast::Literal(ref x) => self.fmt_literal(x),
            Ast::Dot(_) => self.wtr.write_str("."),
            Ast::Assertion(ref x) => self.fmt_assertion(x),
            Ast::ClassPerl(ref x) => self.fmt_class_perl(x),
            Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x),
            Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x),
            Ast::Repetition(ref x) => self.fmt_repetition(x),
            Ast::Group(ref x) => self.fmt_group_post(x),
            Ast::Alternation(_) => Ok(()),
            Ast::Concat(_) => Ok(()),
        }
    }

    fn visit_alternation_in(&mut self) -> fmt::Result {
        // Emit the separator between branches of an alternation.
        self.wtr.write_str("|")
    }

    fn visit_class_set_item_pre(
        &mut self,
        ast: &ast::ClassSetItem,
    ) -> Result<(), Self::Err> {
        // As at the AST level, only nested bracketed classes need an opening
        // delimiter before their children are visited.
        match *ast {
            ast::ClassSetItem::Bracketed(ref x) => {
                self.fmt_class_bracketed_pre(x)
            }
            _ => Ok(()),
        }
    }

    fn visit_class_set_item_post(
        &mut self,
        ast: &ast::ClassSetItem,
    ) -> Result<(), Self::Err> {
        use crate::ast::ClassSetItem::*;

        match *ast {
            Empty(_) => Ok(()),
            Literal(ref x) => self.fmt_literal(x),
            Range(ref x) => {
                // A range prints as `start-end`, e.g. `a-z`.
                self.fmt_literal(&x.start)?;
                self.wtr.write_str("-")?;
                self.fmt_literal(&x.end)?;
                Ok(())
            }
            Ascii(ref x) => self.fmt_class_ascii(x),
            Unicode(ref x) => self.fmt_class_unicode(x),
            Perl(ref x) => self.fmt_class_perl(x),
            Bracketed(ref x) => self.fmt_class_bracketed_post(x),
            Union(_) => Ok(()),
        }
    }

    fn visit_class_set_binary_op_in(
        &mut self,
        ast: &ast::ClassSetBinaryOp,
    ) -> Result<(), Self::Err> {
        // Emit the operator (e.g. `&&`, `--`, `~~`) between the operands.
        self.fmt_class_set_binary_op_kind(&ast.kind)
    }
}
impl<W: fmt::Write> Writer<W> {
    /// Writes the opening delimiter of a group: `(`, `(?P<name>`, `(?<name>`
    /// or `(?flags:`.
    fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result {
        use crate::ast::GroupKind::*;
        match ast.kind {
            CaptureIndex(_) => self.wtr.write_str("("),
            CaptureName { ref name, starts_with_p } => {
                // Preserve whichever spelling (`(?P<` vs `(?<`) the original
                // pattern used.
                let start = if starts_with_p { "(?P<" } else { "(?<" };
                self.wtr.write_str(start)?;
                self.wtr.write_str(&name.name)?;
                self.wtr.write_str(">")?;
                Ok(())
            }
            NonCapturing(ref flags) => {
                self.wtr.write_str("(?")?;
                self.fmt_flags(flags)?;
                self.wtr.write_str(":")?;
                Ok(())
            }
        }
    }

    /// Writes the closing delimiter of a group.
    fn fmt_group_post(&mut self, _ast: &ast::Group) -> fmt::Result {
        self.wtr.write_str(")")
    }

    /// Writes a repetition operator, e.g. `?`, `*?`, `+` or `{2,5}?`.
    ///
    /// Note: the guarded arms must come before their unguarded twins; a
    /// trailing `?` marks the non-greedy variant of each operator.
    fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result {
        use crate::ast::RepetitionKind::*;
        match ast.op.kind {
            ZeroOrOne if ast.greedy => self.wtr.write_str("?"),
            ZeroOrOne => self.wtr.write_str("??"),
            ZeroOrMore if ast.greedy => self.wtr.write_str("*"),
            ZeroOrMore => self.wtr.write_str("*?"),
            OneOrMore if ast.greedy => self.wtr.write_str("+"),
            OneOrMore => self.wtr.write_str("+?"),
            Range(ref x) => {
                self.fmt_repetition_range(x)?;
                if !ast.greedy {
                    self.wtr.write_str("?")?;
                }
                Ok(())
            }
        }
    }

    /// Writes a counted repetition range, e.g. `{3}`, `{3,}` or `{3,6}`.
    fn fmt_repetition_range(
        &mut self,
        ast: &ast::RepetitionRange,
    ) -> fmt::Result {
        use crate::ast::RepetitionRange::*;
        match *ast {
            // `{{`/`}}` are literal braces in a format string.
            Exactly(x) => write!(self.wtr, "{{{}}}", x),
            AtLeast(x) => write!(self.wtr, "{{{},}}", x),
            Bounded(x, y) => write!(self.wtr, "{{{},{}}}", x, y),
        }
    }

    /// Writes a single literal, re-escaping it in the same style the
    /// original pattern used (verbatim, octal, fixed/braced hex or a special
    /// escape such as `\n`).
    fn fmt_literal(&mut self, ast: &ast::Literal) -> fmt::Result {
        use crate::ast::LiteralKind::*;

        match ast.kind {
            Verbatim => self.wtr.write_char(ast.c),
            Meta | Superfluous => write!(self.wtr, r"\{}", ast.c),
            Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)),
            HexFixed(ast::HexLiteralKind::X) => {
                write!(self.wtr, r"\x{:02X}", u32::from(ast.c))
            }
            HexFixed(ast::HexLiteralKind::UnicodeShort) => {
                write!(self.wtr, r"\u{:04X}", u32::from(ast.c))
            }
            HexFixed(ast::HexLiteralKind::UnicodeLong) => {
                write!(self.wtr, r"\U{:08X}", u32::from(ast.c))
            }
            HexBrace(ast::HexLiteralKind::X) => {
                write!(self.wtr, r"\x{{{:X}}}", u32::from(ast.c))
            }
            HexBrace(ast::HexLiteralKind::UnicodeShort) => {
                write!(self.wtr, r"\u{{{:X}}}", u32::from(ast.c))
            }
            HexBrace(ast::HexLiteralKind::UnicodeLong) => {
                write!(self.wtr, r"\U{{{:X}}}", u32::from(ast.c))
            }
            Special(ast::SpecialLiteralKind::Bell) => {
                self.wtr.write_str(r"\a")
            }
            Special(ast::SpecialLiteralKind::FormFeed) => {
                self.wtr.write_str(r"\f")
            }
            Special(ast::SpecialLiteralKind::Tab) => self.wtr.write_str(r"\t"),
            Special(ast::SpecialLiteralKind::LineFeed) => {
                self.wtr.write_str(r"\n")
            }
            Special(ast::SpecialLiteralKind::CarriageReturn) => {
                self.wtr.write_str(r"\r")
            }
            Special(ast::SpecialLiteralKind::VerticalTab) => {
                self.wtr.write_str(r"\v")
            }
            Special(ast::SpecialLiteralKind::Space) => {
                self.wtr.write_str(r"\ ")
            }
        }
    }

    /// Writes an assertion, e.g. `^`, `$`, `\b` or `\b{start-half}`.
    fn fmt_assertion(&mut self, ast: &ast::Assertion) -> fmt::Result {
        use crate::ast::AssertionKind::*;
        match ast.kind {
            StartLine => self.wtr.write_str("^"),
            EndLine => self.wtr.write_str("$"),
            StartText => self.wtr.write_str(r"\A"),
            EndText => self.wtr.write_str(r"\z"),
            WordBoundary => self.wtr.write_str(r"\b"),
            NotWordBoundary => self.wtr.write_str(r"\B"),
            WordBoundaryStart => self.wtr.write_str(r"\b{start}"),
            WordBoundaryEnd => self.wtr.write_str(r"\b{end}"),
            WordBoundaryStartAngle => self.wtr.write_str(r"\<"),
            WordBoundaryEndAngle => self.wtr.write_str(r"\>"),
            WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"),
            WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"),
        }
    }

    /// Writes a free-standing flag group, e.g. `(?is-u)`.
    fn fmt_set_flags(&mut self, ast: &ast::SetFlags) -> fmt::Result {
        self.wtr.write_str("(?")?;
        self.fmt_flags(&ast.flags)?;
        self.wtr.write_str(")")?;
        Ok(())
    }

    /// Writes the flag items themselves, e.g. the `is-u` in `(?is-u)`. A
    /// `Negation` item prints as the `-` separator.
    fn fmt_flags(&mut self, ast: &ast::Flags) -> fmt::Result {
        use crate::ast::{Flag, FlagsItemKind};

        for item in &ast.items {
            match item.kind {
                FlagsItemKind::Negation => self.wtr.write_str("-"),
                FlagsItemKind::Flag(ref flag) => match *flag {
                    Flag::CaseInsensitive => self.wtr.write_str("i"),
                    Flag::MultiLine => self.wtr.write_str("m"),
                    Flag::DotMatchesNewLine => self.wtr.write_str("s"),
                    Flag::SwapGreed => self.wtr.write_str("U"),
                    Flag::Unicode => self.wtr.write_str("u"),
                    Flag::CRLF => self.wtr.write_str("R"),
                    Flag::IgnoreWhitespace => self.wtr.write_str("x"),
                },
            }?;
        }
        Ok(())
    }

    /// Writes the opening delimiter of a bracketed class: `[` or `[^`.
    fn fmt_class_bracketed_pre(
        &mut self,
        ast: &ast::ClassBracketed,
    ) -> fmt::Result {
        if ast.negated {
            self.wtr.write_str("[^")
        } else {
            self.wtr.write_str("[")
        }
    }

    /// Writes the closing delimiter of a bracketed class.
    fn fmt_class_bracketed_post(
        &mut self,
        _ast: &ast::ClassBracketed,
    ) -> fmt::Result {
        self.wtr.write_str("]")
    }

    /// Writes a class set operator: `&&`, `--` or `~~`.
    fn fmt_class_set_binary_op_kind(
        &mut self,
        ast: &ast::ClassSetBinaryOpKind,
    ) -> fmt::Result {
        use crate::ast::ClassSetBinaryOpKind::*;
        match *ast {
            Intersection => self.wtr.write_str("&&"),
            Difference => self.wtr.write_str("--"),
            SymmetricDifference => self.wtr.write_str("~~"),
        }
    }

    /// Writes a Perl-style class, e.g. `\d` or its negation `\D`.
    ///
    /// The negated (guarded) arms must precede the unguarded ones.
    fn fmt_class_perl(&mut self, ast: &ast::ClassPerl) -> fmt::Result {
        use crate::ast::ClassPerlKind::*;
        match ast.kind {
            Digit if ast.negated => self.wtr.write_str(r"\D"),
            Digit => self.wtr.write_str(r"\d"),
            Space if ast.negated => self.wtr.write_str(r"\S"),
            Space => self.wtr.write_str(r"\s"),
            Word if ast.negated => self.wtr.write_str(r"\W"),
            Word => self.wtr.write_str(r"\w"),
        }
    }

    /// Writes a POSIX class, e.g. `[:alnum:]` or its negation `[:^alnum:]`.
    ///
    /// As in `fmt_class_perl`, the negated (guarded) arms must precede the
    /// unguarded ones.
    fn fmt_class_ascii(&mut self, ast: &ast::ClassAscii) -> fmt::Result {
        use crate::ast::ClassAsciiKind::*;
        match ast.kind {
            Alnum if ast.negated => self.wtr.write_str("[:^alnum:]"),
            Alnum => self.wtr.write_str("[:alnum:]"),
            Alpha if ast.negated => self.wtr.write_str("[:^alpha:]"),
            Alpha => self.wtr.write_str("[:alpha:]"),
            Ascii if ast.negated => self.wtr.write_str("[:^ascii:]"),
            Ascii => self.wtr.write_str("[:ascii:]"),
            Blank if ast.negated => self.wtr.write_str("[:^blank:]"),
            Blank => self.wtr.write_str("[:blank:]"),
            Cntrl if ast.negated => self.wtr.write_str("[:^cntrl:]"),
            Cntrl => self.wtr.write_str("[:cntrl:]"),
            Digit if ast.negated => self.wtr.write_str("[:^digit:]"),
            Digit => self.wtr.write_str("[:digit:]"),
            Graph if ast.negated => self.wtr.write_str("[:^graph:]"),
            Graph => self.wtr.write_str("[:graph:]"),
            Lower if ast.negated => self.wtr.write_str("[:^lower:]"),
            Lower => self.wtr.write_str("[:lower:]"),
            Print if ast.negated => self.wtr.write_str("[:^print:]"),
            Print => self.wtr.write_str("[:print:]"),
            Punct if ast.negated => self.wtr.write_str("[:^punct:]"),
            Punct => self.wtr.write_str("[:punct:]"),
            Space if ast.negated => self.wtr.write_str("[:^space:]"),
            Space => self.wtr.write_str("[:space:]"),
            Upper if ast.negated => self.wtr.write_str("[:^upper:]"),
            Upper => self.wtr.write_str("[:upper:]"),
            Word if ast.negated => self.wtr.write_str("[:^word:]"),
            Word => self.wtr.write_str("[:word:]"),
            Xdigit if ast.negated => self.wtr.write_str("[:^xdigit:]"),
            Xdigit => self.wtr.write_str("[:xdigit:]"),
        }
    }

    /// Writes a Unicode class, e.g. `\pL`, `\p{Greek}` or `\P{sc:Greek}`.
    fn fmt_class_unicode(&mut self, ast: &ast::ClassUnicode) -> fmt::Result {
        use crate::ast::ClassUnicodeKind::*;
        use crate::ast::ClassUnicodeOpKind::*;

        // `\P` is the negated form of `\p`.
        if ast.negated {
            self.wtr.write_str(r"\P")?;
        } else {
            self.wtr.write_str(r"\p")?;
        }
        match ast.kind {
            OneLetter(c) => self.wtr.write_char(c),
            Named(ref x) => write!(self.wtr, "{{{}}}", x),
            NamedValue { op: Equal, ref name, ref value } => {
                write!(self.wtr, "{{{}={}}}", name, value)
            }
            NamedValue { op: Colon, ref name, ref value } => {
                write!(self.wtr, "{{{}:{}}}", name, value)
            }
            NamedValue { op: NotEqual, ref name, ref value } => {
                write!(self.wtr, "{{{}!={}}}", name, value)
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use alloc::string::String;

    use crate::ast::parse::ParserBuilder;

    use super::*;

    // Asserts that parsing `given` and printing the resulting AST yields
    // `given` back, byte for byte.
    fn roundtrip(given: &str) {
        roundtrip_with(|b| b, given);
    }

    // Like `roundtrip`, but permits configuring the parser first (e.g. to
    // enable octal escapes).
    fn roundtrip_with<F>(mut f: F, given: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let ast = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&ast, &mut dst).unwrap();
        assert_eq!(given, dst);
    }

    #[test]
    fn print_literal() {
        roundtrip("a");
        roundtrip(r"\[");
        // Octal escapes are only recognized when explicitly enabled.
        roundtrip_with(|b| b.octal(true), r"\141");
        roundtrip(r"\x61");
        roundtrip(r"\x7F");
        roundtrip(r"\u0061");
        roundtrip(r"\U00000061");
        roundtrip(r"\x{61}");
        roundtrip(r"\x{7F}");
        roundtrip(r"\u{61}");
        roundtrip(r"\U{61}");
        roundtrip(r"\a");
        roundtrip(r"\f");
        roundtrip(r"\t");
        roundtrip(r"\n");
        roundtrip(r"\r");
        roundtrip(r"\v");
        // An escaped space is only meaningful in whitespace-insensitive mode.
        roundtrip(r"(?x)\ ");
    }

    #[test]
    fn print_dot() {
        roundtrip(".");
    }

    #[test]
    fn print_concat() {
        roundtrip("ab");
        roundtrip("abcde");
        roundtrip("a(bcd)ef");
    }

    #[test]
    fn print_alternation() {
        roundtrip("a|b");
        roundtrip("a|b|c|d|e");
        roundtrip("|a|b|c|d|e");
        roundtrip("|a|b|c|d|e|");
        roundtrip("a(b|c|d)|e|f");
    }

    #[test]
    fn print_assertion() {
        roundtrip(r"^");
        roundtrip(r"$");
        roundtrip(r"\A");
        roundtrip(r"\z");
        roundtrip(r"\b");
        roundtrip(r"\B");
    }

    #[test]
    fn print_repetition() {
        roundtrip("a?");
        roundtrip("a??");
        roundtrip("a*");
        roundtrip("a*?");
        roundtrip("a+");
        roundtrip("a+?");
        roundtrip("a{5}");
        roundtrip("a{5}?");
        roundtrip("a{5,}");
        roundtrip("a{5,}?");
        roundtrip("a{5,10}");
        roundtrip("a{5,10}?");
    }

    #[test]
    fn print_flags() {
        roundtrip("(?i)");
        roundtrip("(?-i)");
        roundtrip("(?s-i)");
        roundtrip("(?-si)");
        roundtrip("(?siUmux)");
    }

    #[test]
    fn print_group() {
        roundtrip("(?i:a)");
        roundtrip("(?P<foo>a)");
        roundtrip("(?<foo>a)");
        roundtrip("(a)");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[abc]");
        roundtrip(r"[a-z]");
        roundtrip(r"[^a-z]");
        roundtrip(r"[a-z0-9]");
        roundtrip(r"[-a-z0-9]");
        roundtrip(r"[-a-z0-9]");
        roundtrip(r"[a-z0-9---]");
        roundtrip(r"[a-z&&m-n]");
        roundtrip(r"[[a-z&&m-n]]");
        roundtrip(r"[a-z--m-n]");
        roundtrip(r"[a-z~~m-n]");
        roundtrip(r"[a-z[0-9]]");
        roundtrip(r"[a-z[^0-9]]");

        roundtrip(r"\d");
        roundtrip(r"\D");
        roundtrip(r"\s");
        roundtrip(r"\S");
        roundtrip(r"\w");
        roundtrip(r"\W");

        roundtrip(r"[[:alnum:]]");
        roundtrip(r"[[:^alnum:]]");
        roundtrip(r"[[:alpha:]]");
        roundtrip(r"[[:^alpha:]]");
        roundtrip(r"[[:ascii:]]");
        roundtrip(r"[[:^ascii:]]");
        roundtrip(r"[[:blank:]]");
        roundtrip(r"[[:^blank:]]");
        roundtrip(r"[[:cntrl:]]");
        roundtrip(r"[[:^cntrl:]]");
        roundtrip(r"[[:digit:]]");
        roundtrip(r"[[:^digit:]]");
        roundtrip(r"[[:graph:]]");
        roundtrip(r"[[:^graph:]]");
        roundtrip(r"[[:lower:]]");
        roundtrip(r"[[:^lower:]]");
        roundtrip(r"[[:print:]]");
        roundtrip(r"[[:^print:]]");
        roundtrip(r"[[:punct:]]");
        roundtrip(r"[[:^punct:]]");
        roundtrip(r"[[:space:]]");
        roundtrip(r"[[:^space:]]");
        roundtrip(r"[[:upper:]]");
        roundtrip(r"[[:^upper:]]");
        roundtrip(r"[[:word:]]");
        roundtrip(r"[[:^word:]]");
        roundtrip(r"[[:xdigit:]]");
        roundtrip(r"[[:^xdigit:]]");

        roundtrip(r"\pL");
        roundtrip(r"\PL");
        roundtrip(r"\p{L}");
        roundtrip(r"\P{L}");
        roundtrip(r"\p{X=Y}");
        roundtrip(r"\P{X=Y}");
        roundtrip(r"\p{X:Y}");
        roundtrip(r"\P{X:Y}");
        roundtrip(r"\p{X!=Y}");
        roundtrip(r"\P{X!=Y}");
    }
}

522
vendor/regex-syntax/src/ast/visitor.rs vendored Normal file
View File

@@ -0,0 +1,522 @@
use alloc::{vec, vec::Vec};
use crate::ast::{self, Ast};
/// A trait for visiting an abstract syntax tree (AST) in depth first order.
///
/// The principle aim of this trait is to enable callers to perform case
/// analysis on an abstract syntax tree without necessarily using recursion.
/// In particular, this permits callers to do case analysis with constant stack
/// usage, which can be important since the size of an abstract syntax tree
/// may be proportional to end user input.
///
/// Typical usage of this trait involves providing an implementation and then
/// running it using the [`visit`] function.
///
/// Note that the abstract syntax tree for a regular expression is quite
/// complex. Unless you specifically need it, you might be able to use the much
/// simpler [high-level intermediate representation](crate::hir::Hir) and its
/// [corresponding `Visitor` trait](crate::hir::Visitor) instead.
///
/// Every method except [`Visitor::finish`] has a default no-op
/// implementation, so implementors only need to override the hooks they care
/// about.
pub trait Visitor {
    /// The result of visiting an AST.
    type Output;
    /// An error that visiting an AST might return.
    type Err;

    /// All implementors of `Visitor` must provide a `finish` method, which
    /// yields the result of visiting the AST or an error.
    fn finish(self) -> Result<Self::Output, Self::Err>;

    /// This method is called before beginning traversal of the AST.
    fn start(&mut self) {}

    /// This method is called on an `Ast` before descending into child `Ast`
    /// nodes.
    fn visit_pre(&mut self, _ast: &Ast) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on an `Ast` after descending all of its child
    /// `Ast` nodes.
    fn visit_post(&mut self, _ast: &Ast) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called between child nodes of an
    /// [`Alternation`](ast::Alternation).
    fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called between child nodes of a concatenation.
    fn visit_concat_in(&mut self) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on every [`ClassSetItem`](ast::ClassSetItem)
    /// before descending into child nodes.
    fn visit_class_set_item_pre(
        &mut self,
        _ast: &ast::ClassSetItem,
    ) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on every [`ClassSetItem`](ast::ClassSetItem)
    /// after descending into child nodes.
    fn visit_class_set_item_post(
        &mut self,
        _ast: &ast::ClassSetItem,
    ) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on every
    /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) before descending into
    /// child nodes.
    fn visit_class_set_binary_op_pre(
        &mut self,
        _ast: &ast::ClassSetBinaryOp,
    ) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on every
    /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) after descending into child
    /// nodes.
    fn visit_class_set_binary_op_post(
        &mut self,
        _ast: &ast::ClassSetBinaryOp,
    ) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called between the left hand and right hand child nodes
    /// of a [`ClassSetBinaryOp`](ast::ClassSetBinaryOp).
    fn visit_class_set_binary_op_in(
        &mut self,
        _ast: &ast::ClassSetBinaryOp,
    ) -> Result<(), Self::Err> {
        Ok(())
    }
}
/// Executes an implementation of `Visitor` in constant stack space.
///
/// This function will visit every node in the given `Ast` while calling the
/// appropriate methods provided by the [`Visitor`] trait.
///
/// The primary use case for this method is when one wants to perform case
/// analysis over an `Ast` without using a stack size proportional to the depth
/// of the `Ast`. Namely, this method will instead use constant stack size, but
/// will use heap space proportional to the size of the `Ast`. This may be
/// desirable in cases where the size of `Ast` is proportional to end user
/// input.
///
/// If the visitor returns an error at any point, then visiting is stopped and
/// the error is returned.
pub fn visit<V: Visitor>(ast: &Ast, visitor: V) -> Result<V::Output, V::Err> {
    let mut driver = HeapVisitor::new();
    driver.visit(ast, visitor)
}
/// HeapVisitor visits every item in an `Ast` recursively using constant stack
/// size and a heap size proportional to the size of the `Ast`.
///
/// `visit` clears both stacks before starting, so a single value can drive
/// multiple traversals.
struct HeapVisitor<'a> {
    /// A stack of `Ast` nodes. This is roughly analogous to the call stack
    /// used in a typical recursive visitor.
    stack: Vec<(&'a Ast, Frame<'a>)>,
    /// Similar to the `Ast` stack above, but is used only for character
    /// classes. In particular, character classes embed their own mini
    /// recursive syntax.
    stack_class: Vec<(ClassInduct<'a>, ClassFrame<'a>)>,
}
/// Represents a single stack frame while performing structural induction over
/// an `Ast`.
enum Frame<'a> {
    /// A stack frame allocated just before descending into a repetition
    /// operator's child node. (A repetition has exactly one child.)
    Repetition(&'a ast::Repetition),
    /// A stack frame allocated just before descending into a group's child
    /// node. (A group has exactly one child.)
    Group(&'a ast::Group),
    /// The stack frame used while visiting every child node of a concatenation
    /// of expressions.
    Concat {
        /// The child node we are currently visiting.
        head: &'a Ast,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [Ast],
    },
    /// The stack frame used while visiting every child node of an alternation
    /// of expressions.
    Alternation {
        /// The child node we are currently visiting.
        head: &'a Ast,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [Ast],
    },
}
/// Represents a single stack frame while performing structural induction over
/// a character class.
enum ClassFrame<'a> {
    /// The stack frame used while visiting every child node of a union of
    /// character class items.
    Union {
        /// The child node we are currently visiting.
        head: &'a ast::ClassSetItem,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [ast::ClassSetItem],
    },
    /// The stack frame used while visiting a binary class operation.
    Binary { op: &'a ast::ClassSetBinaryOp },
    /// A stack frame allocated just before descending into a binary operator's
    /// left hand child node.
    BinaryLHS {
        op: &'a ast::ClassSetBinaryOp,
        lhs: &'a ast::ClassSet,
        rhs: &'a ast::ClassSet,
    },
    /// A stack frame allocated just before descending into a binary operator's
    /// right hand child node.
    BinaryRHS { op: &'a ast::ClassSetBinaryOp, rhs: &'a ast::ClassSet },
}
/// A representation of the inductive step when performing structural induction
/// over a character class.
///
/// Note that there is no analogous explicit type for the inductive step for
/// `Ast` nodes because the inductive step is just an `Ast`. For character
/// classes, the inductive step can produce one of two possible child nodes:
/// an item or a binary operation. (An item cannot be a binary operation
/// because that would imply binary operations can be unioned in the concrete
/// syntax, which is not possible.)
enum ClassInduct<'a> {
    /// A class set item, e.g. a literal, range or nested bracketed class.
    Item(&'a ast::ClassSetItem),
    /// A binary operation over two class sets, e.g. `&&` or `--`.
    BinaryOp(&'a ast::ClassSetBinaryOp),
}
impl<'a> HeapVisitor<'a> {
    /// Creates a new driver with empty stacks.
    fn new() -> HeapVisitor<'a> {
        HeapVisitor { stack: vec![], stack_class: vec![] }
    }

    /// Drives `visitor` over `ast` iteratively, using the explicit stacks in
    /// `self` in place of the machine call stack.
    fn visit<V: Visitor>(
        &mut self,
        mut ast: &'a Ast,
        mut visitor: V,
    ) -> Result<V::Output, V::Err> {
        // Reset both stacks so this value can be reused for a new traversal.
        self.stack.clear();
        self.stack_class.clear();

        visitor.start();
        loop {
            visitor.visit_pre(ast)?;
            if let Some(x) = self.induct(ast, &mut visitor)? {
                let child = x.child();
                self.stack.push((ast, x));
                ast = child;
                continue;
            }
            // No induction means we have a base case, so we can post visit
            // it now.
            visitor.visit_post(ast)?;

            // At this point, we now try to pop our call stack until it is
            // either empty or we hit another inductive case.
            loop {
                let (post_ast, frame) = match self.stack.pop() {
                    None => return visitor.finish(),
                    Some((post_ast, frame)) => (post_ast, frame),
                };
                // If this is a concat/alternate, then we might have additional
                // inductive steps to process.
                if let Some(x) = self.pop(frame) {
                    match x {
                        Frame::Alternation { .. } => {
                            visitor.visit_alternation_in()?;
                        }
                        Frame::Concat { .. } => {
                            visitor.visit_concat_in()?;
                        }
                        _ => {}
                    }
                    ast = x.child();
                    self.stack.push((post_ast, x));
                    break;
                }
                // Otherwise, we've finished visiting all the child nodes for
                // this AST, so we can post visit it now.
                visitor.visit_post(post_ast)?;
            }
        }
    }

    /// Build a stack frame for the given AST if one is needed (which occurs if
    /// and only if there are child nodes in the AST). Otherwise, return None.
    ///
    /// If this visits a class, then the underlying visitor implementation may
    /// return an error which will be passed on here.
    fn induct<V: Visitor>(
        &mut self,
        ast: &'a Ast,
        visitor: &mut V,
    ) -> Result<Option<Frame<'a>>, V::Err> {
        Ok(match *ast {
            // Character classes are traversed in full by their own dedicated
            // loop (`visit_class`), so no frame is pushed for them here.
            Ast::ClassBracketed(ref x) => {
                self.visit_class(x, visitor)?;
                None
            }
            Ast::Repetition(ref x) => Some(Frame::Repetition(x)),
            Ast::Group(ref x) => Some(Frame::Group(x)),
            Ast::Concat(ref x) if x.asts.is_empty() => None,
            Ast::Concat(ref x) => {
                Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] })
            }
            Ast::Alternation(ref x) if x.asts.is_empty() => None,
            Ast::Alternation(ref x) => Some(Frame::Alternation {
                head: &x.asts[0],
                tail: &x.asts[1..],
            }),
            _ => None,
        })
    }

    /// Pops the given frame. If the frame has an additional inductive step,
    /// then return it, otherwise return `None`.
    fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
        match induct {
            // Repetitions and groups have exactly one child, so popping their
            // frame always completes them.
            Frame::Repetition(_) => None,
            Frame::Group(_) => None,
            Frame::Concat { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(Frame::Concat { head: &tail[0], tail: &tail[1..] })
                }
            }
            Frame::Alternation { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(Frame::Alternation {
                        head: &tail[0],
                        tail: &tail[1..],
                    })
                }
            }
        }
    }

    /// Traverses a bracketed character class in full, using `stack_class` in
    /// the same way `visit` uses `stack`.
    fn visit_class<V: Visitor>(
        &mut self,
        ast: &'a ast::ClassBracketed,
        visitor: &mut V,
    ) -> Result<(), V::Err> {
        let mut ast = ClassInduct::from_bracketed(ast);
        loop {
            self.visit_class_pre(&ast, visitor)?;
            if let Some(x) = self.induct_class(&ast) {
                let child = x.child();
                self.stack_class.push((ast, x));
                ast = child;
                continue;
            }
            self.visit_class_post(&ast, visitor)?;

            // At this point, we now try to pop our call stack until it is
            // either empty or we hit another inductive case.
            loop {
                let (post_ast, frame) = match self.stack_class.pop() {
                    None => return Ok(()),
                    Some((post_ast, frame)) => (post_ast, frame),
                };
                // If this is a union or a binary op, then we might have
                // additional inductive steps to process.
                if let Some(x) = self.pop_class(frame) {
                    // Crossing from the LHS to the RHS of a binary operation
                    // is the only "in between" hook for classes.
                    if let ClassFrame::BinaryRHS { ref op, .. } = x {
                        visitor.visit_class_set_binary_op_in(op)?;
                    }
                    ast = x.child();
                    self.stack_class.push((post_ast, x));
                    break;
                }
                // Otherwise, we've finished visiting all the child nodes for
                // this class node, so we can post visit it now.
                self.visit_class_post(&post_ast, visitor)?;
            }
        }
    }

    /// Call the appropriate `Visitor` methods given an inductive step.
    fn visit_class_pre<V: Visitor>(
        &self,
        ast: &ClassInduct<'a>,
        visitor: &mut V,
    ) -> Result<(), V::Err> {
        match *ast {
            ClassInduct::Item(item) => {
                visitor.visit_class_set_item_pre(item)?;
            }
            ClassInduct::BinaryOp(op) => {
                visitor.visit_class_set_binary_op_pre(op)?;
            }
        }
        Ok(())
    }

    /// Call the appropriate `Visitor` methods given an inductive step.
    fn visit_class_post<V: Visitor>(
        &self,
        ast: &ClassInduct<'a>,
        visitor: &mut V,
    ) -> Result<(), V::Err> {
        match *ast {
            ClassInduct::Item(item) => {
                visitor.visit_class_set_item_post(item)?;
            }
            ClassInduct::BinaryOp(op) => {
                visitor.visit_class_set_binary_op_post(op)?;
            }
        }
        Ok(())
    }

    /// Build a stack frame for the given class node if one is needed (which
    /// occurs if and only if there are child nodes). Otherwise, return None.
    fn induct_class(&self, ast: &ClassInduct<'a>) -> Option<ClassFrame<'a>> {
        match *ast {
            ClassInduct::Item(&ast::ClassSetItem::Bracketed(ref x)) => {
                match x.kind {
                    // A single item is modeled as a one-element union.
                    ast::ClassSet::Item(ref item) => {
                        Some(ClassFrame::Union { head: item, tail: &[] })
                    }
                    ast::ClassSet::BinaryOp(ref op) => {
                        Some(ClassFrame::Binary { op })
                    }
                }
            }
            ClassInduct::Item(&ast::ClassSetItem::Union(ref x)) => {
                if x.items.is_empty() {
                    None
                } else {
                    Some(ClassFrame::Union {
                        head: &x.items[0],
                        tail: &x.items[1..],
                    })
                }
            }
            ClassInduct::BinaryOp(op) => {
                Some(ClassFrame::BinaryLHS { op, lhs: &op.lhs, rhs: &op.rhs })
            }
            _ => None,
        }
    }

    /// Pops the given frame. If the frame has an additional inductive step,
    /// then return it, otherwise return `None`.
    fn pop_class(&self, induct: ClassFrame<'a>) -> Option<ClassFrame<'a>> {
        match induct {
            ClassFrame::Union { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(ClassFrame::Union {
                        head: &tail[0],
                        tail: &tail[1..],
                    })
                }
            }
            ClassFrame::Binary { .. } => None,
            // After the LHS finishes, queue a frame for the RHS.
            ClassFrame::BinaryLHS { op, rhs, .. } => {
                Some(ClassFrame::BinaryRHS { op, rhs })
            }
            ClassFrame::BinaryRHS { .. } => None,
        }
    }
}
impl<'a> Frame<'a> {
    /// Returns the child AST node that this frame's next inductive step
    /// descends into.
    fn child(&self) -> &'a Ast {
        match *self {
            Frame::Repetition(rep) => &rep.ast,
            Frame::Group(group) => &group.ast,
            // Both list-like frames track the current child in `head`.
            Frame::Concat { head, .. } | Frame::Alternation { head, .. } => {
                head
            }
        }
    }
}
impl<'a> ClassFrame<'a> {
    /// Returns the child class node that this frame's next inductive step
    /// descends into.
    fn child(&self) -> ClassInduct<'a> {
        match *self {
            ClassFrame::Union { head, .. } => ClassInduct::Item(head),
            ClassFrame::Binary { op, .. } => ClassInduct::BinaryOp(op),
            ClassFrame::BinaryLHS { ref lhs, .. } => ClassInduct::from_set(lhs),
            ClassFrame::BinaryRHS { ref rhs, .. } => ClassInduct::from_set(rhs),
        }
    }
}
impl<'a> ClassInduct<'a> {
    /// Builds the initial inductive step from a bracketed class.
    fn from_bracketed(ast: &'a ast::ClassBracketed) -> ClassInduct<'a> {
        ClassInduct::from_set(&ast.kind)
    }

    /// Builds an inductive step from an arbitrary class set.
    fn from_set(ast: &'a ast::ClassSet) -> ClassInduct<'a> {
        match ast {
            ast::ClassSet::Item(item) => ClassInduct::Item(item),
            ast::ClassSet::BinaryOp(op) => ClassInduct::BinaryOp(op),
        }
    }
}
impl<'a> core::fmt::Debug for ClassFrame<'a> {
    /// Formats just the variant name; the payloads are too noisy to be
    /// useful in traces.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.write_str(match *self {
            ClassFrame::Union { .. } => "Union",
            ClassFrame::Binary { .. } => "Binary",
            ClassFrame::BinaryLHS { .. } => "BinaryLHS",
            ClassFrame::BinaryRHS { .. } => "BinaryRHS",
        })
    }
}
impl<'a> core::fmt::Debug for ClassInduct<'a> {
    /// Formats the variant name plus the kind of its payload, e.g.
    /// `Item(Literal)` or `BinaryOp(Difference)`.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let name = match *self {
            ClassInduct::Item(it) => match *it {
                ast::ClassSetItem::Empty(_) => "Item(Empty)",
                ast::ClassSetItem::Literal(_) => "Item(Literal)",
                ast::ClassSetItem::Range(_) => "Item(Range)",
                ast::ClassSetItem::Ascii(_) => "Item(Ascii)",
                ast::ClassSetItem::Perl(_) => "Item(Perl)",
                ast::ClassSetItem::Unicode(_) => "Item(Unicode)",
                ast::ClassSetItem::Bracketed(_) => "Item(Bracketed)",
                ast::ClassSetItem::Union(_) => "Item(Union)",
            },
            ClassInduct::BinaryOp(it) => match it.kind {
                ast::ClassSetBinaryOpKind::Intersection => {
                    "BinaryOp(Intersection)"
                }
                ast::ClassSetBinaryOpKind::Difference => {
                    "BinaryOp(Difference)"
                }
                ast::ClassSetBinaryOpKind::SymmetricDifference => {
                    "BinaryOp(SymmetricDifference)"
                }
            },
        };
        f.write_str(name)
    }
}

107
vendor/regex-syntax/src/debug.rs vendored Normal file
View File

@@ -0,0 +1,107 @@
/// A type that wraps a single byte with a convenient fmt::Debug impl that
/// escapes the byte.
pub(crate) struct Byte(pub(crate) u8);

impl core::fmt::Debug for Byte {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Special case ASCII space. It's too hard to read otherwise, so
        // put quotes around it.
        if self.0 == b' ' {
            return f.write_str("' '");
        }
        // 10 bytes is enough to cover any output from ascii::escape_default.
        let mut buf = [0u8; 10];
        let mut len = 0;
        for (i, mut b) in core::ascii::escape_default(self.0).enumerate() {
            // Capitalize hex escapes, i.e. rewrite \xab to \xAB. The hex
            // digits of such an escape start at index 2 (after `\x`).
            if i >= 2 && b.is_ascii_lowercase() && b <= b'f' {
                b = b.to_ascii_uppercase();
            }
            buf[len] = b;
            len += 1;
        }
        // escape_default only yields printable ASCII, so this cannot fail.
        f.write_str(core::str::from_utf8(&buf[..len]).unwrap())
    }
}
/// A type that provides a human readable debug impl for arbitrary bytes.
///
/// This generally works best when the bytes are presumed to be mostly UTF-8,
/// but will work for anything.
///
/// N.B. This is copied nearly verbatim from regex-automata. Sigh.
pub(crate) struct Bytes<'a>(pub(crate) &'a [u8]);

impl<'a> core::fmt::Debug for Bytes<'a> {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(f, "\"")?;
        // This is a sad re-implementation of a similar impl found in bstr.
        let mut bytes = self.0;
        // `utf8_decode` returns None only when `bytes` is empty.
        while let Some(result) = utf8_decode(bytes) {
            let ch = match result {
                Ok(ch) => ch,
                Err(byte) => {
                    // Invalid UTF-8: emit the offending byte as \xNN and
                    // resynchronize one byte later.
                    write!(f, r"\x{:02x}", byte)?;
                    bytes = &bytes[1..];
                    continue;
                }
            };
            bytes = &bytes[ch.len_utf8()..];
            match ch {
                '\0' => write!(f, "\\0")?,
                // ASCII control characters except \0, \n, \r, \t
                // NOTE(review): 0x1a-0x1f are not in these ranges and thus
                // fall through to `escape_debug` below — confirm intended.
                '\x01'..='\x08'
                | '\x0b'
                | '\x0c'
                | '\x0e'..='\x19'
                | '\x7f' => {
                    write!(f, "\\x{:02x}", u32::from(ch))?;
                }
                // The leading patterns here are redundant with `_`; they are
                // kept as documentation of the common escaped characters.
                '\n' | '\r' | '\t' | _ => {
                    write!(f, "{}", ch.escape_debug())?;
                }
            }
        }
        write!(f, "\"")?;
        Ok(())
    }
}
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the given
/// byte slice, then the first byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
pub(crate) fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
    // Sequence length implied by a UTF-8 lead byte, or None for bytes that
    // cannot start a sequence (continuation bytes and 0xF8-0xFF).
    fn len(byte: u8) -> Option<usize> {
        match byte {
            0x00..=0x7F => Some(1),
            // Continuation bytes (10xxxxxx) cannot start a sequence.
            0x80..=0xBF => None,
            0xC0..=0xDF => Some(2),
            0xE0..=0xEF => Some(3),
            0xF0..=0xF7 => Some(4),
            _ => None,
        }
    }

    let first = *bytes.first()?;
    let n = match len(first) {
        // Invalid lead byte.
        None => return Some(Err(first)),
        // Fast path: ASCII needs no validation.
        Some(1) => return Some(Ok(char::from(first))),
        // The sequence is truncated.
        Some(n) if n > bytes.len() => return Some(Err(first)),
        Some(n) => n,
    };
    // Let the standard library validate the multi-byte sequence.
    match core::str::from_utf8(&bytes[..n]) {
        Ok(s) => Some(Ok(s.chars().next().unwrap())),
        Err(_) => Some(Err(first)),
    }
}

8
vendor/regex-syntax/src/either.rs vendored Normal file
View File

@@ -0,0 +1,8 @@
/// A simple binary sum type.
///
/// This is occasionally useful in an ad hoc fashion.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Either<Left, Right> {
    /// The first of the two possible cases.
    Left(Left),
    /// The second of the two possible cases.
    Right(Right),
}

311
vendor/regex-syntax/src/error.rs vendored Normal file
View File

@@ -0,0 +1,311 @@
use alloc::{
format,
string::{String, ToString},
vec,
vec::Vec,
};
use crate::{ast, hir};
/// This error type encompasses any error that can be returned by this crate.
///
/// This error type is marked as `non_exhaustive`. This means that adding a
/// new variant is not considered a breaking change. Callers should therefore
/// always include a wildcard arm when matching on it.
#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Error {
    /// An error that occurred while translating concrete syntax into abstract
    /// syntax (AST).
    Parse(ast::Error),
    /// An error that occurred while translating abstract syntax into a high
    /// level intermediate representation (HIR).
    Translate(hir::Error),
}
impl From<ast::Error> for Error {
    /// Wrap a parse-stage (AST) error in the crate-wide error type.
    fn from(err: ast::Error) -> Error {
        Self::Parse(err)
    }
}
impl From<hir::Error> for Error {
    /// Wrap a translation-stage (HIR) error in the crate-wide error type.
    fn from(err: hir::Error) -> Error {
        Self::Translate(err)
    }
}
// The `std::error::Error` impl is gated on the `std` feature because the
// trait itself lives in `std`. All error information is surfaced through the
// `Display`/`Debug` impls, so the body is empty.
#[cfg(feature = "std")]
impl std::error::Error for Error {}
impl core::fmt::Display for Error {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match *self {
Error::Parse(ref x) => x.fmt(f),
Error::Translate(ref x) => x.fmt(f),
}
}
}
/// A helper type for formatting nice error messages.
///
/// This type is responsible for reporting regex parse errors in a nice human
/// readable format. Most of its complexity is from interspersing notational
/// markers pointing out the position where an error occurred. The rendering
/// itself happens in this type's `Display` impl.
#[derive(Debug)]
pub struct Formatter<'e, E> {
    /// The original regex pattern in which the error occurred.
    pattern: &'e str,
    /// The error kind. It must impl fmt::Display.
    err: &'e E,
    /// The primary span of the error.
    span: &'e ast::Span,
    /// An auxiliary and optional span, in case the error needs to point to
    /// two locations (e.g., when reporting a duplicate capture group name).
    aux_span: Option<&'e ast::Span>,
}
impl<'e> From<&'e ast::Error> for Formatter<'e, ast::ErrorKind> {
    /// Pull apart a parse error into the pieces needed for pretty-printing.
    fn from(err: &'e ast::Error) -> Self {
        let pattern = err.pattern();
        let kind = err.kind();
        let span = err.span();
        let aux_span = err.auxiliary_span();
        Formatter { pattern, err: kind, span, aux_span }
    }
}
impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> {
    /// Pull apart a translation error. HIR errors never carry an auxiliary
    /// span, so that slot is always empty.
    fn from(err: &'e hir::Error) -> Self {
        let pattern = err.pattern();
        let kind = err.kind();
        let span = err.span();
        Formatter { pattern, err: kind, span, aux_span: None }
    }
}
impl<'e, E: core::fmt::Display> core::fmt::Display for Formatter<'e, E> {
    /// Render the full, human readable error message: the (possibly
    /// multi-line) pattern with `^` markers under each error span, followed
    /// by the error description itself.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let spans = Spans::from_formatter(self);
        if self.pattern.contains('\n') {
            let divider = repeat_char('~', 79);
            writeln!(f, "regex parse error:")?;
            writeln!(f, "{}", divider)?;
            let notated = spans.notate();
            write!(f, "{}", notated)?;
            writeln!(f, "{}", divider)?;
            // If we have error spans that cover multiple lines, then we just
            // note the line numbers.
            if !spans.multi_line.is_empty() {
                let mut notes = vec![];
                for span in &spans.multi_line {
                    notes.push(format!(
                        "on line {} (column {}) through line {} (column {})",
                        span.start.line,
                        span.start.column,
                        span.end.line,
                        span.end.column - 1
                    ));
                }
                writeln!(f, "{}", notes.join("\n"))?;
            }
            write!(f, "error: {}", self.err)?;
        } else {
            writeln!(f, "regex parse error:")?;
            // Reuse the spans computed above instead of rebuilding them;
            // `Spans::from_formatter` is pure, so the output is identical.
            let notated = spans.notate();
            write!(f, "{}", notated)?;
            write!(f, "error: {}", self.err)?;
        }
        Ok(())
    }
}
/// This type represents an arbitrary number of error spans in a way that makes
/// it convenient to notate the regex pattern. ("Notate" means "point out
/// exactly where the error occurred in the regex pattern.")
///
/// Technically, we can only ever have two spans given our current error
/// structure. However, after toiling with a specific algorithm for handling
/// two spans, it became obvious that an algorithm to handle an arbitrary
/// number of spans was actually much simpler.
struct Spans<'p> {
    /// The original regex pattern string.
    pattern: &'p str,
    /// The total width that should be used for line numbers. The width is
    /// used for left padding the line numbers for alignment.
    ///
    /// A value of `0` means line numbers should not be displayed. That is,
    /// the pattern is itself only one line.
    line_number_width: usize,
    /// All error spans that occur on a single line. This sequence always has
    /// length equivalent to the number of lines in `pattern`, where the index
    /// of the sequence represents a line number, starting at `0`. The spans
    /// in each line are sorted in ascending order.
    by_line: Vec<Vec<ast::Span>>,
    /// All error spans that occur over one or more lines. That is, the start
    /// and end position of the span have different line numbers. The spans are
    /// sorted in ascending order.
    multi_line: Vec<ast::Span>,
}
impl<'p> Spans<'p> {
    /// Build a sequence of spans from a formatter.
    fn from_formatter<'e, E: core::fmt::Display>(
        fmter: &'p Formatter<'e, E>,
    ) -> Spans<'p> {
        let mut line_count = fmter.pattern.lines().count();
        // If the pattern ends with a `\n` literal, then our line count is
        // off by one, since a span can occur immediately after the last `\n`,
        // which is considered to be an additional line.
        if fmter.pattern.ends_with('\n') {
            line_count += 1;
        }
        // A width of 0 suppresses line numbers entirely (single-line case).
        let line_number_width =
            if line_count <= 1 { 0 } else { line_count.to_string().len() };
        let mut spans = Spans {
            pattern: &fmter.pattern,
            line_number_width,
            by_line: vec![vec![]; line_count],
            multi_line: vec![],
        };
        spans.add(fmter.span.clone());
        if let Some(span) = fmter.aux_span {
            spans.add(span.clone());
        }
        spans
    }
    /// Add the given span to this sequence, putting it in the right place.
    fn add(&mut self, span: ast::Span) {
        // This is grossly inefficient since we sort after each add, but right
        // now, we only ever add two spans at most.
        if span.is_one_line() {
            let i = span.start.line - 1; // because lines are 1-indexed
            self.by_line[i].push(span);
            self.by_line[i].sort();
        } else {
            self.multi_line.push(span);
            self.multi_line.sort();
        }
    }
    /// Notate the pattern string with carets (`^`) pointing at each span
    /// location. This only applies to spans that occur within a single line.
    fn notate(&self) -> String {
        let mut notated = String::new();
        for (i, line) in self.pattern.lines().enumerate() {
            if self.line_number_width > 0 {
                notated.push_str(&self.left_pad_line_number(i + 1));
                notated.push_str(": ");
            } else {
                notated.push_str("    ");
            }
            notated.push_str(line);
            notated.push('\n');
            // Carets for this line, if any spans land on it.
            if let Some(notes) = self.notate_line(i) {
                notated.push_str(&notes);
                notated.push('\n');
            }
        }
        notated
    }
    /// Return notes for the line indexed at `i` (zero-based). If there are no
    /// spans for the given line, then `None` is returned. Otherwise, an
    /// appropriately space padded string with correctly positioned `^` is
    /// returned, accounting for line numbers.
    fn notate_line(&self, i: usize) -> Option<String> {
        let spans = &self.by_line[i];
        if spans.is_empty() {
            return None;
        }
        let mut notes = String::new();
        // Padding to line the carets up under the pattern text, which is
        // itself preceded by the line number gutter (or fixed indentation).
        for _ in 0..self.line_number_padding() {
            notes.push(' ');
        }
        // `pos` tracks our current (zero-based) column within the line.
        let mut pos = 0;
        for span in spans {
            // Space over to the span's start column (columns are 1-indexed).
            for _ in pos..(span.start.column - 1) {
                notes.push(' ');
                pos += 1;
            }
            // Always emit at least one caret, even for empty spans.
            let note_len = span.end.column.saturating_sub(span.start.column);
            for _ in 0..core::cmp::max(1, note_len) {
                notes.push('^');
                pos += 1;
            }
        }
        Some(notes)
    }
    /// Left pad the given line number with spaces such that it is aligned with
    /// other line numbers.
    fn left_pad_line_number(&self, n: usize) -> String {
        let n = n.to_string();
        let pad = self.line_number_width.checked_sub(n.len()).unwrap();
        let mut result = repeat_char(' ', pad);
        result.push_str(&n);
        result
    }
    /// Return the line number padding beginning at the start of each line of
    /// the pattern.
    ///
    /// If the pattern is only one line, then this returns a fixed padding
    /// for visual indentation.
    fn line_number_padding(&self) -> usize {
        if self.line_number_width == 0 {
            4
        } else {
            // Width of the number itself plus the ": " separator.
            2 + self.line_number_width
        }
    }
}
/// Returns a string consisting of `c` repeated `count` times.
///
/// The buffer is sized up front to avoid repeated reallocation while the
/// characters are pushed. (`saturating_mul` guards against a theoretical
/// overflow for absurd counts; allocation would fail there anyway.)
fn repeat_char(c: char, count: usize) -> String {
    let mut s = String::with_capacity(count.saturating_mul(c.len_utf8()));
    s.extend(core::iter::repeat(c).take(count));
    s
}
// Tests here exercise error *formatting* end to end, by parsing bad patterns
// through the public parser and checking the rendered message.
#[cfg(test)]
mod tests {
    use alloc::string::ToString;
    use crate::ast::parse::Parser;
    /// Parse `pattern`, expect failure, and assert that the rendered error
    /// message equals `expected_msg` (modulo surrounding whitespace).
    fn assert_panic_message(pattern: &str, expected_msg: &str) {
        let result = Parser::new().parse(pattern);
        match result {
            Ok(_) => {
                panic!("regex should not have parsed");
            }
            Err(err) => {
                assert_eq!(err.to_string(), expected_msg.trim());
            }
        }
    }
    // See: https://github.com/rust-lang/regex/issues/464
    #[test]
    fn regression_464() {
        let err = Parser::new().parse("a{\n").unwrap_err();
        // This test checks that the error formatter doesn't panic.
        assert!(!err.to_string().is_empty());
    }
    // See: https://github.com/rust-lang/regex/issues/545
    #[test]
    fn repetition_quantifier_expects_a_valid_decimal() {
        assert_panic_message(
            r"\\u{[^}]*}",
            r#"
regex parse error:
    \\u{[^}]*}
        ^
error: repetition quantifier expects a valid decimal
"#,
        );
    }
}

564
vendor/regex-syntax/src/hir/interval.rs vendored Normal file
View File

@@ -0,0 +1,564 @@
use core::{char, cmp, fmt::Debug, slice};
use alloc::vec::Vec;
use crate::unicode;
// This module contains an *internal* implementation of interval sets.
//
// The primary invariant that interval sets guards is canonical ordering. That
// is, every interval set contains an ordered sequence of intervals where
// no two intervals are overlapping or adjacent. While this invariant is
// occasionally broken within the implementation, it should be impossible for
// callers to observe it.
//
// Since case folding (as implemented below) breaks that invariant, we roll
// that into this API even though it is a little out of place in an otherwise
// generic interval set. (Hence the reason why the `unicode` module is imported
// here.)
//
// Some of the implementation complexity here is a result of me wanting to
// preserve the sequential representation without using additional memory.
// In many cases, we do use linear extra memory, but it is at most 2x and it
// is amortized. If we relaxed the memory requirements, this implementation
// could become much simpler. The extra memory is honestly probably OK, but
// character classes (especially of the Unicode variety) can become quite
// large, and it would be nice to keep regex compilation snappy even in debug
// builds. (In the past, I have been careless with this area of code and it has
// caused slow regex compilations in debug mode, so this isn't entirely
// unwarranted.)
//
// Tests on this are relegated to the public API of HIR in src/hir.rs.
/// An internal set of non-overlapping, canonically ordered intervals.
#[derive(Clone, Debug)]
pub struct IntervalSet<I> {
    /// A sorted set of non-overlapping ranges.
    ranges: Vec<I>,
    /// While not required at all for correctness, we keep track of whether an
    /// interval set has been case folded or not. This helps us avoid doing
    /// redundant work if, for example, a set has already been case folded.
    /// And note that whether a set is folded or not is preserved through
    /// all of the pairwise set operations. That is, if both interval sets
    /// have been case folded, then any of difference, union, intersection or
    /// symmetric difference all produce a case folded set.
    ///
    /// Note that when this is true, it *must* be the case that the set is case
    /// folded. But when it's false, the set *may* be case folded. In other
    /// words, we only set this to true when we know it to be the case, but
    /// we're okay with it being false if it would otherwise be costly to
    /// determine whether it should be true. This means code cannot assume that
    /// a false value necessarily indicates that the set is not case folded.
    ///
    /// Bottom line: this is a performance optimization.
    folded: bool,
}
impl<I: Interval> Eq for IntervalSet<I> {}

// We implement PartialEq manually so that we don't consider the set's
// internal 'folded' property to be part of its identity. The 'folded'
// property is strictly an optimization.
impl<I: Interval> PartialEq for IntervalSet<I> {
    fn eq(&self, other: &IntervalSet<I>) -> bool {
        self.ranges == other.ranges
    }
}
impl<I: Interval> IntervalSet<I> {
    /// Create a new set from a sequence of intervals. Each interval is
    /// specified as a pair of bounds, where both bounds are inclusive.
    ///
    /// The given ranges do not need to be in any specific order, and ranges
    /// may overlap.
    pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
        let ranges: Vec<I> = intervals.into_iter().collect();
        // An empty set is case folded.
        let folded = ranges.is_empty();
        let mut set = IntervalSet { ranges, folded };
        set.canonicalize();
        set
    }
    /// Add a new interval to this set.
    pub fn push(&mut self, interval: I) {
        // TODO: This could be faster. e.g., Push the interval such that
        // it preserves canonicalization.
        self.ranges.push(interval);
        self.canonicalize();
        // We don't know whether the new interval added here is considered
        // case folded, so we conservatively assume that the entire set is
        // no longer case folded if it was previously.
        self.folded = false;
    }
    /// Return an iterator over all intervals in this set.
    ///
    /// The iterator yields intervals in ascending order.
    pub fn iter(&self) -> IntervalSetIter<'_, I> {
        IntervalSetIter(self.ranges.iter())
    }
    /// Return an immutable slice of intervals in this set.
    ///
    /// The sequence returned is in canonical ordering.
    pub fn intervals(&self) -> &[I] {
        &self.ranges
    }
    /// Expand this interval set such that it contains all case folded
    /// characters. For example, if this class consists of the range `a-z`,
    /// then applying case folding will result in the class containing both the
    /// ranges `a-z` and `A-Z`.
    ///
    /// This returns an error if the necessary case mapping data is not
    /// available.
    pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
        if self.folded {
            return Ok(());
        }
        // Fold only the original ranges; anything appended by
        // `case_fold_simple` below lives past index `len`.
        let len = self.ranges.len();
        for i in 0..len {
            let range = self.ranges[i];
            if let Err(err) = range.case_fold_simple(&mut self.ranges) {
                // Restore the canonical invariant even on failure, since some
                // folded ranges may already have been appended.
                self.canonicalize();
                return Err(err);
            }
        }
        self.canonicalize();
        self.folded = true;
        Ok(())
    }
    /// Union this set with the given set, in place.
    pub fn union(&mut self, other: &IntervalSet<I>) {
        if other.ranges.is_empty() || self.ranges == other.ranges {
            return;
        }
        // This could almost certainly be done more efficiently.
        self.ranges.extend(&other.ranges);
        self.canonicalize();
        self.folded = self.folded && other.folded;
    }
    /// Intersect this set with the given set, in place.
    pub fn intersect(&mut self, other: &IntervalSet<I>) {
        if self.ranges.is_empty() {
            return;
        }
        if other.ranges.is_empty() {
            self.ranges.clear();
            // An empty set is case folded.
            self.folded = true;
            return;
        }
        // There should be a way to do this in-place with constant memory,
        // but I couldn't figure out a simple way to do it. So just append
        // the intersection to the end of this range, and then drain it before
        // we're done.
        let drain_end = self.ranges.len();
        let mut ita = 0..drain_end;
        let mut itb = 0..other.ranges.len();
        let mut a = ita.next().unwrap();
        let mut b = itb.next().unwrap();
        loop {
            if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
                self.ranges.push(ab);
            }
            // Advance whichever range ends first; only it can stop
            // intersecting with subsequent ranges on the other side.
            let (it, aorb) =
                if self.ranges[a].upper() < other.ranges[b].upper() {
                    (&mut ita, &mut a)
                } else {
                    (&mut itb, &mut b)
                };
            match it.next() {
                Some(v) => *aorb = v,
                None => break,
            }
        }
        self.ranges.drain(..drain_end);
        self.folded = self.folded && other.folded;
    }
    /// Subtract the given set from this set, in place.
    pub fn difference(&mut self, other: &IntervalSet<I>) {
        if self.ranges.is_empty() || other.ranges.is_empty() {
            return;
        }
        // This algorithm is (to me) surprisingly complex. A search of the
        // interwebs indicate that this is a potentially interesting problem.
        // Folks seem to suggest interval or segment trees, but I'd like to
        // avoid the overhead (both runtime and conceptual) of that.
        //
        // The following is basically my Shitty First Draft. Therefore, in
        // order to grok it, you probably need to read each line carefully.
        // Simplifications are most welcome!
        //
        // Remember, we can assume the canonical format invariant here, which
        // says that all ranges are sorted, not overlapping and not adjacent in
        // each class.
        let drain_end = self.ranges.len();
        let (mut a, mut b) = (0, 0);
        'LOOP: while a < drain_end && b < other.ranges.len() {
            // Basically, the easy cases are when neither range overlaps with
            // each other. If the `b` range is less than our current `a`
            // range, then we can skip it and move on.
            if other.ranges[b].upper() < self.ranges[a].lower() {
                b += 1;
                continue;
            }
            // ... similarly for the `a` range. If it's less than the smallest
            // `b` range, then we can add it as-is.
            if self.ranges[a].upper() < other.ranges[b].lower() {
                let range = self.ranges[a];
                self.ranges.push(range);
                a += 1;
                continue;
            }
            // Otherwise, we have overlapping ranges.
            assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
            // This part is tricky and was non-obvious to me without looking
            // at explicit examples (see the tests). The trickiness stems from
            // two things: 1) subtracting a range from another range could
            // yield two ranges and 2) after subtracting a range, it's possible
            // that future ranges can have an impact. The loop below advances
            // the `b` ranges until they can't possible impact the current
            // range.
            //
            // For example, if our `a` range is `a-t` and our next three `b`
            // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
            // subtraction three times before moving on to the next `a` range.
            let mut range = self.ranges[a];
            while b < other.ranges.len()
                && !range.is_intersection_empty(&other.ranges[b])
            {
                let old_range = range;
                range = match range.difference(&other.ranges[b]) {
                    (None, None) => {
                        // We lost the entire range, so move on to the next
                        // without adding this one.
                        a += 1;
                        continue 'LOOP;
                    }
                    (Some(range1), None) | (None, Some(range1)) => range1,
                    (Some(range1), Some(range2)) => {
                        self.ranges.push(range1);
                        range2
                    }
                };
                // It's possible that the `b` range has more to contribute
                // here. In particular, if it is greater than the original
                // range, then it might impact the next `a` range *and* it
                // has impacted the current `a` range as much as possible,
                // so we can quit. We don't bump `b` so that the next `a`
                // range can apply it.
                if other.ranges[b].upper() > old_range.upper() {
                    break;
                }
                // Otherwise, the next `b` range might apply to the current
                // `a` range.
                b += 1;
            }
            self.ranges.push(range);
            a += 1;
        }
        // Any remaining `a` ranges are past every `b` range and survive
        // subtraction unchanged.
        while a < drain_end {
            let range = self.ranges[a];
            self.ranges.push(range);
            a += 1;
        }
        self.ranges.drain(..drain_end);
        self.folded = self.folded && other.folded;
    }
    /// Compute the symmetric difference of the two sets, in place.
    ///
    /// This computes the symmetric difference of two interval sets. This
    /// removes all elements in this set that are also in the given set,
    /// but also adds all elements from the given set that aren't in this
    /// set. That is, the set will contain all elements in either set,
    /// but will not contain any elements that are in both sets.
    pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
        // TODO(burntsushi): Fix this so that it amortizes allocation.
        let mut intersection = self.clone();
        intersection.intersect(other);
        self.union(other);
        self.difference(&intersection);
    }
    /// Negate this interval set.
    ///
    /// For all `x` where `x` is any element, if `x` was in this set, then it
    /// will not be in this set after negation.
    pub fn negate(&mut self) {
        if self.ranges.is_empty() {
            let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
            self.ranges.push(I::create(min, max));
            // The set containing everything must be case folded.
            self.folded = true;
            return;
        }
        // There should be a way to do this in-place with constant memory,
        // but I couldn't figure out a simple way to do it. So just append
        // the negation to the end of this range, and then drain it before
        // we're done.
        let drain_end = self.ranges.len();
        // We do checked arithmetic below because of the canonical ordering
        // invariant.
        if self.ranges[0].lower() > I::Bound::min_value() {
            let upper = self.ranges[0].lower().decrement();
            self.ranges.push(I::create(I::Bound::min_value(), upper));
        }
        // Each gap between consecutive ranges becomes a range in the negation.
        for i in 1..drain_end {
            let lower = self.ranges[i - 1].upper().increment();
            let upper = self.ranges[i].lower().decrement();
            self.ranges.push(I::create(lower, upper));
        }
        if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
            let lower = self.ranges[drain_end - 1].upper().increment();
            self.ranges.push(I::create(lower, I::Bound::max_value()));
        }
        self.ranges.drain(..drain_end);
        // We don't need to update whether this set is folded or not, because
        // it is conservatively preserved through negation. Namely, if a set
        // is not folded, then it is possible that its negation is folded, for
        // example, [^☃]. But we're fine with assuming that the set is not
        // folded in that case. (`folded` permits false negatives but not false
        // positives.)
        //
        // But what about when a set is folded, is its negation also
        // necessarily folded? Yes. Because if a set is folded, then for every
        // character in the set, it necessarily included its equivalence class
        // of case folded characters. Negating it in turn means that all
        // equivalence classes in the set are negated, and any equivalence
        // class that was previously not in the set is now entirely in the set.
    }
    /// Converts this set into a canonical ordering.
    fn canonicalize(&mut self) {
        if self.is_canonical() {
            return;
        }
        self.ranges.sort();
        assert!(!self.ranges.is_empty());
        // Is there a way to do this in-place with constant memory? I couldn't
        // figure out a way to do it. So just append the canonicalization to
        // the end of this range, and then drain it before we're done.
        let drain_end = self.ranges.len();
        for oldi in 0..drain_end {
            // If we've added at least one new range, then check if we can
            // merge this range in the previously added range.
            if self.ranges.len() > drain_end {
                let (last, rest) = self.ranges.split_last_mut().unwrap();
                if let Some(union) = last.union(&rest[oldi]) {
                    *last = union;
                    continue;
                }
            }
            let range = self.ranges[oldi];
            self.ranges.push(range);
        }
        self.ranges.drain(..drain_end);
    }
    /// Returns true if and only if this class is in a canonical ordering.
    fn is_canonical(&self) -> bool {
        for pair in self.ranges.windows(2) {
            // Canonical form requires strictly ascending, non-contiguous
            // (i.e., neither overlapping nor adjacent) ranges.
            if pair[0] >= pair[1] {
                return false;
            }
            if pair[0].is_contiguous(&pair[1]) {
                return false;
            }
        }
        true
    }
}
/// An iterator over intervals.
#[derive(Debug)]
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);

impl<'a, I> Iterator for IntervalSetIter<'a, I> {
    type Item = &'a I;

    fn next(&mut self) -> Option<&'a I> {
        self.0.next()
    }

    /// Forward the underlying slice iterator's exact size hint so that
    /// consumers (e.g. `collect`) can preallocate.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}
/// An interval with inclusive lower and upper bounds, plus the set-algebra
/// primitives needed by `IntervalSet`.
pub trait Interval:
    Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
{
    /// The scalar type of this interval's endpoints.
    type Bound: Bound;
    /// The inclusive lower bound.
    fn lower(&self) -> Self::Bound;
    /// The inclusive upper bound.
    fn upper(&self) -> Self::Bound;
    /// Set the inclusive lower bound.
    fn set_lower(&mut self, bound: Self::Bound);
    /// Set the inclusive upper bound.
    fn set_upper(&mut self, bound: Self::Bound);
    /// Append the simple case folded equivalents of this interval to
    /// `intervals`, or error if the case mapping data is unavailable.
    fn case_fold_simple(
        &self,
        intervals: &mut Vec<Self>,
    ) -> Result<(), unicode::CaseFoldError>;
    /// Create a new interval.
    fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
        let mut int = Self::default();
        // Normalize swapped bounds so the interval is always non-empty.
        if lower <= upper {
            int.set_lower(lower);
            int.set_upper(upper);
        } else {
            int.set_lower(upper);
            int.set_upper(lower);
        }
        int
    }
    /// Union the given overlapping range into this range.
    ///
    /// If the two ranges aren't contiguous, then this returns `None`.
    fn union(&self, other: &Self) -> Option<Self> {
        if !self.is_contiguous(other) {
            return None;
        }
        let lower = cmp::min(self.lower(), other.lower());
        let upper = cmp::max(self.upper(), other.upper());
        Some(Self::create(lower, upper))
    }
    /// Intersect this range with the given range and return the result.
    ///
    /// If the intersection is empty, then this returns `None`.
    fn intersect(&self, other: &Self) -> Option<Self> {
        let lower = cmp::max(self.lower(), other.lower());
        let upper = cmp::min(self.upper(), other.upper());
        if lower <= upper {
            Some(Self::create(lower, upper))
        } else {
            None
        }
    }
    /// Subtract the given range from this range and return the resulting
    /// ranges.
    ///
    /// If subtraction would result in an empty range, then no ranges are
    /// returned.
    fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
        if self.is_subset(other) {
            return (None, None);
        }
        if self.is_intersection_empty(other) {
            return (Some(self.clone()), None);
        }
        // A piece survives below `other` and/or above `other`.
        let add_lower = other.lower() > self.lower();
        let add_upper = other.upper() < self.upper();
        // We know this because !self.is_subset(other) and the ranges have
        // a non-empty intersection.
        assert!(add_lower || add_upper);
        let mut ret = (None, None);
        if add_lower {
            let upper = other.lower().decrement();
            ret.0 = Some(Self::create(self.lower(), upper));
        }
        if add_upper {
            let lower = other.upper().increment();
            let range = Self::create(lower, self.upper());
            if ret.0.is_none() {
                ret.0 = Some(range);
            } else {
                ret.1 = Some(range);
            }
        }
        ret
    }
    /// Returns true if and only if the two ranges are contiguous. Two ranges
    /// are contiguous if and only if the ranges are either overlapping or
    /// adjacent.
    fn is_contiguous(&self, other: &Self) -> bool {
        let lower1 = self.lower().as_u32();
        let upper1 = self.upper().as_u32();
        let lower2 = other.lower().as_u32();
        let upper2 = other.upper().as_u32();
        // `saturating_add(1)` makes adjacency (upper + 1 == lower) count.
        cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
    }
    /// Returns true if and only if the intersection of this range and the
    /// other range is empty.
    fn is_intersection_empty(&self, other: &Self) -> bool {
        let (lower1, upper1) = (self.lower(), self.upper());
        let (lower2, upper2) = (other.lower(), other.upper());
        cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
    }
    /// Returns true if and only if this range is a subset of the other range.
    fn is_subset(&self, other: &Self) -> bool {
        let (lower1, upper1) = (self.lower(), self.upper());
        let (lower2, upper2) = (other.lower(), other.upper());
        (lower2 <= lower1 && lower1 <= upper2)
            && (lower2 <= upper1 && upper1 <= upper2)
    }
}
/// A scalar endpoint type usable as an interval bound.
pub trait Bound:
    Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
{
    /// The smallest representable value of this type.
    fn min_value() -> Self;
    /// The largest representable value of this type.
    fn max_value() -> Self;
    /// This bound as a `u32` scalar (used for adjacency arithmetic).
    fn as_u32(self) -> u32;
    /// The next value up. Both impls below panic when called on
    /// `max_value()`.
    fn increment(self) -> Self;
    /// The next value down. Both impls below panic when called on
    /// `min_value()`.
    fn decrement(self) -> Self;
}
impl Bound for u8 {
    fn min_value() -> Self {
        0
    }

    fn max_value() -> Self {
        0xFF
    }

    fn as_u32(self) -> u32 {
        self.into()
    }

    /// Panics on overflow rather than wrapping.
    fn increment(self) -> Self {
        self.checked_add(1).unwrap()
    }

    /// Panics on underflow rather than wrapping.
    fn decrement(self) -> Self {
        self.checked_sub(1).unwrap()
    }
}
impl Bound for char {
    fn min_value() -> Self {
        '\x00'
    }

    fn max_value() -> Self {
        '\u{10FFFF}'
    }

    fn as_u32(self) -> u32 {
        u32::from(self)
    }

    fn increment(self) -> Self {
        if self == '\u{D7FF}' {
            // Hop over the surrogate range, which is not valid `char`.
            '\u{E000}'
        } else {
            char::from_u32(u32::from(self).checked_add(1).unwrap()).unwrap()
        }
    }

    fn decrement(self) -> Self {
        if self == '\u{E000}' {
            // Hop back over the surrogate range.
            '\u{D7FF}'
        } else {
            char::from_u32(u32::from(self).checked_sub(1).unwrap()).unwrap()
        }
    }
}
// Tests for interval sets are written in src/hir.rs against the public API.

3214
vendor/regex-syntax/src/hir/literal.rs vendored Normal file

File diff suppressed because it is too large Load Diff

3873
vendor/regex-syntax/src/hir/mod.rs vendored Normal file

File diff suppressed because it is too large Load Diff

608
vendor/regex-syntax/src/hir/print.rs vendored Normal file
View File

@@ -0,0 +1,608 @@
/*!
This module provides a regular expression printer for `Hir`.
*/
use core::fmt;
use crate::{
hir::{
self,
visitor::{self, Visitor},
Hir, HirKind,
},
is_meta_character,
};
/// A builder for constructing a printer.
///
/// Note that since a printer doesn't have any configuration knobs, this type
/// remains unexported.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    _priv: (),
}

impl Default for PrinterBuilder {
    fn default() -> PrinterBuilder {
        Self::new()
    }
}

impl PrinterBuilder {
    /// Create a builder with the (empty) default configuration.
    fn new() -> PrinterBuilder {
        Self { _priv: () }
    }

    /// Construct the printer described by this builder.
    fn build(&self) -> Printer {
        Printer { _priv: () }
    }
}
/// A printer for a regular expression's high-level intermediate
/// representation.
///
/// A printer converts a high-level intermediate representation (HIR) to a
/// regular expression pattern string. This particular printer uses constant
/// stack space and heap space proportional to the size of the HIR.
///
/// Since this printer is only using the HIR, the pattern it prints will likely
/// not resemble the original pattern at all. For example, a pattern like
/// `\pL` will have its entire class written out.
///
/// The purpose of this printer is to provide a means to mutate an HIR and then
/// build a regular expression from the result of that mutation. (A regex
/// library could provide a constructor from this HIR explicitly, but that
/// creates an unnecessary public coupling between the regex library and this
/// specific HIR representation.)
#[derive(Debug)]
pub struct Printer {
    // Prevents construction outside of `Printer::new`.
    _priv: (),
}
impl Printer {
    /// Create a new printer.
    pub fn new() -> Printer {
        PrinterBuilder::new().build()
    }
    /// Print the given `Hir` to the given writer. The writer must implement
    /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
    /// here are a `fmt::Formatter` (which is available in `fmt::Display`
    /// implementations) or a `&mut String`.
    pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
        // Delegate the traversal to the visitor framework; `Writer` emits
        // concrete syntax as nodes are visited.
        visitor::visit(hir, Writer { wtr })
    }
}
/// A visitor that writes concrete regex syntax to the wrapped `fmt::Write`
/// implementation as it walks the HIR.
#[derive(Debug)]
struct Writer<W> {
    // The destination for the printed pattern.
    wtr: W,
}
impl<W: fmt::Write> Visitor for Writer<W> {
    type Output = ();
    type Err = fmt::Error;
    // Nothing to assemble at the end: output is written incrementally as the
    // HIR is traversed.
    fn finish(self) -> fmt::Result {
        Ok(())
    }
    // Emits everything that must appear *before* a node's children: opening
    // parens for groups/concats/alts, and the full text of leaf nodes.
    fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
        match *hir.kind() {
            HirKind::Empty => {
                // Technically an empty sub-expression could be "printed" by
                // just ignoring it, but in practice, you could have a
                // repetition operator attached to an empty expression, and you
                // really need something in the concrete syntax to make that
                // work as you'd expect.
                self.wtr.write_str(r"(?:)")?;
            }
            // Repetition operators are strictly suffix oriented.
            HirKind::Repetition(_) => {}
            HirKind::Literal(hir::Literal(ref bytes)) => {
                // See the comment on the 'Concat' and 'Alternation' case below
                // for why we put parens here. Literals are, conceptually,
                // a special case of concatenation where each element is a
                // character. The HIR flattens this into a Box<[u8]>, but we
                // still need to treat it like a concatenation for correct
                // printing. As a special case, we don't write parens if there
                // is only one character. One character means there is no
                // concat so we don't need parens. Adding parens would still be
                // correct, but we drop them here because it tends to create
                // rather noisy regexes even in simple cases.
                let result = core::str::from_utf8(bytes);
                // When the bytes are valid UTF-8, "length" means chars, not
                // bytes, so that a single multi-byte char gets no parens.
                let len = result.map_or(bytes.len(), |s| s.chars().count());
                if len > 1 {
                    self.wtr.write_str(r"(?:")?;
                }
                match result {
                    Ok(string) => {
                        for c in string.chars() {
                            self.write_literal_char(c)?;
                        }
                    }
                    Err(_) => {
                        // Not valid UTF-8: fall back to byte-at-a-time
                        // printing with hex escapes where needed.
                        for &b in bytes.iter() {
                            self.write_literal_byte(b)?;
                        }
                    }
                }
                if len > 1 {
                    self.wtr.write_str(r")")?;
                }
            }
            HirKind::Class(hir::Class::Unicode(ref cls)) => {
                // An empty class matches nothing. There is no dedicated
                // syntax for that, so print the intersection of two disjoint
                // singleton classes, which is equivalent.
                if cls.ranges().is_empty() {
                    return self.wtr.write_str("[a&&b]");
                }
                self.wtr.write_str("[")?;
                for range in cls.iter() {
                    if range.start() == range.end() {
                        self.write_literal_char(range.start())?;
                    } else if u32::from(range.start()) + 1
                        == u32::from(range.end())
                    {
                        // A two-codepoint range is printed as two adjacent
                        // literals rather than an 'a-b' range.
                        self.write_literal_char(range.start())?;
                        self.write_literal_char(range.end())?;
                    } else {
                        self.write_literal_char(range.start())?;
                        self.wtr.write_str("-")?;
                        self.write_literal_char(range.end())?;
                    }
                }
                self.wtr.write_str("]")?;
            }
            HirKind::Class(hir::Class::Bytes(ref cls)) => {
                if cls.ranges().is_empty() {
                    return self.wtr.write_str("[a&&b]");
                }
                // Byte-oriented classes are only valid with Unicode mode
                // disabled, hence the enclosing '(?-u:...)' group.
                self.wtr.write_str("(?-u:[")?;
                for range in cls.iter() {
                    if range.start() == range.end() {
                        self.write_literal_class_byte(range.start())?;
                    } else if range.start() + 1 == range.end() {
                        self.write_literal_class_byte(range.start())?;
                        self.write_literal_class_byte(range.end())?;
                    } else {
                        self.write_literal_class_byte(range.start())?;
                        self.wtr.write_str("-")?;
                        self.write_literal_class_byte(range.end())?;
                    }
                }
                self.wtr.write_str("])")?;
            }
            // Look-around assertions are leaves; each variant has a fixed
            // concrete spelling.
            HirKind::Look(ref look) => match *look {
                hir::Look::Start => {
                    self.wtr.write_str(r"\A")?;
                }
                hir::Look::End => {
                    self.wtr.write_str(r"\z")?;
                }
                hir::Look::StartLF => {
                    self.wtr.write_str("(?m:^)")?;
                }
                hir::Look::EndLF => {
                    self.wtr.write_str("(?m:$)")?;
                }
                hir::Look::StartCRLF => {
                    self.wtr.write_str("(?mR:^)")?;
                }
                hir::Look::EndCRLF => {
                    self.wtr.write_str("(?mR:$)")?;
                }
                hir::Look::WordAscii => {
                    self.wtr.write_str(r"(?-u:\b)")?;
                }
                hir::Look::WordAsciiNegate => {
                    self.wtr.write_str(r"(?-u:\B)")?;
                }
                hir::Look::WordUnicode => {
                    self.wtr.write_str(r"\b")?;
                }
                hir::Look::WordUnicodeNegate => {
                    self.wtr.write_str(r"\B")?;
                }
                hir::Look::WordStartAscii => {
                    self.wtr.write_str(r"(?-u:\b{start})")?;
                }
                hir::Look::WordEndAscii => {
                    self.wtr.write_str(r"(?-u:\b{end})")?;
                }
                hir::Look::WordStartUnicode => {
                    self.wtr.write_str(r"\b{start}")?;
                }
                hir::Look::WordEndUnicode => {
                    self.wtr.write_str(r"\b{end}")?;
                }
                hir::Look::WordStartHalfAscii => {
                    self.wtr.write_str(r"(?-u:\b{start-half})")?;
                }
                hir::Look::WordEndHalfAscii => {
                    self.wtr.write_str(r"(?-u:\b{end-half})")?;
                }
                hir::Look::WordStartHalfUnicode => {
                    self.wtr.write_str(r"\b{start-half}")?;
                }
                hir::Look::WordEndHalfUnicode => {
                    self.wtr.write_str(r"\b{end-half}")?;
                }
            },
            HirKind::Capture(hir::Capture { ref name, .. }) => {
                self.wtr.write_str("(")?;
                if let Some(ref name) = *name {
                    write!(self.wtr, "?P<{}>", name)?;
                }
            }
            // Why do this? Wrapping concats and alts in non-capturing groups
            // is not *always* necessary, but is sometimes necessary. For
            // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)'
            // and not 'ab|c'. The former is clearly the intended meaning, but
            // the latter is actually 'alt(concat(a, b), c)'.
            //
            // It would be possible to only group these things in cases where
            // it's strictly necessary, but it requires knowing the parent
            // expression. And since this technique is simpler and always
            // correct, we take this route. More to the point, it is a non-goal
            // of an HIR printer to show a nice easy-to-read regex. Indeed,
            // its construction forbids it from doing so. Therefore, inserting
            // extra groups where they aren't necessary is perfectly okay.
            HirKind::Concat(_) | HirKind::Alternation(_) => {
                self.wtr.write_str(r"(?:")?;
            }
        }
        Ok(())
    }
    // Emits everything that must appear *after* a node's children: closing
    // parens and suffix repetition operators.
    fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
        match *hir.kind() {
            // Handled during visit_pre
            HirKind::Empty
            | HirKind::Literal(_)
            | HirKind::Class(_)
            | HirKind::Look(_) => {}
            HirKind::Repetition(ref x) => {
                match (x.min, x.max) {
                    (0, Some(1)) => {
                        self.wtr.write_str("?")?;
                    }
                    (0, None) => {
                        self.wtr.write_str("*")?;
                    }
                    (1, None) => {
                        self.wtr.write_str("+")?;
                    }
                    (1, Some(1)) => {
                        // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'.
                        return Ok(());
                    }
                    (m, None) => {
                        write!(self.wtr, "{{{},}}", m)?;
                    }
                    (m, Some(n)) if m == n => {
                        write!(self.wtr, "{{{}}}", m)?;
                        // a{m} and a{m}? are always exactly equivalent.
                        return Ok(());
                    }
                    (m, Some(n)) => {
                        write!(self.wtr, "{{{},{}}}", m, n)?;
                    }
                }
                // The early returns above deliberately skip this: for those
                // forms, greedy and non-greedy are indistinguishable.
                if !x.greedy {
                    self.wtr.write_str("?")?;
                }
            }
            HirKind::Capture(_)
            | HirKind::Concat(_)
            | HirKind::Alternation(_) => {
                self.wtr.write_str(r")")?;
            }
        }
        Ok(())
    }
    fn visit_alternation_in(&mut self) -> fmt::Result {
        self.wtr.write_str("|")
    }
}
impl<W: fmt::Write> Writer<W> {
    /// Write `c`, preceded by a backslash whenever it is a regex meta
    /// character.
    fn write_literal_char(&mut self, c: char) -> fmt::Result {
        if is_meta_character(c) {
            self.wtr.write_str("\\")?;
        }
        self.wtr.write_char(c)
    }

    /// Write a single literal byte. Printable non-whitespace ASCII is
    /// emitted as a (possibly escaped) character; anything else becomes a
    /// hex escape wrapped in a group that disables Unicode mode.
    fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
        if Self::is_printable_ascii(b) {
            self.write_literal_char(char::from(b))
        } else {
            write!(self.wtr, "(?-u:\\x{:02X})", b)
        }
    }

    /// Like `write_literal_byte`, but for bytes inside a class, where the
    /// surrounding class syntax has already disabled Unicode mode.
    fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
        if Self::is_printable_ascii(b) {
            self.write_literal_char(char::from(b))
        } else {
            write!(self.wtr, "\\x{:02X}", b)
        }
    }

    /// True for ASCII bytes that are neither control nor whitespace.
    fn is_printable_ascii(b: u8) -> bool {
        b.is_ascii() && !b.is_ascii_control() && !b.is_ascii_whitespace()
    }
}
#[cfg(test)]
mod tests {
    use alloc::{
        boxed::Box,
        string::{String, ToString},
    };
    use crate::ParserBuilder;
    use super::*;
    // Parse `given`, print the resulting HIR, and assert it equals
    // `expected` (default parser configuration).
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }
    // Same as `roundtrip`, but with UTF-8 mode disabled so that
    // byte-oriented patterns are permitted.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.utf8(false), given, expected);
    }
    // Common driver: configure a parser via `f`, parse `given` into an HIR,
    // print it, re-parse the printed form to prove it's valid syntax, and
    // compare with `expected`.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let hir = builder.build().parse(given).unwrap();
        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&hir, &mut dst).unwrap();
        // Check that the result is actually valid.
        builder.build().parse(&dst).unwrap();
        assert_eq!(expected, dst);
    }
    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("", "");
    }
    #[test]
    fn print_class() {
        roundtrip(r"[a]", r"a");
        roundtrip(r"[ab]", r"[ab]");
        roundtrip(r"[a-z]", r"[a-z]");
        roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
        roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}");
        roundtrip(r"[-]", r"\-");
        roundtrip(r"[☃-⛄]", r"[☃-⛄]");
        roundtrip(r"(?-u)[a]", r"a");
        roundtrip(r"(?-u)[ab]", r"(?-u:[ab])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");
        // The following test that the printer escapes meta characters
        // in character classes.
        roundtrip(r"[\[]", r"\[");
        roundtrip(r"[Z-_]", r"[Z-_]");
        roundtrip(r"[Z-_--Z]", r"[\[-_]");
        // The following test that the printer escapes meta characters
        // in byte oriented character classes.
        roundtrip_bytes(r"(?-u)[\[]", r"\[");
        roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
        roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");
        // This tests that an empty character class is correctly roundtripped.
        #[cfg(feature = "unicode-gencat")]
        roundtrip(r"\P{any}", r"[a&&b]");
        roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]");
    }
    #[test]
    fn print_anchor() {
        roundtrip(r"^", r"\A");
        roundtrip(r"$", r"\z");
        roundtrip(r"(?m)^", r"(?m:^)");
        roundtrip(r"(?m)$", r"(?m:$)");
    }
    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }
    #[test]
    fn print_repetition() {
        roundtrip("a?", "a?");
        roundtrip("a??", "a??");
        roundtrip("(?U)a?", "a??");
        roundtrip("a*", "a*");
        roundtrip("a*?", "a*?");
        roundtrip("(?U)a*", "a*?");
        roundtrip("a+", "a+");
        roundtrip("a+?", "a+?");
        roundtrip("(?U)a+", "a+?");
        roundtrip("a{1}", "a");
        roundtrip("a{2}", "a{2}");
        roundtrip("a{1,}", "a+");
        roundtrip("a{1,5}", "a{1,5}");
        roundtrip("a{1}?", "a");
        roundtrip("a{2}?", "a{2}");
        roundtrip("a{1,}?", "a+?");
        roundtrip("a{1,5}?", "a{1,5}?");
        roundtrip("(?U)a{1}", "a");
        roundtrip("(?U)a{2}", "a{2}");
        roundtrip("(?U)a{1,}", "a+?");
        roundtrip("(?U)a{1,5}", "a{1,5}?");
        // Test that various zero-length repetitions always translate to an
        // empty regex. This is more a property of HIR's smart constructors
        // than the printer though.
        roundtrip("a{0}", "(?:)");
        roundtrip("(?:ab){0}", "(?:)");
        #[cfg(feature = "unicode-gencat")]
        {
            roundtrip(r"\p{any}{0}", "(?:)");
            roundtrip(r"\P{any}{0}", "(?:)");
        }
    }
    #[test]
    fn print_group() {
        roundtrip("()", "((?:))");
        roundtrip("(?P<foo>)", "(?P<foo>(?:))");
        roundtrip("(?:)", "(?:)");
        roundtrip("(a)", "(a)");
        roundtrip("(?P<foo>a)", "(?P<foo>a)");
        roundtrip("(?:a)", "a");
        roundtrip("((((a))))", "((((a))))");
    }
    #[test]
    fn print_alternation() {
        roundtrip("|", "(?:(?:)|(?:))");
        roundtrip("||", "(?:(?:)|(?:)|(?:))");
        roundtrip("a|b", "[ab]");
        roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
        roundtrip("a|b|c", "[a-c]");
        roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))");
        roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))");
    }
    // This is a regression test that stresses a peculiarity of how the HIR
    // is both constructed and printed. Namely, it is legal for a repetition
    // to directly contain a concatenation. This particular construct isn't
    // really possible to build from the concrete syntax directly, since you'd
    // be forced to put the concatenation into (at least) a non-capturing
    // group. Concurrently, the printer doesn't consider this case and just
    // kind of naively prints the child expression and tacks on the repetition
    // operator.
    //
    // As a result, if you attached '+' to a 'concat(a, b)', the printer gives
    // you 'ab+', but clearly it really should be '(?:ab)+'.
    //
    // This bug isn't easy to surface because most ways of building an HIR
    // come directly from the concrete syntax, and as mentioned above, it just
    // isn't possible to build this kind of HIR from the concrete syntax.
    // Nevertheless, this is definitely a bug.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("x".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::literal("ab".as_bytes())),
            }),
            Hir::literal("y".as_bytes()),
        ]);
        assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());
        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::concat(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A\A\z\z)", expr.to_string());
    }
    // Just like regression_repetition_concat, but with the repetition using
    // an alternation as a child expression instead.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_alternation() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::literal("cd".as_bytes()),
                    Hir::literal("ef".as_bytes()),
                ])),
            }),
            Hir::literal("gh".as_bytes()),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string());
        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string());
    }
    // This regression test is very similar in flavor to
    // regression_repetition_concat in that the root of the issue lies in a
    // peculiarity of how the HIR is represented and how the printer writes it
    // out. Like the other regression, this one is also rooted in the fact that
    // you can't produce the peculiar HIR from the concrete syntax. Namely, you
    // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally
    // be in (at least) a non-capturing group. Why? Because the '|' has very
    // low precedence (lower than concatenation), and so something like 'ab|c'
    // is actually 'alt(ab, c)'.
    //
    // See: https://github.com/rust-lang/regex/issues/516
    #[test]
    fn regression_alternation_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::alternation(alloc::vec![
                Hir::literal("mn".as_bytes()),
                Hir::literal("xy".as_bytes()),
            ]),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string());
        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::alternation(alloc::vec![
                Hir::look(hir::Look::Start),
                Hir::look(hir::Look::End),
            ]),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string());
    }
}

3744
vendor/regex-syntax/src/hir/translate.rs vendored Normal file

File diff suppressed because it is too large Load Diff

215
vendor/regex-syntax/src/hir/visitor.rs vendored Normal file
View File

@@ -0,0 +1,215 @@
use alloc::{vec, vec::Vec};
use crate::hir::{self, Hir, HirKind};
/// A trait for visiting the high-level IR (HIR) in depth first order.
///
/// The principle aim of this trait is to enable callers to perform case
/// analysis on a high-level intermediate representation of a regular
/// expression without necessarily using recursion. In particular, this permits
/// callers to do case analysis with constant stack usage, which can be
/// important since the size of an HIR may be proportional to end user input.
///
/// Typical usage of this trait involves providing an implementation and then
/// running it using the [`visit`] function.
///
/// All methods except `finish` have no-op default implementations, so an
/// implementor only needs to override the hooks it cares about.
pub trait Visitor {
    /// The result of visiting an HIR.
    type Output;
    /// An error that visiting an HIR might return.
    type Err;
    /// All implementors of `Visitor` must provide a `finish` method, which
    /// yields the result of visiting the HIR or an error.
    fn finish(self) -> Result<Self::Output, Self::Err>;
    /// This method is called before beginning traversal of the HIR.
    fn start(&mut self) {}
    /// This method is called on an `Hir` before descending into child `Hir`
    /// nodes.
    fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
        Ok(())
    }
    /// This method is called on an `Hir` after descending all of its child
    /// `Hir` nodes.
    fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
        Ok(())
    }
    /// This method is called between child nodes of an alternation.
    fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
        Ok(())
    }
    /// This method is called between child nodes of a concatenation.
    fn visit_concat_in(&mut self) -> Result<(), Self::Err> {
        Ok(())
    }
}
/// Executes an implementation of `Visitor` in constant stack space.
///
/// Every node in the given `Hir` is visited, with the appropriate [`Visitor`]
/// methods invoked along the way.
///
/// The main reason to use this function is to perform case analysis over an
/// `Hir` without consuming stack proportional to its depth. Instead of the
/// call stack, it uses heap space proportional to the size of the `Hir`.
/// That trade-off matters when the size of the `Hir` is proportional to end
/// user input.
///
/// If the visitor returns an error at any point, then visiting is stopped and
/// the error is returned.
pub fn visit<V: Visitor>(hir: &Hir, visitor: V) -> Result<V::Output, V::Err> {
    let mut driver = HeapVisitor::new();
    driver.visit(hir, visitor)
}
/// HeapVisitor visits every item in an `Hir` recursively using constant stack
/// size and a heap size proportional to the size of the `Hir`.
struct HeapVisitor<'a> {
    /// A stack of `Hir` nodes. This is roughly analogous to the call stack
    /// used in a typical recursive visitor. Each entry pairs a node with the
    /// `Frame` recording how far traversal has progressed into its children.
    stack: Vec<(&'a Hir, Frame<'a>)>,
}
/// Represents a single stack frame while performing structural induction over
/// an `Hir`.
enum Frame<'a> {
    /// A stack frame allocated just before descending into a repetition
    /// operator's child node.
    Repetition(&'a hir::Repetition),
    /// A stack frame allocated just before descending into a capture's child
    /// node.
    Capture(&'a hir::Capture),
    /// The stack frame used while visiting every child node of a concatenation
    /// of expressions.
    Concat {
        /// The child node we are currently visiting.
        head: &'a Hir,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [Hir],
    },
    /// The stack frame used while visiting every child node of an alternation
    /// of expressions.
    Alternation {
        /// The child node we are currently visiting.
        head: &'a Hir,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [Hir],
    },
}
impl<'a> HeapVisitor<'a> {
    fn new() -> HeapVisitor<'a> {
        HeapVisitor { stack: vec![] }
    }
    // Drive the visitor over the whole `Hir`. The outer loop descends into
    // child nodes (pushing frames); the inner loop pops finished frames and
    // advances to the next sibling, if any.
    fn visit<V: Visitor>(
        &mut self,
        mut hir: &'a Hir,
        mut visitor: V,
    ) -> Result<V::Output, V::Err> {
        self.stack.clear();
        visitor.start();
        loop {
            visitor.visit_pre(hir)?;
            if let Some(x) = self.induct(hir) {
                let child = x.child();
                self.stack.push((hir, x));
                hir = child;
                continue;
            }
            // No induction means we have a base case, so we can post visit
            // it now.
            visitor.visit_post(hir)?;
            // At this point, we now try to pop our call stack until it is
            // either empty or we hit another inductive case.
            loop {
                let (post_hir, frame) = match self.stack.pop() {
                    None => return visitor.finish(),
                    Some((post_hir, frame)) => (post_hir, frame),
                };
                // If this is a concat/alternate, then we might have additional
                // inductive steps to process.
                if let Some(x) = self.pop(frame) {
                    match x {
                        Frame::Alternation { .. } => {
                            visitor.visit_alternation_in()?;
                        }
                        Frame::Concat { .. } => {
                            visitor.visit_concat_in()?;
                        }
                        _ => {}
                    }
                    hir = x.child();
                    self.stack.push((post_hir, x));
                    break;
                }
                // Otherwise, we've finished visiting all the child nodes for
                // this HIR, so we can post visit it now.
                visitor.visit_post(post_hir)?;
            }
        }
    }
    /// Build a stack frame for the given HIR if one is needed (which occurs if
    /// and only if there are child nodes in the HIR). Otherwise, return None.
    fn induct(&mut self, hir: &'a Hir) -> Option<Frame<'a>> {
        match *hir.kind() {
            HirKind::Repetition(ref x) => Some(Frame::Repetition(x)),
            HirKind::Capture(ref x) => Some(Frame::Capture(x)),
            // An empty concat/alternation has no children to visit, so it is
            // treated as a base case.
            HirKind::Concat(ref x) if x.is_empty() => None,
            HirKind::Concat(ref x) => {
                Some(Frame::Concat { head: &x[0], tail: &x[1..] })
            }
            HirKind::Alternation(ref x) if x.is_empty() => None,
            HirKind::Alternation(ref x) => {
                Some(Frame::Alternation { head: &x[0], tail: &x[1..] })
            }
            _ => None,
        }
    }
    /// Pops the given frame. If the frame has an additional inductive step,
    /// then return it, otherwise return `None`.
    fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
        match induct {
            Frame::Repetition(_) => None,
            Frame::Capture(_) => None,
            Frame::Concat { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(Frame::Concat { head: &tail[0], tail: &tail[1..] })
                }
            }
            Frame::Alternation { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(Frame::Alternation {
                        head: &tail[0],
                        tail: &tail[1..],
                    })
                }
            }
        }
    }
}
impl<'a> Frame<'a> {
    /// Perform the next inductive step on this frame and return the next
    /// child HIR node to visit.
    fn child(&self) -> &'a Hir {
        match *self {
            Frame::Repetition(rep) => &rep.sub,
            Frame::Capture(cap) => &cap.sub,
            Frame::Concat { head, .. }
            | Frame::Alternation { head, .. } => head,
        }
    }
}

431
vendor/regex-syntax/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,431 @@
/*!
This crate provides a robust regular expression parser.
This crate defines two primary types:
* [`Ast`](ast::Ast) is the abstract syntax of a regular expression.
An abstract syntax corresponds to a *structured representation* of the
concrete syntax of a regular expression, where the concrete syntax is the
pattern string itself (e.g., `foo(bar)+`). Given some abstract syntax, it
can be converted back to the original concrete syntax (modulo some details,
like whitespace). To a first approximation, the abstract syntax is complex
and difficult to analyze.
* [`Hir`](hir::Hir) is the high-level intermediate representation
("HIR" or "high-level IR" for short) of a regular expression. It corresponds to
an intermediate state of a regular expression that sits between the abstract
syntax and the low level compiled opcodes that are eventually responsible for
executing a regular expression search. Given some high-level IR, it is not
possible to produce the original concrete syntax (although it is possible to
produce an equivalent concrete syntax, but it will likely scarcely resemble
the original pattern). To a first approximation, the high-level IR is simple
and easy to analyze.
These two types come with conversion routines:
* An [`ast::parse::Parser`] converts concrete syntax (a `&str`) to an
[`Ast`](ast::Ast).
* A [`hir::translate::Translator`] converts an [`Ast`](ast::Ast) to a
[`Hir`](hir::Hir).
As a convenience, the above two conversion routines are combined into one via
the top-level [`Parser`] type. This `Parser` will first convert your pattern to
an `Ast` and then convert the `Ast` to an `Hir`. It's also exposed as top-level
[`parse`] free function.
# Example
This example shows how to parse a pattern string into its HIR:
```
use regex_syntax::{hir::Hir, parse};
let hir = parse("a|b")?;
assert_eq!(hir, Hir::alternation(vec![
Hir::literal("a".as_bytes()),
Hir::literal("b".as_bytes()),
]));
# Ok::<(), Box<dyn std::error::Error>>(())
```
# Concrete syntax supported
The concrete syntax is documented as part of the public API of the
[`regex` crate](https://docs.rs/regex/%2A/regex/#syntax).
# Input safety
A key feature of this library is that it is safe to use with end user facing
input. This plays a significant role in the internal implementation. In
particular:
1. Parsers provide a `nest_limit` option that permits callers to control how
deeply nested a regular expression is allowed to be. This makes it possible
to do case analysis over an `Ast` or an `Hir` using recursion without
worrying about stack overflow.
2. Since relying on a particular stack size is brittle, this crate goes to
great lengths to ensure that all interactions with both the `Ast` and the
`Hir` do not use recursion. Namely, they use constant stack space and heap
space proportional to the size of the original pattern string (in bytes).
This includes the type's corresponding destructors. (One exception to this
is literal extraction, but this will eventually get fixed.)
# Error reporting
The `Display` implementations on all `Error` types exposed in this library
provide nice human readable errors that are suitable for showing to end users
in a monospace font.
# Literal extraction
This crate provides limited support for [literal extraction from `Hir`
values](hir::literal). Be warned that literal extraction uses recursion, and
therefore, stack size proportional to the size of the `Hir`.
The purpose of literal extraction is to speed up searches. That is, if you
know a regular expression must match a prefix or suffix literal, then it is
often quicker to search for instances of that literal, and then confirm or deny
the match using the full regular expression engine. These optimizations are
done automatically in the `regex` crate.
# Crate features
An important feature provided by this crate is its Unicode support. This
includes things like case folding, boolean properties, general categories,
scripts and Unicode-aware support for the Perl classes `\w`, `\s` and `\d`.
However, a downside of this support is that it requires bundling several
Unicode data tables that are substantial in size.
A fair number of use cases do not require full Unicode support. For this
reason, this crate exposes a number of features to control which Unicode
data is available.
If a regular expression attempts to use a Unicode feature that is not available
because the corresponding crate feature was disabled, then translating that
regular expression to an `Hir` will return an error. (It is still possible
to construct an `Ast` for such a regular expression, since Unicode data is not
used until translation to an `Hir`.) Stated differently, enabling or disabling
any of the features below can only add or subtract from the total set of valid
regular expressions. Enabling or disabling a feature will never modify the
match semantics of a regular expression.
The following features are available:
* **std** -
Enables support for the standard library. This feature is enabled by default.
When disabled, only `core` and `alloc` are used. Otherwise, enabling `std`
generally just enables `std::error::Error` trait impls for the various error
types.
* **unicode** -
Enables all Unicode features. This feature is enabled by default, and will
always cover all Unicode features, even if more are added in the future.
* **unicode-age** -
Provide the data for the
[Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age).
This makes it possible to use classes like `\p{Age:6.0}` to refer to all
codepoints first introduced in Unicode 6.0.
* **unicode-bool** -
Provide the data for numerous Unicode boolean properties. The full list
is not included here, but contains properties like `Alphabetic`, `Emoji`,
`Lowercase`, `Math`, `Uppercase` and `White_Space`.
* **unicode-case** -
Provide the data for case insensitive matching using
[Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
* **unicode-gencat** -
Provide the data for
[Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
This includes, but is not limited to, `Decimal_Number`, `Letter`,
`Math_Symbol`, `Number` and `Punctuation`.
* **unicode-perl** -
Provide the data for supporting the Unicode-aware Perl character classes,
corresponding to `\w`, `\s` and `\d`. This is also necessary for using
Unicode-aware word boundary assertions. Note that if this feature is
disabled, the `\s` and `\d` character classes are still available if the
`unicode-bool` and `unicode-gencat` features are enabled, respectively.
* **unicode-script** -
Provide the data for
[Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/).
This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`,
`Latin` and `Thai`.
* **unicode-segment** -
Provide the data necessary to provide the properties used to implement the
[Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/).
This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and
`\p{sb=ATerm}`.
* **arbitrary** -
Enabling this feature introduces a public dependency on the
[`arbitrary`](https://crates.io/crates/arbitrary)
crate. Namely, it implements the `Arbitrary` trait from that crate for the
[`Ast`](crate::ast::Ast) type. This feature is disabled by default.
*/
#![no_std]
#![forbid(unsafe_code)]
#![deny(missing_docs, rustdoc::broken_intra_doc_links)]
#![warn(missing_debug_implementations)]
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
#[cfg(any(test, feature = "std"))]
extern crate std;
extern crate alloc;
pub use crate::{
error::Error,
parser::{parse, Parser, ParserBuilder},
unicode::UnicodeWordError,
};
use alloc::string::String;
pub mod ast;
mod debug;
mod either;
mod error;
pub mod hir;
mod parser;
mod rank;
mod unicode;
mod unicode_tables;
pub mod utf8;
/// Escapes all regular expression meta characters in `text`.
///
/// The returned string can safely be embedded as a literal in a regular
/// expression pattern.
pub fn escape(text: &str) -> String {
    let mut escaped = String::new();
    escape_into(text, &mut escaped);
    escaped
}
/// Escapes all meta characters in `text` and writes the result into `buf`.
///
/// The escaped characters are appended to the given buffer, and the appended
/// text is safe to use as a literal in a regular expression.
pub fn escape_into(text: &str, buf: &mut String) {
    // Reserving the input length covers the common case of few (or no) meta
    // characters; the worst case merely grows the buffer again.
    buf.reserve(text.len());
    text.chars().for_each(|ch| {
        if is_meta_character(ch) {
            buf.push('\\');
        }
        buf.push(ch);
    });
}
/// Returns true if the given character has significance in a regex.
///
/// Generally speaking, these are the only characters which _must_ be escaped
/// in order to match their literal meaning. For example, to match a literal
/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For
/// example, `-` is treated as a meta character because of its significance
/// for writing ranges inside of character classes, but the regex `-` will
/// match a literal `-` because `-` has no special meaning outside of character
/// classes.
///
/// In order to determine whether a character may be escaped at all, the
/// [`is_escapeable_character`] routine should be used. The difference between
/// `is_meta_character` and `is_escapeable_character` is that the latter will
/// return true for some characters that are _not_ meta characters. For
/// example, `%` and `\%` both match a literal `%` in all contexts. In other
/// words, `is_escapeable_character` includes "superfluous" escapes.
///
/// Note that the set of characters for which this function returns `true` or
/// `false` is fixed and won't change in a semver compatible release. (In this
/// case, "semver compatible release" actually refers to the `regex` crate
/// itself, since reducing or expanding the set of meta characters would be a
/// breaking change for not just `regex-syntax` but also `regex` itself.)
///
/// # Example
///
/// ```
/// use regex_syntax::is_meta_character;
///
/// assert!(is_meta_character('?'));
/// assert!(is_meta_character('-'));
/// assert!(is_meta_character('&'));
/// assert!(is_meta_character('#'));
///
/// assert!(!is_meta_character('%'));
/// assert!(!is_meta_character('/'));
/// assert!(!is_meta_character('!'));
/// assert!(!is_meta_character('"'));
/// assert!(!is_meta_character('e'));
/// ```
pub fn is_meta_character(c: char) -> bool {
    matches!(
        c,
        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{'
            | '}' | '^' | '$' | '#' | '&' | '-' | '~'
    )
}
/// Returns true if the given character can be escaped in a regex.
///
/// This returns true in all cases that `is_meta_character` returns true, but
/// also returns true in some cases where `is_meta_character` returns false.
/// For example, `%` is not a meta character, but it is escapable. That is,
/// `%` and `\%` both match a literal `%` in all contexts.
///
/// The purpose of this routine is to provide knowledge about what characters
/// may be escaped. Namely, most regex engines permit "superfluous" escapes
/// where characters without any special significance may be escaped even
/// though there is no actual _need_ to do so.
///
/// This will return false for some characters. For example, `e` is not
/// escapable. Therefore, `\e` will either result in a parse error (which is
/// true today), or it could backwards compatibly evolve into a new construct
/// with its own meaning. Indeed, that is the purpose of banning _some_
/// superfluous escapes: it provides a way to evolve the syntax in a compatible
/// manner.
///
/// # Example
///
/// ```
/// use regex_syntax::is_escapeable_character;
///
/// assert!(is_escapeable_character('?'));
/// assert!(is_escapeable_character('-'));
/// assert!(is_escapeable_character('&'));
/// assert!(is_escapeable_character('#'));
/// assert!(is_escapeable_character('%'));
/// assert!(is_escapeable_character('/'));
/// assert!(is_escapeable_character('!'));
/// assert!(is_escapeable_character('"'));
///
/// assert!(!is_escapeable_character('e'));
/// ```
pub fn is_escapeable_character(c: char) -> bool {
    // Every meta character is escapable by definition.
    if is_meta_character(c) {
        return true;
    }
    // Any character that isn't ASCII is definitely not escapable. There's
    // no real need to allow things like \☃ right?
    if !c.is_ascii() {
        return false;
    }
    // Otherwise, everything is escapable except letters and digits. Things
    // like \3 are either octal (when enabled) or an error, and we should
    // keep it that way; unused letters stay reserved for adding new syntax
    // in a backwards compatible way. The '<' and '>' characters are also
    // withheld: \< and \> are now supported escape sequences (word boundary
    // assertions), so they must not be classified as superfluous escapes.
    !(c.is_ascii_alphanumeric() || matches!(c, '<' | '>'))
}
/// Returns true if and only if the given character is a Unicode word
/// character.
///
/// A Unicode word character is defined by
/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
/// In particular, a character
/// is considered a word character if it is in either of the `Alphabetic` or
/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark`
/// or `Connector_Punctuation` general categories.
///
/// # Panics
///
/// If the `unicode-perl` feature is not enabled, then this function
/// panics. For this reason, it is recommended that callers use
/// [`try_is_word_character`] instead.
pub fn is_word_character(c: char) -> bool {
    // The fallible variant only errs when the Unicode tables are absent.
    let result = try_is_word_character(c);
    result.expect("unicode-perl feature must be enabled")
}
/// Returns true if and only if the given character is a Unicode word
/// character.
///
/// A Unicode word character is defined by
/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
/// In particular, a character
/// is considered a word character if it is in either of the `Alphabetic` or
/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark`
/// or `Connector_Punctuation` general categories.
///
/// # Errors
///
/// If the `unicode-perl` feature is not enabled, then this function always
/// returns an error.
pub fn try_is_word_character(
    c: char,
) -> core::result::Result<bool, UnicodeWordError> {
    // Delegate to the (feature-gated) Unicode table lookup; per the docs
    // above, this errors when the `unicode-perl` tables are compiled out.
    unicode::is_word_character(c)
}
/// Returns true if and only if the given character is an ASCII word character.
///
/// An ASCII word character is defined by the following character class:
/// `[_0-9a-zA-Z]`.
pub fn is_word_byte(c: u8) -> bool {
    // Equivalent to matching the class [_0-9a-zA-Z].
    c == b'_' || c.is_ascii_alphanumeric()
}
#[cfg(test)]
mod tests {
    use alloc::string::ToString;
    use super::*;

    // Every meta/escapable punctuation character should come back with a
    // backslash prefix.
    #[test]
    fn escape_meta() {
        assert_eq!(
            escape(r"\.+*?()|[]{}^$#&-~"),
            r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~".to_string()
        );
    }

    // ASCII word-byte classification: [_0-9a-zA-Z] only.
    #[test]
    fn word_byte() {
        assert!(is_word_byte(b'a'));
        assert!(!is_word_byte(b'-'));
    }

    // Spot-checks word characters across several Unicode versions' scripts
    // to ensure the bundled tables are current.
    #[test]
    #[cfg(feature = "unicode-perl")]
    fn word_char() {
        assert!(is_word_character('a'), "ASCII");
        assert!(is_word_character('à'), "Latin-1");
        assert!(is_word_character('β'), "Greek");
        assert!(is_word_character('\u{11011}'), "Brahmi (Unicode 6.0)");
        assert!(is_word_character('\u{11611}'), "Modi (Unicode 7.0)");
        assert!(is_word_character('\u{11711}'), "Ahom (Unicode 8.0)");
        assert!(is_word_character('\u{17828}'), "Tangut (Unicode 9.0)");
        assert!(is_word_character('\u{1B1B1}'), "Nushu (Unicode 10.0)");
        assert!(is_word_character('\u{16E40}'), "Medefaidrin (Unicode 11.0)");
        assert!(!is_word_character('-'));
        assert!(!is_word_character('☃'));
    }

    // Without the tables, the panicking API must panic...
    #[test]
    #[should_panic]
    #[cfg(not(feature = "unicode-perl"))]
    fn word_char_disabled_panic() {
        assert!(is_word_character('a'));
    }

    // ...and the fallible API must return an error.
    #[test]
    #[cfg(not(feature = "unicode-perl"))]
    fn word_char_disabled_error() {
        assert!(try_is_word_character('a').is_err());
    }
}

254
vendor/regex-syntax/src/parser.rs vendored Normal file
View File

@@ -0,0 +1,254 @@
use crate::{ast, hir, Error};
/// A convenience routine for parsing a regex using default options.
///
/// This is equivalent to `Parser::new().parse(pattern)`.
///
/// If you need to set non-default options, then use a [`ParserBuilder`].
///
/// This routine returns an [`Hir`](hir::Hir) value. Namely, it automatically
/// parses the pattern as an [`Ast`](ast::Ast) and then invokes the translator
/// to convert the `Ast` into an `Hir`. If you need access to the `Ast`, then
/// you should use a [`ast::parse::Parser`].
pub fn parse(pattern: &str) -> Result<hir::Hir, Error> {
    let mut parser = Parser::new();
    parser.parse(pattern)
}
/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
///
/// This type combines the builder options for both the [AST
/// `ParserBuilder`](ast::parse::ParserBuilder) and the [HIR
/// `TranslatorBuilder`](hir::translate::TranslatorBuilder).
#[derive(Clone, Debug, Default)]
pub struct ParserBuilder {
    // Options governing the concrete syntax -> AST phase.
    ast: ast::parse::ParserBuilder,
    // Options governing the AST -> HIR translation phase.
    hir: hir::translate::TranslatorBuilder,
}
impl ParserBuilder {
    /// Create a new parser builder with a default configuration.
    pub fn new() -> ParserBuilder {
        ParserBuilder::default()
    }

    /// Build a parser from this configuration.
    ///
    /// The pattern to parse is supplied later, via [`Parser::parse`].
    /// (The previous doc comment incorrectly said the pattern was given
    /// here; `build` takes no pattern.)
    pub fn build(&self) -> Parser {
        Parser { ast: self.ast.build(), hir: self.hir.build() }
    }

    /// Set the nesting limit for this parser.
    ///
    /// The nesting limit controls how deep the abstract syntax tree is allowed
    /// to be. If the AST exceeds the given limit (e.g., with too many nested
    /// groups), then an error is returned by the parser.
    ///
    /// The purpose of this limit is to act as a heuristic to prevent stack
    /// overflow for consumers that do structural induction on an `Ast` using
    /// explicit recursion. While this crate never does this (instead using
    /// constant stack space and moving the call stack to the heap), other
    /// crates may.
    ///
    /// This limit is not checked until the entire Ast is parsed. Therefore,
    /// if callers want to put a limit on the amount of heap space used, then
    /// they should impose a limit on the length, in bytes, of the concrete
    /// pattern string. In particular, this is viable since this parser
    /// implementation will limit itself to heap space proportional to the
    /// length of the pattern string.
    ///
    /// Note that a nest limit of `0` will return a nest limit error for most
    /// patterns but not all. For example, a nest limit of `0` permits `a` but
    /// not `ab`, since `ab` requires a concatenation, which results in a nest
    /// depth of `1`. In general, a nest limit is not something that manifests
    /// in an obvious way in the concrete syntax, therefore, it should not be
    /// used in a granular way.
    pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
        self.ast.nest_limit(limit);
        self
    }

    /// Whether to support octal syntax or not.
    ///
    /// Octal syntax is a little-known way of uttering Unicode codepoints in
    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
    /// `\141` are all equivalent regular expressions, where the last example
    /// shows octal syntax.
    ///
    /// While supporting octal syntax isn't in and of itself a problem, it does
    /// make good error messages harder. That is, in PCRE based regex engines,
    /// syntax like `\0` invokes a backreference, which is explicitly
    /// unsupported in Rust's regex engine. However, many users expect it to
    /// be supported. Therefore, when octal support is disabled, the error
    /// message will explicitly mention that backreferences aren't supported.
    ///
    /// Octal syntax is disabled by default.
    pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
        self.ast.octal(yes);
        self
    }

    /// When disabled, translation will permit the construction of a regular
    /// expression that may match invalid UTF-8.
    ///
    /// When enabled (the default), the translator is guaranteed to produce an
    /// expression that, for non-empty matches, will only ever produce spans
    /// that are entirely valid UTF-8 (otherwise, the translator will return an
    /// error).
    ///
    /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
    /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
    /// syntax) will be allowed even though they can produce matches that split
    /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
    /// matches, and it is expected that the regex engine itself must handle
    /// these cases if necessary (perhaps by suppressing any zero-width matches
    /// that split a codepoint).
    pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.utf8(yes);
        self
    }

    /// Enable verbose mode in the regular expression.
    ///
    /// When enabled, verbose mode permits insignificant whitespace in many
    /// places in the regular expression, as well as comments. Comments are
    /// started using `#` and continue until the end of the line.
    ///
    /// By default, this is disabled. It may be selectively enabled in the
    /// regular expression by using the `x` flag regardless of this setting.
    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
        self.ast.ignore_whitespace(yes);
        self
    }

    /// Enable or disable the case insensitive flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `i` flag.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.case_insensitive(yes);
        self
    }

    /// Enable or disable the multi-line matching flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `m` flag.
    pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.multi_line(yes);
        self
    }

    /// Enable or disable the "dot matches any character" flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `s` flag.
    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.dot_matches_new_line(yes);
        self
    }

    /// Enable or disable the CRLF mode flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `R` flag.
    ///
    /// When CRLF mode is enabled, the following happens:
    ///
    /// * Unless `dot_matches_new_line` is enabled, `.` will match any character
    /// except for `\r` and `\n`.
    /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,
    /// `\r` and `\n` as line terminators. And in particular, neither will
    /// match between a `\r` and a `\n`.
    pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.crlf(yes);
        self
    }

    /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
    ///
    /// Namely, instead of `.` (by default) matching everything except for `\n`,
    /// this will cause `.` to match everything except for the byte given.
    ///
    /// If `.` is used in a context where Unicode mode is enabled and this byte
    /// isn't ASCII, then an error will be returned. When Unicode mode is
    /// disabled, then any byte is permitted, but will return an error if UTF-8
    /// mode is enabled and it is a non-ASCII byte.
    ///
    /// In short, any ASCII value for a line terminator is always okay. But a
    /// non-ASCII byte might result in an error depending on whether Unicode
    /// mode or UTF-8 mode are enabled.
    ///
    /// Note that if `R` mode is enabled then it always takes precedence and
    /// the line terminator will be treated as `\r` and `\n` simultaneously.
    ///
    /// Note also that this *doesn't* impact the look-around assertions
    /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
    /// configuration in the regex engine itself.
    pub fn line_terminator(&mut self, byte: u8) -> &mut ParserBuilder {
        self.hir.line_terminator(byte);
        self
    }

    /// Enable or disable the "swap greed" flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `U` flag.
    pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.swap_greed(yes);
        self
    }

    /// Enable or disable the Unicode flag (`u`) by default.
    ///
    /// By default this is **enabled**. It may alternatively be selectively
    /// disabled in the regular expression itself via the `u` flag.
    ///
    /// Note that unless `utf8` is disabled (it's enabled by default), a
    /// regular expression will fail to parse if Unicode mode is disabled and a
    /// sub-expression could possibly match invalid UTF-8.
    pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.unicode(yes);
        self
    }
}
/// A convenience parser for regular expressions.
///
/// This parser takes as input a regular expression pattern string (the
/// "concrete syntax") and returns a high-level intermediate representation
/// (the HIR) suitable for most types of analysis. In particular, this parser
/// hides the intermediate state of producing an AST (the "abstract syntax").
/// The AST is itself far more complex than the HIR, so this parser serves as a
/// convenience for never having to deal with it at all.
///
/// If callers have more fine grained use cases that need an AST, then please
/// see the [`ast::parse`] module.
///
/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
#[derive(Clone, Debug)]
pub struct Parser {
    // Parses the concrete syntax into an AST.
    ast: ast::parse::Parser,
    // Translates the AST into an HIR.
    hir: hir::translate::Translator,
}
impl Parser {
    /// Create a new parser with a default configuration.
    ///
    /// The parser can be run with `parse` method. The parse method returns
    /// a high level intermediate representation of the given regular
    /// expression.
    ///
    /// To set configuration options on the parser, use [`ParserBuilder`].
    pub fn new() -> Parser {
        ParserBuilder::new().build()
    }

    /// Parse the regular expression into a high level intermediate
    /// representation.
    pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir, Error> {
        // Two phases: concrete syntax -> AST, then AST -> HIR.
        let syntax_tree = self.ast.parse(pattern)?;
        let translated = self.hir.translate(pattern, &syntax_tree)?;
        Ok(translated)
    }
}

258
vendor/regex-syntax/src/rank.rs vendored Normal file
View File

@@ -0,0 +1,258 @@
// One rank per byte value (index = byte). NOTE(review): these look like
// relative frequency ranks — higher appears to mean "more common in typical
// text" ('e', ' ', '\n' are near 255; rare control bytes are low) —
// presumably used to pick the rarest byte of a literal for prefilter
// scanning. Confirm against the callers of this table.
pub(crate) const BYTE_FREQUENCIES: [u8; 256] = [
    55,  // '\x00'
    52,  // '\x01'
    51,  // '\x02'
    50,  // '\x03'
    49,  // '\x04'
    48,  // '\x05'
    47,  // '\x06'
    46,  // '\x07'
    45,  // '\x08'
    103, // '\t'
    242, // '\n'
    66,  // '\x0b'
    67,  // '\x0c'
    229, // '\r'
    44,  // '\x0e'
    43,  // '\x0f'
    42,  // '\x10'
    41,  // '\x11'
    40,  // '\x12'
    39,  // '\x13'
    38,  // '\x14'
    37,  // '\x15'
    36,  // '\x16'
    35,  // '\x17'
    34,  // '\x18'
    33,  // '\x19'
    56,  // '\x1a'
    32,  // '\x1b'
    31,  // '\x1c'
    30,  // '\x1d'
    29,  // '\x1e'
    28,  // '\x1f'
    255, // ' '
    148, // '!'
    164, // '"'
    149, // '#'
    136, // '$'
    160, // '%'
    155, // '&'
    173, // "'"
    221, // '('
    222, // ')'
    134, // '*'
    122, // '+'
    232, // ','
    202, // '-'
    215, // '.'
    224, // '/'
    208, // '0'
    220, // '1'
    204, // '2'
    187, // '3'
    183, // '4'
    179, // '5'
    177, // '6'
    168, // '7'
    178, // '8'
    200, // '9'
    226, // ':'
    195, // ';'
    154, // '<'
    184, // '='
    174, // '>'
    126, // '?'
    120, // '@'
    191, // 'A'
    157, // 'B'
    194, // 'C'
    170, // 'D'
    189, // 'E'
    162, // 'F'
    161, // 'G'
    150, // 'H'
    193, // 'I'
    142, // 'J'
    137, // 'K'
    171, // 'L'
    176, // 'M'
    185, // 'N'
    167, // 'O'
    186, // 'P'
    112, // 'Q'
    175, // 'R'
    192, // 'S'
    188, // 'T'
    156, // 'U'
    140, // 'V'
    143, // 'W'
    123, // 'X'
    133, // 'Y'
    128, // 'Z'
    147, // '['
    138, // '\\'
    146, // ']'
    114, // '^'
    223, // '_'
    151, // '`'
    249, // 'a'
    216, // 'b'
    238, // 'c'
    236, // 'd'
    253, // 'e'
    227, // 'f'
    218, // 'g'
    230, // 'h'
    247, // 'i'
    135, // 'j'
    180, // 'k'
    241, // 'l'
    233, // 'm'
    246, // 'n'
    244, // 'o'
    231, // 'p'
    139, // 'q'
    245, // 'r'
    243, // 's'
    251, // 't'
    235, // 'u'
    201, // 'v'
    196, // 'w'
    240, // 'x'
    214, // 'y'
    152, // 'z'
    182, // '{'
    205, // '|'
    181, // '}'
    127, // '~'
    27,  // '\x7f'
    212, // '\x80'
    211, // '\x81'
    210, // '\x82'
    213, // '\x83'
    228, // '\x84'
    197, // '\x85'
    169, // '\x86'
    159, // '\x87'
    131, // '\x88'
    172, // '\x89'
    105, // '\x8a'
    80,  // '\x8b'
    98,  // '\x8c'
    96,  // '\x8d'
    97,  // '\x8e'
    81,  // '\x8f'
    207, // '\x90'
    145, // '\x91'
    116, // '\x92'
    115, // '\x93'
    144, // '\x94'
    130, // '\x95'
    153, // '\x96'
    121, // '\x97'
    107, // '\x98'
    132, // '\x99'
    109, // '\x9a'
    110, // '\x9b'
    124, // '\x9c'
    111, // '\x9d'
    82,  // '\x9e'
    108, // '\x9f'
    118, // '\xa0'
    141, // '¡'
    113, // '¢'
    129, // '£'
    119, // '¤'
    125, // '¥'
    165, // '¦'
    117, // '§'
    92,  // '¨'
    106, // '©'
    83,  // 'ª'
    72,  // '«'
    99,  // '¬'
    93,  // '\xad'
    65,  // '®'
    79,  // '¯'
    166, // '°'
    237, // '±'
    163, // '²'
    199, // '³'
    190, // '´'
    225, // 'µ'
    209, // '¶'
    203, // '·'
    198, // '¸'
    217, // '¹'
    219, // 'º'
    206, // '»'
    234, // '¼'
    248, // '½'
    158, // '¾'
    239, // '¿'
    255, // 'À'
    255, // 'Á'
    255, // 'Â'
    255, // 'Ã'
    255, // 'Ä'
    255, // 'Å'
    255, // 'Æ'
    255, // 'Ç'
    255, // 'È'
    255, // 'É'
    255, // 'Ê'
    255, // 'Ë'
    255, // 'Ì'
    255, // 'Í'
    255, // 'Î'
    255, // 'Ï'
    255, // 'Ð'
    255, // 'Ñ'
    255, // 'Ò'
    255, // 'Ó'
    255, // 'Ô'
    255, // 'Õ'
    255, // 'Ö'
    255, // '×'
    255, // 'Ø'
    255, // 'Ù'
    255, // 'Ú'
    255, // 'Û'
    255, // 'Ü'
    255, // 'Ý'
    255, // 'Þ'
    255, // 'ß'
    255, // 'à'
    255, // 'á'
    255, // 'â'
    255, // 'ã'
    255, // 'ä'
    255, // 'å'
    255, // 'æ'
    255, // 'ç'
    255, // 'è'
    255, // 'é'
    255, // 'ê'
    255, // 'ë'
    255, // 'ì'
    255, // 'í'
    255, // 'î'
    255, // 'ï'
    255, // 'ð'
    255, // 'ñ'
    255, // 'ò'
    255, // 'ó'
    255, // 'ô'
    255, // 'õ'
    255, // 'ö'
    255, // '÷'
    255, // 'ø'
    255, // 'ù'
    255, // 'ú'
    255, // 'û'
    255, // 'ü'
    255, // 'ý'
    255, // 'þ'
    255, // 'ÿ'
];

1041
vendor/regex-syntax/src/unicode.rs vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,57 @@
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
Unicode Data Files include all data files under the directories
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
http://www.unicode.org/utility/trac/browser/.
Unicode Data Files do not include PDF online code charts under the
directory http://www.unicode.org/Public/.
Software includes any source code published in the Unicode Standard
or under the directories
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
http://www.unicode.org/utility/trac/browser/.
NOTICE TO USER: Carefully read the following legal agreement.
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT.
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
THE DATA FILES OR SOFTWARE.
COPYRIGHT AND PERMISSION NOTICE
Copyright © 1991-2018 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Unicode data files and any associated documentation
(the "Data Files") or Unicode software and any associated documentation
(the "Software") to deal in the Data Files or Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of
the Data Files or Software, and to permit persons to whom the Data Files
or Software are furnished to do so, provided that either
(a) this copyright and permission notice appear with all copies
of the Data Files or Software, or
(b) this copyright and permission notice appear in associated
Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale,
use or other dealings in these Data Files or Software without prior
written authorization of the copyright holder.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,57 @@
// Feature-gated submodules holding the generated Unicode data tables. Each
// module is only compiled when its corresponding crate feature is enabled;
// the `perl_decimal`/`perl_space` fallbacks exist for when `unicode-perl` is
// on but the broader gencat/bool tables are off.
#[cfg(feature = "unicode-age")]
pub mod age;
#[cfg(feature = "unicode-case")]
pub mod case_folding_simple;
#[cfg(feature = "unicode-gencat")]
pub mod general_category;
#[cfg(feature = "unicode-segment")]
pub mod grapheme_cluster_break;
#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
#[allow(dead_code)]
pub mod perl_decimal;
#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
#[allow(dead_code)]
pub mod perl_space;
#[cfg(feature = "unicode-perl")]
pub mod perl_word;
#[cfg(feature = "unicode-bool")]
pub mod property_bool;
// Property name/value alias tables are needed by every Unicode feature.
#[cfg(any(
    feature = "unicode-age",
    feature = "unicode-bool",
    feature = "unicode-gencat",
    feature = "unicode-perl",
    feature = "unicode-script",
    feature = "unicode-segment",
))]
pub mod property_names;
#[cfg(any(
    feature = "unicode-age",
    feature = "unicode-bool",
    feature = "unicode-gencat",
    feature = "unicode-perl",
    feature = "unicode-script",
    feature = "unicode-segment",
))]
pub mod property_values;
#[cfg(feature = "unicode-script")]
pub mod script;
#[cfg(feature = "unicode-script")]
pub mod script_extension;
#[cfg(feature = "unicode-segment")]
pub mod sentence_break;
#[cfg(feature = "unicode-segment")]
pub mod word_break;

View File

@@ -0,0 +1,84 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate general-category ucd-16.0.0 --chars --include decimalnumber
//
// Unicode version: 16.0.0.
//
// ucd-generate 0.3.1 is available on crates.io.
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] =
&[("Decimal_Number", DECIMAL_NUMBER)];
pub const DECIMAL_NUMBER: &'static [(char, char)] = &[
('0', '9'),
('٠', '٩'),
('۰', '۹'),
('߀', '߉'),
('', '९'),
('', '৯'),
('', '੯'),
('', '૯'),
('', '୯'),
('', '௯'),
('', '౯'),
('', '೯'),
('', '൯'),
('෦', '෯'),
('', '๙'),
('', '໙'),
('༠', '༩'),
('', '၉'),
('႐', '႙'),
('០', '៩'),
('᠐', '᠙'),
('᥆', '᥏'),
('᧐', '᧙'),
('᪀', '᪉'),
('᪐', '᪙'),
('᭐', '᭙'),
('᮰', '᮹'),
('᱀', '᱉'),
('᱐', '᱙'),
('꘠', '꘩'),
('꣐', '꣙'),
('꤀', '꤉'),
('꧐', '꧙'),
('꧰', '꧹'),
('꩐', '꩙'),
('꯰', '꯹'),
('', ''),
('𐒠', '𐒩'),
('𐴰', '𐴹'),
('𐵀', '𐵉'),
('𑁦', '𑁯'),
('𑃰', '𑃹'),
('𑄶', '𑄿'),
('𑇐', '𑇙'),
('𑋰', '𑋹'),
('𑑐', '𑑙'),
('𑓐', '𑓙'),
('𑙐', '𑙙'),
('𑛀', '𑛉'),
('𑛐', '𑛣'),
('𑜰', '𑜹'),
('𑣠', '𑣩'),
('𑥐', '𑥙'),
('𑯰', '𑯹'),
('𑱐', '𑱙'),
('𑵐', '𑵙'),
('𑶠', '𑶩'),
('𑽐', '𑽙'),
('𖄰', '𖄹'),
('𖩠', '𖩩'),
('𖫀', '𖫉'),
('𖭐', '𖭙'),
('𖵰', '𖵹'),
('𜳰', '𜳹'),
('𝟎', '𝟿'),
('𞅀', '𞅉'),
('𞋰', '𞋹'),
('𞓰', '𞓹'),
('𞗱', '𞗺'),
('𞥐', '𞥙'),
('🯰', '🯹'),
];

View File

@@ -0,0 +1,23 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
//   ucd-generate property-bool ucd-16.0.0 --chars --include whitespace
//
// Unicode version: 16.0.0.
//
// ucd-generate 0.3.1 is available on crates.io.

// Lookup table keyed by canonical property name.
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] =
    &[("White_Space", WHITE_SPACE)];

// Codepoints with the White_Space property, as sorted inclusive ranges.
pub const WHITE_SPACE: &'static [(char, char)] = &[
    ('\t', '\r'),
    (' ', ' '),
    ('\u{85}', '\u{85}'),
    ('\u{a0}', '\u{a0}'),
    ('\u{1680}', '\u{1680}'),
    ('\u{2000}', '\u{200a}'),
    ('\u{2028}', '\u{2029}'),
    ('\u{202f}', '\u{202f}'),
    ('\u{205f}', '\u{205f}'),
    ('\u{3000}', '\u{3000}'),
];

View File

@@ -0,0 +1,806 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate perl-word ucd-16.0.0 --chars
//
// Unicode version: 16.0.0.
//
// ucd-generate 0.3.1 is available on crates.io.
pub const PERL_WORD: &'static [(char, char)] = &[
('0', '9'),
('A', 'Z'),
('_', '_'),
('a', 'z'),
('ª', 'ª'),
('µ', 'µ'),
('º', 'º'),
('À', 'Ö'),
('Ø', 'ö'),
('ø', 'ˁ'),
('ˆ', 'ˑ'),
('ˠ', 'ˤ'),
('ˬ', 'ˬ'),
('ˮ', 'ˮ'),
('\u{300}', 'ʹ'),
('Ͷ', 'ͷ'),
('ͺ', 'ͽ'),
('Ϳ', 'Ϳ'),
('Ά', 'Ά'),
('Έ', 'Ί'),
('Ό', 'Ό'),
('Ύ', 'Ρ'),
('Σ', 'ϵ'),
('Ϸ', 'ҁ'),
('\u{483}', 'ԯ'),
('Ա', 'Ֆ'),
('ՙ', 'ՙ'),
('ՠ', 'ֈ'),
('\u{591}', '\u{5bd}'),
('\u{5bf}', '\u{5bf}'),
('\u{5c1}', '\u{5c2}'),
('\u{5c4}', '\u{5c5}'),
('\u{5c7}', '\u{5c7}'),
('א', 'ת'),
('ׯ', 'ײ'),
('\u{610}', '\u{61a}'),
('ؠ', '٩'),
('ٮ', 'ۓ'),
('ە', '\u{6dc}'),
('\u{6df}', '\u{6e8}'),
('\u{6ea}', 'ۼ'),
('ۿ', 'ۿ'),
('ܐ', '\u{74a}'),
('ݍ', 'ޱ'),
('߀', 'ߵ'),
('ߺ', 'ߺ'),
('\u{7fd}', '\u{7fd}'),
('ࠀ', '\u{82d}'),
('ࡀ', '\u{85b}'),
('ࡠ', 'ࡪ'),
('ࡰ', 'ࢇ'),
('ࢉ', 'ࢎ'),
('\u{897}', '\u{8e1}'),
('\u{8e3}', '\u{963}'),
('', '९'),
('ॱ', 'ঃ'),
('অ', 'ঌ'),
('এ', 'ঐ'),
('ও', 'ন'),
('প', 'র'),
('ল', 'ল'),
('শ', 'হ'),
('\u{9bc}', '\u{9c4}'),
('ে', 'ৈ'),
('ো', 'ৎ'),
('\u{9d7}', '\u{9d7}'),
('ড়', 'ঢ়'),
('য়', '\u{9e3}'),
('', 'ৱ'),
('ৼ', 'ৼ'),
('\u{9fe}', '\u{9fe}'),
('\u{a01}', 'ਃ'),
('ਅ', 'ਊ'),
('ਏ', 'ਐ'),
('ਓ', 'ਨ'),
('ਪ', 'ਰ'),
('ਲ', 'ਲ਼'),
('ਵ', 'ਸ਼'),
('ਸ', 'ਹ'),
('\u{a3c}', '\u{a3c}'),
('ਾ', '\u{a42}'),
('\u{a47}', '\u{a48}'),
('\u{a4b}', '\u{a4d}'),
('\u{a51}', '\u{a51}'),
('ਖ਼', 'ੜ'),
('ਫ਼', 'ਫ਼'),
('', '\u{a75}'),
('\u{a81}', ''),
('અ', 'ઍ'),
('એ', 'ઑ'),
('ઓ', 'ન'),
('પ', 'ર'),
('લ', 'ળ'),
('વ', 'હ'),
('\u{abc}', '\u{ac5}'),
('\u{ac7}', 'ૉ'),
('ો', '\u{acd}'),
('ૐ', 'ૐ'),
('ૠ', '\u{ae3}'),
('', '૯'),
('ૹ', '\u{aff}'),
('\u{b01}', ''),
('ଅ', 'ଌ'),
('ଏ', 'ଐ'),
('ଓ', 'ନ'),
('ପ', 'ର'),
('ଲ', 'ଳ'),
('ଵ', 'ହ'),
('\u{b3c}', '\u{b44}'),
('େ', 'ୈ'),
('ୋ', '\u{b4d}'),
('\u{b55}', '\u{b57}'),
('ଡ଼', 'ଢ଼'),
('ୟ', '\u{b63}'),
('', '୯'),
('ୱ', 'ୱ'),
('\u{b82}', 'ஃ'),
('அ', 'ஊ'),
('எ', 'ஐ'),
('ஒ', 'க'),
('ங', 'ச'),
('ஜ', 'ஜ'),
('ஞ', 'ட'),
('ண', 'த'),
('ந', 'ப'),
('ம', 'ஹ'),
('\u{bbe}', 'ூ'),
('ெ', 'ை'),
('ொ', '\u{bcd}'),
('ௐ', 'ௐ'),
('\u{bd7}', '\u{bd7}'),
('', '௯'),
('\u{c00}', 'ఌ'),
('ఎ', 'ఐ'),
('ఒ', 'న'),
('ప', 'హ'),
('\u{c3c}', 'ౄ'),
('\u{c46}', '\u{c48}'),
('\u{c4a}', '\u{c4d}'),
('\u{c55}', '\u{c56}'),
('ౘ', 'ౚ'),
('ౝ', 'ౝ'),
('ౠ', '\u{c63}'),
('', '౯'),
('ಀ', 'ಃ'),
('ಅ', 'ಌ'),
('ಎ', 'ಐ'),
('ಒ', 'ನ'),
('ಪ', 'ಳ'),
('ವ', 'ಹ'),
('\u{cbc}', 'ೄ'),
('\u{cc6}', '\u{cc8}'),
('\u{cca}', '\u{ccd}'),
('\u{cd5}', '\u{cd6}'),
('ೝ', 'ೞ'),
('ೠ', '\u{ce3}'),
('', '೯'),
('ೱ', 'ೳ'),
('\u{d00}', 'ഌ'),
('എ', 'ഐ'),
('ഒ', '\u{d44}'),
('െ', 'ൈ'),
('ൊ', 'ൎ'),
('ൔ', '\u{d57}'),
('ൟ', '\u{d63}'),
('', '൯'),
('ൺ', 'ൿ'),
('\u{d81}', 'ඃ'),
('අ', 'ඖ'),
('ක', 'න'),
('ඳ', 'ර'),
('ල', 'ල'),
('ව', 'ෆ'),
('\u{dca}', '\u{dca}'),
('\u{dcf}', '\u{dd4}'),
('\u{dd6}', '\u{dd6}'),
('ෘ', '\u{ddf}'),
('෦', '෯'),
('ෲ', 'ෳ'),
('ก', '\u{e3a}'),
('เ', '\u{e4e}'),
('', '๙'),
('ກ', 'ຂ'),
('ຄ', 'ຄ'),
('ຆ', 'ຊ'),
('ຌ', 'ຣ'),
('ລ', 'ລ'),
('ວ', 'ຽ'),
('ເ', 'ໄ'),
('ໆ', 'ໆ'),
('\u{ec8}', '\u{ece}'),
('', '໙'),
('ໜ', 'ໟ'),
('ༀ', 'ༀ'),
('\u{f18}', '\u{f19}'),
('༠', '༩'),
('\u{f35}', '\u{f35}'),
('\u{f37}', '\u{f37}'),
('\u{f39}', '\u{f39}'),
('༾', 'ཇ'),
('ཉ', 'ཬ'),
('\u{f71}', '\u{f84}'),
('\u{f86}', '\u{f97}'),
('\u{f99}', '\u{fbc}'),
('\u{fc6}', '\u{fc6}'),
('က', '၉'),
('ၐ', '\u{109d}'),
('Ⴀ', 'Ⴥ'),
('Ⴧ', 'Ⴧ'),
('Ⴭ', 'Ⴭ'),
('ა', 'ჺ'),
('ჼ', 'ቈ'),
('ቊ', 'ቍ'),
('ቐ', 'ቖ'),
('ቘ', 'ቘ'),
('ቚ', 'ቝ'),
('በ', 'ኈ'),
('ኊ', 'ኍ'),
('ነ', 'ኰ'),
('ኲ', 'ኵ'),
('ኸ', 'ኾ'),
('ዀ', 'ዀ'),
('ዂ', 'ዅ'),
('ወ', 'ዖ'),
('ዘ', 'ጐ'),
('ጒ', 'ጕ'),
('ጘ', 'ፚ'),
('\u{135d}', '\u{135f}'),
('ᎀ', 'ᎏ'),
('', 'Ᏽ'),
('ᏸ', 'ᏽ'),
('ᐁ', 'ᙬ'),
('ᙯ', 'ᙿ'),
('ᚁ', 'ᚚ'),
('ᚠ', 'ᛪ'),
('ᛮ', 'ᛸ'),
('ᜀ', '\u{1715}'),
('ᜟ', '\u{1734}'),
('ᝀ', '\u{1753}'),
('ᝠ', 'ᝬ'),
('ᝮ', 'ᝰ'),
('\u{1772}', '\u{1773}'),
('ក', '\u{17d3}'),
('ៗ', 'ៗ'),
('ៜ', '\u{17dd}'),
('០', '៩'),
('\u{180b}', '\u{180d}'),
('\u{180f}', '᠙'),
('ᠠ', 'ᡸ'),
('ᢀ', 'ᢪ'),
('ᢰ', 'ᣵ'),
('ᤀ', 'ᤞ'),
('\u{1920}', 'ᤫ'),
('ᤰ', '\u{193b}'),
('᥆', 'ᥭ'),
('ᥰ', 'ᥴ'),
('ᦀ', 'ᦫ'),
('ᦰ', 'ᧉ'),
('᧐', '᧙'),
('ᨀ', '\u{1a1b}'),
('ᨠ', '\u{1a5e}'),
('\u{1a60}', '\u{1a7c}'),
('\u{1a7f}', '᪉'),
('᪐', '᪙'),
('ᪧ', 'ᪧ'),
('\u{1ab0}', '\u{1ace}'),
('\u{1b00}', 'ᭌ'),
('᭐', '᭙'),
('\u{1b6b}', '\u{1b73}'),
('\u{1b80}', '\u{1bf3}'),
('ᰀ', '\u{1c37}'),
('᱀', '᱉'),
('ᱍ', 'ᱽ'),
('ᲀ', 'ᲊ'),
('Ა', 'Ჺ'),
('Ჽ', 'Ჿ'),
('\u{1cd0}', '\u{1cd2}'),
('\u{1cd4}', 'ᳺ'),
('ᴀ', 'ἕ'),
('Ἐ', 'Ἕ'),
('ἠ', 'ὅ'),
('Ὀ', 'Ὅ'),
('ὐ', 'ὗ'),
('Ὑ', 'Ὑ'),
('Ὓ', 'Ὓ'),
('Ὕ', 'Ὕ'),
('Ὗ', 'ώ'),
('ᾀ', 'ᾴ'),
('ᾶ', 'ᾼ'),
('', ''),
('ῂ', 'ῄ'),
('ῆ', 'ῌ'),
('ῐ', 'ΐ'),
('ῖ', 'Ί'),
('ῠ', 'Ῥ'),
('ῲ', 'ῴ'),
('ῶ', 'ῼ'),
('\u{200c}', '\u{200d}'),
('‿', '⁀'),
('⁔', '⁔'),
('ⁱ', 'ⁱ'),
('ⁿ', 'ⁿ'),
('ₐ', 'ₜ'),
('\u{20d0}', '\u{20f0}'),
('', ''),
('ℇ', 'ℇ'),
('', ''),
('', ''),
('', ''),
('', ''),
('Ω', 'Ω'),
('', ''),
('', ''),
('', ''),
('ℼ', 'ℿ'),
('', ''),
('ⅎ', 'ⅎ'),
('', 'ↈ'),
('Ⓐ', 'ⓩ'),
('Ⰰ', 'ⳤ'),
('Ⳬ', 'ⳳ'),
('ⴀ', 'ⴥ'),
('ⴧ', 'ⴧ'),
('ⴭ', 'ⴭ'),
('ⴰ', 'ⵧ'),
('ⵯ', 'ⵯ'),
('\u{2d7f}', 'ⶖ'),
('ⶠ', 'ⶦ'),
('ⶨ', 'ⶮ'),
('ⶰ', 'ⶶ'),
('ⶸ', 'ⶾ'),
('ⷀ', 'ⷆ'),
('ⷈ', 'ⷎ'),
('ⷐ', 'ⷖ'),
('ⷘ', 'ⷞ'),
('\u{2de0}', '\u{2dff}'),
('ⸯ', 'ⸯ'),
('々', ''),
('〡', '\u{302f}'),
('〱', '〵'),
('〸', '〼'),
('ぁ', 'ゖ'),
('\u{3099}', '\u{309a}'),
('ゝ', 'ゟ'),
('ァ', 'ヺ'),
('ー', 'ヿ'),
('ㄅ', 'ㄯ'),
('ㄱ', 'ㆎ'),
('ㆠ', 'ㆿ'),
('ㇰ', 'ㇿ'),
('㐀', '䶿'),
('一', 'ꒌ'),
('', ''),
('ꔀ', 'ꘌ'),
('ꘐ', 'ꘫ'),
('Ꙁ', '\u{a672}'),
('\u{a674}', '\u{a67d}'),
('ꙿ', '\u{a6f1}'),
('ꜗ', 'ꜟ'),
('Ꜣ', 'ꞈ'),
('Ꞌ', 'ꟍ'),
('Ꟑ', 'ꟑ'),
('ꟓ', 'ꟓ'),
('ꟕ', 'Ƛ'),
('ꟲ', 'ꠧ'),
('\u{a82c}', '\u{a82c}'),
('ꡀ', 'ꡳ'),
('ꢀ', '\u{a8c5}'),
('꣐', '꣙'),
('\u{a8e0}', 'ꣷ'),
('ꣻ', 'ꣻ'),
('ꣽ', '\u{a92d}'),
('ꤰ', '\u{a953}'),
('ꥠ', 'ꥼ'),
('\u{a980}', '\u{a9c0}'),
('ꧏ', '꧙'),
('ꧠ', 'ꧾ'),
('ꨀ', '\u{aa36}'),
('ꩀ', 'ꩍ'),
('꩐', '꩙'),
('ꩠ', 'ꩶ'),
('ꩺ', 'ꫂ'),
('ꫛ', 'ꫝ'),
('ꫠ', 'ꫯ'),
('ꫲ', '\u{aaf6}'),
('ꬁ', 'ꬆ'),
('ꬉ', 'ꬎ'),
('ꬑ', 'ꬖ'),
('ꬠ', 'ꬦ'),
('ꬨ', 'ꬮ'),
('ꬰ', ''),
('ꭜ', 'ꭩ'),
('ꭰ', 'ꯪ'),
('꯬', '\u{abed}'),
('꯰', '꯹'),
('가', '힣'),
('ힰ', 'ퟆ'),
('ퟋ', 'ퟻ'),
('豈', '舘'),
('並', '龎'),
('ff', 'st'),
('ﬓ', 'ﬗ'),
('יִ', 'ﬨ'),
('שׁ', 'זּ'),
('טּ', 'לּ'),
('מּ', 'מּ'),
('נּ', 'סּ'),
('ףּ', 'פּ'),
('צּ', 'ﮱ'),
('ﯓ', 'ﴽ'),
('ﵐ', 'ﶏ'),
('ﶒ', 'ﷇ'),
('ﷰ', 'ﷻ'),
('\u{fe00}', '\u{fe0f}'),
('\u{fe20}', '\u{fe2f}'),
('︳', '︴'),
('', ''),
('ﹰ', 'ﹴ'),
('ﹶ', 'ﻼ'),
('', ''),
('', ''),
('_', '_'),
('', ''),
('ヲ', 'ᄒ'),
('ᅡ', 'ᅦ'),
('ᅧ', 'ᅬ'),
('ᅭ', 'ᅲ'),
('ᅳ', 'ᅵ'),
('𐀀', '𐀋'),
('𐀍', '𐀦'),
('𐀨', '𐀺'),
('𐀼', '𐀽'),
('𐀿', '𐁍'),
('𐁐', '𐁝'),
('𐂀', '𐃺'),
('𐅀', '𐅴'),
('\u{101fd}', '\u{101fd}'),
('𐊀', '𐊜'),
('𐊠', '𐋐'),
('\u{102e0}', '\u{102e0}'),
('𐌀', '𐌟'),
('𐌭', '𐍊'),
('𐍐', '\u{1037a}'),
('𐎀', '𐎝'),
('𐎠', '𐏃'),
('𐏈', '𐏏'),
('𐏑', '𐏕'),
('𐐀', '𐒝'),
('𐒠', '𐒩'),
('𐒰', '𐓓'),
('𐓘', '𐓻'),
('𐔀', '𐔧'),
('𐔰', '𐕣'),
('𐕰', '𐕺'),
('𐕼', '𐖊'),
('𐖌', '𐖒'),
('𐖔', '𐖕'),
('𐖗', '𐖡'),
('𐖣', '𐖱'),
('𐖳', '𐖹'),
('𐖻', '𐖼'),
('𐗀', '𐗳'),
('𐘀', '𐜶'),
('𐝀', '𐝕'),
('𐝠', '𐝧'),
('𐞀', '𐞅'),
('𐞇', '𐞰'),
('𐞲', '𐞺'),
('𐠀', '𐠅'),
('𐠈', '𐠈'),
('𐠊', '𐠵'),
('𐠷', '𐠸'),
('𐠼', '𐠼'),
('𐠿', '𐡕'),
('𐡠', '𐡶'),
('𐢀', '𐢞'),
('𐣠', '𐣲'),
('𐣴', '𐣵'),
('𐤀', '𐤕'),
('𐤠', '𐤹'),
('𐦀', '𐦷'),
('𐦾', '𐦿'),
('𐨀', '\u{10a03}'),
('\u{10a05}', '\u{10a06}'),
('\u{10a0c}', '𐨓'),
('𐨕', '𐨗'),
('𐨙', '𐨵'),
('\u{10a38}', '\u{10a3a}'),
('\u{10a3f}', '\u{10a3f}'),
('𐩠', '𐩼'),
('𐪀', '𐪜'),
('𐫀', '𐫇'),
('𐫉', '\u{10ae6}'),
('𐬀', '𐬵'),
('𐭀', '𐭕'),
('𐭠', '𐭲'),
('𐮀', '𐮑'),
('𐰀', '𐱈'),
('𐲀', '𐲲'),
('𐳀', '𐳲'),
('𐴀', '\u{10d27}'),
('𐴰', '𐴹'),
('𐵀', '𐵥'),
('\u{10d69}', '\u{10d6d}'),
('𐵯', '𐶅'),
('𐺀', '𐺩'),
('\u{10eab}', '\u{10eac}'),
('𐺰', '𐺱'),
('𐻂', '𐻄'),
('\u{10efc}', '𐼜'),
('𐼧', '𐼧'),
('𐼰', '\u{10f50}'),
('𐽰', '\u{10f85}'),
('𐾰', '𐿄'),
('𐿠', '𐿶'),
('𑀀', '\u{11046}'),
('𑁦', '𑁵'),
('\u{1107f}', '\u{110ba}'),
('\u{110c2}', '\u{110c2}'),
('𑃐', '𑃨'),
('𑃰', '𑃹'),
('\u{11100}', '\u{11134}'),
('𑄶', '𑄿'),
('𑅄', '𑅇'),
('𑅐', '\u{11173}'),
('𑅶', '𑅶'),
('\u{11180}', '𑇄'),
('\u{111c9}', '\u{111cc}'),
('𑇎', '𑇚'),
('𑇜', '𑇜'),
('𑈀', '𑈑'),
('𑈓', '\u{11237}'),
('\u{1123e}', '\u{11241}'),
('𑊀', '𑊆'),
('𑊈', '𑊈'),
('𑊊', '𑊍'),
('𑊏', '𑊝'),
('𑊟', '𑊨'),
('𑊰', '\u{112ea}'),
('𑋰', '𑋹'),
('\u{11300}', '𑌃'),
('𑌅', '𑌌'),
('𑌏', '𑌐'),
('𑌓', '𑌨'),
('𑌪', '𑌰'),
('𑌲', '𑌳'),
('𑌵', '𑌹'),
('\u{1133b}', '𑍄'),
('𑍇', '𑍈'),
('𑍋', '\u{1134d}'),
('𑍐', '𑍐'),
('\u{11357}', '\u{11357}'),
('𑍝', '𑍣'),
('\u{11366}', '\u{1136c}'),
('\u{11370}', '\u{11374}'),
('𑎀', '𑎉'),
('𑎋', '𑎋'),
('𑎎', '𑎎'),
('𑎐', '𑎵'),
('𑎷', '\u{113c0}'),
('\u{113c2}', '\u{113c2}'),
('\u{113c5}', '\u{113c5}'),
('\u{113c7}', '𑏊'),
('𑏌', '𑏓'),
('\u{113e1}', '\u{113e2}'),
('𑐀', '𑑊'),
('𑑐', '𑑙'),
('\u{1145e}', '𑑡'),
('𑒀', '𑓅'),
('𑓇', '𑓇'),
('𑓐', '𑓙'),
('𑖀', '\u{115b5}'),
('𑖸', '\u{115c0}'),
('𑗘', '\u{115dd}'),
('𑘀', '\u{11640}'),
('𑙄', '𑙄'),
('𑙐', '𑙙'),
('𑚀', '𑚸'),
('𑛀', '𑛉'),
('𑛐', '𑛣'),
('𑜀', '𑜚'),
('\u{1171d}', '\u{1172b}'),
('𑜰', '𑜹'),
('𑝀', '𑝆'),
('𑠀', '\u{1183a}'),
('𑢠', '𑣩'),
('𑣿', '𑤆'),
('𑤉', '𑤉'),
('𑤌', '𑤓'),
('𑤕', '𑤖'),
('𑤘', '𑤵'),
('𑤷', '𑤸'),
('\u{1193b}', '\u{11943}'),
('𑥐', '𑥙'),
('𑦠', '𑦧'),
('𑦪', '\u{119d7}'),
('\u{119da}', '𑧡'),
('𑧣', '𑧤'),
('𑨀', '\u{11a3e}'),
('\u{11a47}', '\u{11a47}'),
('𑩐', '\u{11a99}'),
('𑪝', '𑪝'),
('𑪰', '𑫸'),
('𑯀', '𑯠'),
('𑯰', '𑯹'),
('𑰀', '𑰈'),
('𑰊', '\u{11c36}'),
('\u{11c38}', '𑱀'),
('𑱐', '𑱙'),
('𑱲', '𑲏'),
('\u{11c92}', '\u{11ca7}'),
('𑲩', '\u{11cb6}'),
('𑴀', '𑴆'),
('𑴈', '𑴉'),
('𑴋', '\u{11d36}'),
('\u{11d3a}', '\u{11d3a}'),
('\u{11d3c}', '\u{11d3d}'),
('\u{11d3f}', '\u{11d47}'),
('𑵐', '𑵙'),
('𑵠', '𑵥'),
('𑵧', '𑵨'),
('𑵪', '𑶎'),
('\u{11d90}', '\u{11d91}'),
('𑶓', '𑶘'),
('𑶠', '𑶩'),
('𑻠', '𑻶'),
('\u{11f00}', '𑼐'),
('𑼒', '\u{11f3a}'),
('𑼾', '\u{11f42}'),
('𑽐', '\u{11f5a}'),
('𑾰', '𑾰'),
('𒀀', '𒎙'),
('𒐀', '𒑮'),
('𒒀', '𒕃'),
('𒾐', '𒿰'),
('𓀀', '𓐯'),
('\u{13440}', '\u{13455}'),
('𓑠', '𔏺'),
('𔐀', '𔙆'),
('𖄀', '𖄹'),
('𖠀', '𖨸'),
('𖩀', '𖩞'),
('𖩠', '𖩩'),
('𖩰', '𖪾'),
('𖫀', '𖫉'),
('𖫐', '𖫭'),
('\u{16af0}', '\u{16af4}'),
('𖬀', '\u{16b36}'),
('𖭀', '𖭃'),
('𖭐', '𖭙'),
('𖭣', '𖭷'),
('𖭽', '𖮏'),
('𖵀', '𖵬'),
('𖵰', '𖵹'),
('𖹀', '𖹿'),
('𖼀', '𖽊'),
('\u{16f4f}', '𖾇'),
('\u{16f8f}', '𖾟'),
('𖿠', '𖿡'),
('𖿣', '\u{16fe4}'),
('\u{16ff0}', '\u{16ff1}'),
('𗀀', '𘟷'),
('𘠀', '𘳕'),
('𘳿', '𘴈'),
('𚿰', '𚿳'),
('𚿵', '𚿻'),
('𚿽', '𚿾'),
('𛀀', '𛄢'),
('𛄲', '𛄲'),
('𛅐', '𛅒'),
('𛅕', '𛅕'),
('𛅤', '𛅧'),
('𛅰', '𛋻'),
('𛰀', '𛱪'),
('𛱰', '𛱼'),
('𛲀', '𛲈'),
('𛲐', '𛲙'),
('\u{1bc9d}', '\u{1bc9e}'),
('𜳰', '𜳹'),
('\u{1cf00}', '\u{1cf2d}'),
('\u{1cf30}', '\u{1cf46}'),
('\u{1d165}', '\u{1d169}'),
('\u{1d16d}', '\u{1d172}'),
('\u{1d17b}', '\u{1d182}'),
('\u{1d185}', '\u{1d18b}'),
('\u{1d1aa}', '\u{1d1ad}'),
('\u{1d242}', '\u{1d244}'),
('𝐀', '𝑔'),
('𝑖', '𝒜'),
('𝒞', '𝒟'),
('𝒢', '𝒢'),
('𝒥', '𝒦'),
('𝒩', '𝒬'),
('𝒮', '𝒹'),
('𝒻', '𝒻'),
('𝒽', '𝓃'),
('𝓅', '𝔅'),
('𝔇', '𝔊'),
('𝔍', '𝔔'),
('𝔖', '𝔜'),
('𝔞', '𝔹'),
('𝔻', '𝔾'),
('𝕀', '𝕄'),
('𝕆', '𝕆'),
('𝕊', '𝕐'),
('𝕒', '𝚥'),
('𝚨', '𝛀'),
('𝛂', '𝛚'),
('𝛜', '𝛺'),
('𝛼', '𝜔'),
('𝜖', '𝜴'),
('𝜶', '𝝎'),
('𝝐', '𝝮'),
('𝝰', '𝞈'),
('𝞊', '𝞨'),
('𝞪', '𝟂'),
('𝟄', '𝟋'),
('𝟎', '𝟿'),
('\u{1da00}', '\u{1da36}'),
('\u{1da3b}', '\u{1da6c}'),
('\u{1da75}', '\u{1da75}'),
('\u{1da84}', '\u{1da84}'),
('\u{1da9b}', '\u{1da9f}'),
('\u{1daa1}', '\u{1daaf}'),
('𝼀', '𝼞'),
('𝼥', '𝼪'),
('\u{1e000}', '\u{1e006}'),
('\u{1e008}', '\u{1e018}'),
('\u{1e01b}', '\u{1e021}'),
('\u{1e023}', '\u{1e024}'),
('\u{1e026}', '\u{1e02a}'),
('𞀰', '𞁭'),
('\u{1e08f}', '\u{1e08f}'),
('𞄀', '𞄬'),
('\u{1e130}', '𞄽'),
('𞅀', '𞅉'),
('𞅎', '𞅎'),
('𞊐', '\u{1e2ae}'),
('𞋀', '𞋹'),
('𞓐', '𞓹'),
('𞗐', '𞗺'),
('𞟠', '𞟦'),
('𞟨', '𞟫'),
('𞟭', '𞟮'),
('𞟰', '𞟾'),
('𞠀', '𞣄'),
('\u{1e8d0}', '\u{1e8d6}'),
('𞤀', '𞥋'),
('𞥐', '𞥙'),
('𞸀', '𞸃'),
('𞸅', '𞸟'),
('𞸡', '𞸢'),
('𞸤', '𞸤'),
('𞸧', '𞸧'),
('𞸩', '𞸲'),
('𞸴', '𞸷'),
('𞸹', '𞸹'),
('𞸻', '𞸻'),
('𞹂', '𞹂'),
('𞹇', '𞹇'),
('𞹉', '𞹉'),
('𞹋', '𞹋'),
('𞹍', '𞹏'),
('𞹑', '𞹒'),
('𞹔', '𞹔'),
('𞹗', '𞹗'),
('𞹙', '𞹙'),
('𞹛', '𞹛'),
('𞹝', '𞹝'),
('𞹟', '𞹟'),
('𞹡', '𞹢'),
('𞹤', '𞹤'),
('𞹧', '𞹪'),
('𞹬', '𞹲'),
('𞹴', '𞹷'),
('𞹹', '𞹼'),
('𞹾', '𞹾'),
('𞺀', '𞺉'),
('𞺋', '𞺛'),
('𞺡', '𞺣'),
('𞺥', '𞺩'),
('𞺫', '𞺻'),
('🄰', '🅉'),
('🅐', '🅩'),
('🅰', '🆉'),
('🯰', '🯹'),
('𠀀', '𪛟'),
('𪜀', '𫜹'),
('𫝀', '𫠝'),
('𫠠', '𬺡'),
('𬺰', '𮯠'),
('𮯰', '𮹝'),
('丽', '𪘀'),
('𰀀', '𱍊'),
('𱍐', '𲎯'),
('\u{e0100}', '\u{e01ef}'),
];

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,281 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-names ucd-16.0.0
//
// Unicode version: 16.0.0.
//
// ucd-generate 0.3.1 is available on crates.io.
pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
("age", "Age"),
("ahex", "ASCII_Hex_Digit"),
("alpha", "Alphabetic"),
("alphabetic", "Alphabetic"),
("asciihexdigit", "ASCII_Hex_Digit"),
("bc", "Bidi_Class"),
("bidic", "Bidi_Control"),
("bidiclass", "Bidi_Class"),
("bidicontrol", "Bidi_Control"),
("bidim", "Bidi_Mirrored"),
("bidimirrored", "Bidi_Mirrored"),
("bidimirroringglyph", "Bidi_Mirroring_Glyph"),
("bidipairedbracket", "Bidi_Paired_Bracket"),
("bidipairedbrackettype", "Bidi_Paired_Bracket_Type"),
("blk", "Block"),
("block", "Block"),
("bmg", "Bidi_Mirroring_Glyph"),
("bpb", "Bidi_Paired_Bracket"),
("bpt", "Bidi_Paired_Bracket_Type"),
("canonicalcombiningclass", "Canonical_Combining_Class"),
("cased", "Cased"),
("casefolding", "Case_Folding"),
("caseignorable", "Case_Ignorable"),
("ccc", "Canonical_Combining_Class"),
("ce", "Composition_Exclusion"),
("cf", "Case_Folding"),
("changeswhencasefolded", "Changes_When_Casefolded"),
("changeswhencasemapped", "Changes_When_Casemapped"),
("changeswhenlowercased", "Changes_When_Lowercased"),
("changeswhennfkccasefolded", "Changes_When_NFKC_Casefolded"),
("changeswhentitlecased", "Changes_When_Titlecased"),
("changeswhenuppercased", "Changes_When_Uppercased"),
("ci", "Case_Ignorable"),
("cjkaccountingnumeric", "kAccountingNumeric"),
("cjkcompatibilityvariant", "kCompatibilityVariant"),
("cjkiicore", "kIICore"),
("cjkirggsource", "kIRG_GSource"),
("cjkirghsource", "kIRG_HSource"),
("cjkirgjsource", "kIRG_JSource"),
("cjkirgkpsource", "kIRG_KPSource"),
("cjkirgksource", "kIRG_KSource"),
("cjkirgmsource", "kIRG_MSource"),
("cjkirgssource", "kIRG_SSource"),
("cjkirgtsource", "kIRG_TSource"),
("cjkirguksource", "kIRG_UKSource"),
("cjkirgusource", "kIRG_USource"),
("cjkirgvsource", "kIRG_VSource"),
("cjkothernumeric", "kOtherNumeric"),
("cjkprimarynumeric", "kPrimaryNumeric"),
("cjkrsunicode", "kRSUnicode"),
("compex", "Full_Composition_Exclusion"),
("compositionexclusion", "Composition_Exclusion"),
("cwcf", "Changes_When_Casefolded"),
("cwcm", "Changes_When_Casemapped"),
("cwkcf", "Changes_When_NFKC_Casefolded"),
("cwl", "Changes_When_Lowercased"),
("cwt", "Changes_When_Titlecased"),
("cwu", "Changes_When_Uppercased"),
("dash", "Dash"),
("decompositionmapping", "Decomposition_Mapping"),
("decompositiontype", "Decomposition_Type"),
("defaultignorablecodepoint", "Default_Ignorable_Code_Point"),
("dep", "Deprecated"),
("deprecated", "Deprecated"),
("di", "Default_Ignorable_Code_Point"),
("dia", "Diacritic"),
("diacritic", "Diacritic"),
("dm", "Decomposition_Mapping"),
("dt", "Decomposition_Type"),
("ea", "East_Asian_Width"),
("eastasianwidth", "East_Asian_Width"),
("ebase", "Emoji_Modifier_Base"),
("ecomp", "Emoji_Component"),
("emod", "Emoji_Modifier"),
("emoji", "Emoji"),
("emojicomponent", "Emoji_Component"),
("emojimodifier", "Emoji_Modifier"),
("emojimodifierbase", "Emoji_Modifier_Base"),
("emojipresentation", "Emoji_Presentation"),
("epres", "Emoji_Presentation"),
("equideo", "Equivalent_Unified_Ideograph"),
("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"),
("expandsonnfc", "Expands_On_NFC"),
("expandsonnfd", "Expands_On_NFD"),
("expandsonnfkc", "Expands_On_NFKC"),
("expandsonnfkd", "Expands_On_NFKD"),
("ext", "Extender"),
("extendedpictographic", "Extended_Pictographic"),
("extender", "Extender"),
("extpict", "Extended_Pictographic"),
("fcnfkc", "FC_NFKC_Closure"),
("fcnfkcclosure", "FC_NFKC_Closure"),
("fullcompositionexclusion", "Full_Composition_Exclusion"),
("gc", "General_Category"),
("gcb", "Grapheme_Cluster_Break"),
("generalcategory", "General_Category"),
("graphemebase", "Grapheme_Base"),
("graphemeclusterbreak", "Grapheme_Cluster_Break"),
("graphemeextend", "Grapheme_Extend"),
("graphemelink", "Grapheme_Link"),
("grbase", "Grapheme_Base"),
("grext", "Grapheme_Extend"),
("grlink", "Grapheme_Link"),
("hangulsyllabletype", "Hangul_Syllable_Type"),
("hex", "Hex_Digit"),
("hexdigit", "Hex_Digit"),
("hst", "Hangul_Syllable_Type"),
("hyphen", "Hyphen"),
("idc", "ID_Continue"),
("idcompatmathcontinue", "ID_Compat_Math_Continue"),
("idcompatmathstart", "ID_Compat_Math_Start"),
("idcontinue", "ID_Continue"),
("ideo", "Ideographic"),
("ideographic", "Ideographic"),
("ids", "ID_Start"),
("idsb", "IDS_Binary_Operator"),
("idsbinaryoperator", "IDS_Binary_Operator"),
("idst", "IDS_Trinary_Operator"),
("idstart", "ID_Start"),
("idstrinaryoperator", "IDS_Trinary_Operator"),
("idsu", "IDS_Unary_Operator"),
("idsunaryoperator", "IDS_Unary_Operator"),
("incb", "Indic_Conjunct_Break"),
("indicconjunctbreak", "Indic_Conjunct_Break"),
("indicpositionalcategory", "Indic_Positional_Category"),
("indicsyllabiccategory", "Indic_Syllabic_Category"),
("inpc", "Indic_Positional_Category"),
("insc", "Indic_Syllabic_Category"),
("isc", "ISO_Comment"),
("jamoshortname", "Jamo_Short_Name"),
("jg", "Joining_Group"),
("joinc", "Join_Control"),
("joincontrol", "Join_Control"),
("joininggroup", "Joining_Group"),
("joiningtype", "Joining_Type"),
("jsn", "Jamo_Short_Name"),
("jt", "Joining_Type"),
("kaccountingnumeric", "kAccountingNumeric"),
("kcompatibilityvariant", "kCompatibilityVariant"),
("kehcat", "kEH_Cat"),
("kehdesc", "kEH_Desc"),
("kehhg", "kEH_HG"),
("kehifao", "kEH_IFAO"),
("kehjsesh", "kEH_JSesh"),
("kehnomirror", "kEH_NoMirror"),
("kehnorotate", "kEH_NoRotate"),
("kiicore", "kIICore"),
("kirggsource", "kIRG_GSource"),
("kirghsource", "kIRG_HSource"),
("kirgjsource", "kIRG_JSource"),
("kirgkpsource", "kIRG_KPSource"),
("kirgksource", "kIRG_KSource"),
("kirgmsource", "kIRG_MSource"),
("kirgssource", "kIRG_SSource"),
("kirgtsource", "kIRG_TSource"),
("kirguksource", "kIRG_UKSource"),
("kirgusource", "kIRG_USource"),
("kirgvsource", "kIRG_VSource"),
("kothernumeric", "kOtherNumeric"),
("kprimarynumeric", "kPrimaryNumeric"),
("krsunicode", "kRSUnicode"),
("lb", "Line_Break"),
("lc", "Lowercase_Mapping"),
("linebreak", "Line_Break"),
("loe", "Logical_Order_Exception"),
("logicalorderexception", "Logical_Order_Exception"),
("lower", "Lowercase"),
("lowercase", "Lowercase"),
("lowercasemapping", "Lowercase_Mapping"),
("math", "Math"),
("mcm", "Modifier_Combining_Mark"),
("modifiercombiningmark", "Modifier_Combining_Mark"),
("na", "Name"),
("na1", "Unicode_1_Name"),
("name", "Name"),
("namealias", "Name_Alias"),
("nchar", "Noncharacter_Code_Point"),
("nfcqc", "NFC_Quick_Check"),
("nfcquickcheck", "NFC_Quick_Check"),
("nfdqc", "NFD_Quick_Check"),
("nfdquickcheck", "NFD_Quick_Check"),
("nfkccasefold", "NFKC_Casefold"),
("nfkccf", "NFKC_Casefold"),
("nfkcqc", "NFKC_Quick_Check"),
("nfkcquickcheck", "NFKC_Quick_Check"),
("nfkcscf", "NFKC_Simple_Casefold"),
("nfkcsimplecasefold", "NFKC_Simple_Casefold"),
("nfkdqc", "NFKD_Quick_Check"),
("nfkdquickcheck", "NFKD_Quick_Check"),
("noncharactercodepoint", "Noncharacter_Code_Point"),
("nt", "Numeric_Type"),
("numerictype", "Numeric_Type"),
("numericvalue", "Numeric_Value"),
("nv", "Numeric_Value"),
("oalpha", "Other_Alphabetic"),
("ocomment", "ISO_Comment"),
("odi", "Other_Default_Ignorable_Code_Point"),
("ogrext", "Other_Grapheme_Extend"),
("oidc", "Other_ID_Continue"),
("oids", "Other_ID_Start"),
("olower", "Other_Lowercase"),
("omath", "Other_Math"),
("otheralphabetic", "Other_Alphabetic"),
("otherdefaultignorablecodepoint", "Other_Default_Ignorable_Code_Point"),
("othergraphemeextend", "Other_Grapheme_Extend"),
("otheridcontinue", "Other_ID_Continue"),
("otheridstart", "Other_ID_Start"),
("otherlowercase", "Other_Lowercase"),
("othermath", "Other_Math"),
("otheruppercase", "Other_Uppercase"),
("oupper", "Other_Uppercase"),
("patsyn", "Pattern_Syntax"),
("patternsyntax", "Pattern_Syntax"),
("patternwhitespace", "Pattern_White_Space"),
("patws", "Pattern_White_Space"),
("pcm", "Prepended_Concatenation_Mark"),
("prependedconcatenationmark", "Prepended_Concatenation_Mark"),
("qmark", "Quotation_Mark"),
("quotationmark", "Quotation_Mark"),
("radical", "Radical"),
("regionalindicator", "Regional_Indicator"),
("ri", "Regional_Indicator"),
("sb", "Sentence_Break"),
("sc", "Script"),
("scf", "Simple_Case_Folding"),
("script", "Script"),
("scriptextensions", "Script_Extensions"),
("scx", "Script_Extensions"),
("sd", "Soft_Dotted"),
("sentencebreak", "Sentence_Break"),
("sentenceterminal", "Sentence_Terminal"),
("sfc", "Simple_Case_Folding"),
("simplecasefolding", "Simple_Case_Folding"),
("simplelowercasemapping", "Simple_Lowercase_Mapping"),
("simpletitlecasemapping", "Simple_Titlecase_Mapping"),
("simpleuppercasemapping", "Simple_Uppercase_Mapping"),
("slc", "Simple_Lowercase_Mapping"),
("softdotted", "Soft_Dotted"),
("space", "White_Space"),
("stc", "Simple_Titlecase_Mapping"),
("sterm", "Sentence_Terminal"),
("suc", "Simple_Uppercase_Mapping"),
("tc", "Titlecase_Mapping"),
("term", "Terminal_Punctuation"),
("terminalpunctuation", "Terminal_Punctuation"),
("titlecasemapping", "Titlecase_Mapping"),
("uc", "Uppercase_Mapping"),
("uideo", "Unified_Ideograph"),
("unicode1name", "Unicode_1_Name"),
("unicoderadicalstroke", "kRSUnicode"),
("unifiedideograph", "Unified_Ideograph"),
("upper", "Uppercase"),
("uppercase", "Uppercase"),
("uppercasemapping", "Uppercase_Mapping"),
("urs", "kRSUnicode"),
("variationselector", "Variation_Selector"),
("verticalorientation", "Vertical_Orientation"),
("vo", "Vertical_Orientation"),
("vs", "Variation_Selector"),
("wb", "Word_Break"),
("whitespace", "White_Space"),
("wordbreak", "Word_Break"),
("wspace", "White_Space"),
("xidc", "XID_Continue"),
("xidcontinue", "XID_Continue"),
("xids", "XID_Start"),
("xidstart", "XID_Start"),
("xonfc", "Expands_On_NFC"),
("xonfd", "Expands_On_NFD"),
("xonfkc", "Expands_On_NFKC"),
("xonfkd", "Expands_On_NFKD"),
];

View File

@@ -0,0 +1,956 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-values ucd-16.0.0 --include gc,script,scx,age,gcb,wb,sb
//
// Unicode version: 16.0.0.
//
// ucd-generate 0.3.1 is available on crates.io.
pub const PROPERTY_VALUES: &'static [(
&'static str,
&'static [(&'static str, &'static str)],
)] = &[
(
"Age",
&[
("1.1", "V1_1"),
("10.0", "V10_0"),
("11.0", "V11_0"),
("12.0", "V12_0"),
("12.1", "V12_1"),
("13.0", "V13_0"),
("14.0", "V14_0"),
("15.0", "V15_0"),
("15.1", "V15_1"),
("16.0", "V16_0"),
("2.0", "V2_0"),
("2.1", "V2_1"),
("3.0", "V3_0"),
("3.1", "V3_1"),
("3.2", "V3_2"),
("4.0", "V4_0"),
("4.1", "V4_1"),
("5.0", "V5_0"),
("5.1", "V5_1"),
("5.2", "V5_2"),
("6.0", "V6_0"),
("6.1", "V6_1"),
("6.2", "V6_2"),
("6.3", "V6_3"),
("7.0", "V7_0"),
("8.0", "V8_0"),
("9.0", "V9_0"),
("na", "Unassigned"),
("unassigned", "Unassigned"),
("v100", "V10_0"),
("v11", "V1_1"),
("v110", "V11_0"),
("v120", "V12_0"),
("v121", "V12_1"),
("v130", "V13_0"),
("v140", "V14_0"),
("v150", "V15_0"),
("v151", "V15_1"),
("v160", "V16_0"),
("v20", "V2_0"),
("v21", "V2_1"),
("v30", "V3_0"),
("v31", "V3_1"),
("v32", "V3_2"),
("v40", "V4_0"),
("v41", "V4_1"),
("v50", "V5_0"),
("v51", "V5_1"),
("v52", "V5_2"),
("v60", "V6_0"),
("v61", "V6_1"),
("v62", "V6_2"),
("v63", "V6_3"),
("v70", "V7_0"),
("v80", "V8_0"),
("v90", "V9_0"),
],
),
(
"General_Category",
&[
("c", "Other"),
("casedletter", "Cased_Letter"),
("cc", "Control"),
("cf", "Format"),
("closepunctuation", "Close_Punctuation"),
("cn", "Unassigned"),
("cntrl", "Control"),
("co", "Private_Use"),
("combiningmark", "Mark"),
("connectorpunctuation", "Connector_Punctuation"),
("control", "Control"),
("cs", "Surrogate"),
("currencysymbol", "Currency_Symbol"),
("dashpunctuation", "Dash_Punctuation"),
("decimalnumber", "Decimal_Number"),
("digit", "Decimal_Number"),
("enclosingmark", "Enclosing_Mark"),
("finalpunctuation", "Final_Punctuation"),
("format", "Format"),
("initialpunctuation", "Initial_Punctuation"),
("l", "Letter"),
("lc", "Cased_Letter"),
("letter", "Letter"),
("letternumber", "Letter_Number"),
("lineseparator", "Line_Separator"),
("ll", "Lowercase_Letter"),
("lm", "Modifier_Letter"),
("lo", "Other_Letter"),
("lowercaseletter", "Lowercase_Letter"),
("lt", "Titlecase_Letter"),
("lu", "Uppercase_Letter"),
("m", "Mark"),
("mark", "Mark"),
("mathsymbol", "Math_Symbol"),
("mc", "Spacing_Mark"),
("me", "Enclosing_Mark"),
("mn", "Nonspacing_Mark"),
("modifierletter", "Modifier_Letter"),
("modifiersymbol", "Modifier_Symbol"),
("n", "Number"),
("nd", "Decimal_Number"),
("nl", "Letter_Number"),
("no", "Other_Number"),
("nonspacingmark", "Nonspacing_Mark"),
("number", "Number"),
("openpunctuation", "Open_Punctuation"),
("other", "Other"),
("otherletter", "Other_Letter"),
("othernumber", "Other_Number"),
("otherpunctuation", "Other_Punctuation"),
("othersymbol", "Other_Symbol"),
("p", "Punctuation"),
("paragraphseparator", "Paragraph_Separator"),
("pc", "Connector_Punctuation"),
("pd", "Dash_Punctuation"),
("pe", "Close_Punctuation"),
("pf", "Final_Punctuation"),
("pi", "Initial_Punctuation"),
("po", "Other_Punctuation"),
("privateuse", "Private_Use"),
("ps", "Open_Punctuation"),
("punct", "Punctuation"),
("punctuation", "Punctuation"),
("s", "Symbol"),
("sc", "Currency_Symbol"),
("separator", "Separator"),
("sk", "Modifier_Symbol"),
("sm", "Math_Symbol"),
("so", "Other_Symbol"),
("spaceseparator", "Space_Separator"),
("spacingmark", "Spacing_Mark"),
("surrogate", "Surrogate"),
("symbol", "Symbol"),
("titlecaseletter", "Titlecase_Letter"),
("unassigned", "Unassigned"),
("uppercaseletter", "Uppercase_Letter"),
("z", "Separator"),
("zl", "Line_Separator"),
("zp", "Paragraph_Separator"),
("zs", "Space_Separator"),
],
),
(
"Grapheme_Cluster_Break",
&[
("cn", "Control"),
("control", "Control"),
("cr", "CR"),
("eb", "E_Base"),
("ebase", "E_Base"),
("ebasegaz", "E_Base_GAZ"),
("ebg", "E_Base_GAZ"),
("em", "E_Modifier"),
("emodifier", "E_Modifier"),
("ex", "Extend"),
("extend", "Extend"),
("gaz", "Glue_After_Zwj"),
("glueafterzwj", "Glue_After_Zwj"),
("l", "L"),
("lf", "LF"),
("lv", "LV"),
("lvt", "LVT"),
("other", "Other"),
("pp", "Prepend"),
("prepend", "Prepend"),
("regionalindicator", "Regional_Indicator"),
("ri", "Regional_Indicator"),
("sm", "SpacingMark"),
("spacingmark", "SpacingMark"),
("t", "T"),
("v", "V"),
("xx", "Other"),
("zwj", "ZWJ"),
],
),
(
"Script",
&[
("adlam", "Adlam"),
("adlm", "Adlam"),
("aghb", "Caucasian_Albanian"),
("ahom", "Ahom"),
("anatolianhieroglyphs", "Anatolian_Hieroglyphs"),
("arab", "Arabic"),
("arabic", "Arabic"),
("armenian", "Armenian"),
("armi", "Imperial_Aramaic"),
("armn", "Armenian"),
("avestan", "Avestan"),
("avst", "Avestan"),
("bali", "Balinese"),
("balinese", "Balinese"),
("bamu", "Bamum"),
("bamum", "Bamum"),
("bass", "Bassa_Vah"),
("bassavah", "Bassa_Vah"),
("batak", "Batak"),
("batk", "Batak"),
("beng", "Bengali"),
("bengali", "Bengali"),
("bhaiksuki", "Bhaiksuki"),
("bhks", "Bhaiksuki"),
("bopo", "Bopomofo"),
("bopomofo", "Bopomofo"),
("brah", "Brahmi"),
("brahmi", "Brahmi"),
("brai", "Braille"),
("braille", "Braille"),
("bugi", "Buginese"),
("buginese", "Buginese"),
("buhd", "Buhid"),
("buhid", "Buhid"),
("cakm", "Chakma"),
("canadianaboriginal", "Canadian_Aboriginal"),
("cans", "Canadian_Aboriginal"),
("cari", "Carian"),
("carian", "Carian"),
("caucasianalbanian", "Caucasian_Albanian"),
("chakma", "Chakma"),
("cham", "Cham"),
("cher", "Cherokee"),
("cherokee", "Cherokee"),
("chorasmian", "Chorasmian"),
("chrs", "Chorasmian"),
("common", "Common"),
("copt", "Coptic"),
("coptic", "Coptic"),
("cpmn", "Cypro_Minoan"),
("cprt", "Cypriot"),
("cuneiform", "Cuneiform"),
("cypriot", "Cypriot"),
("cyprominoan", "Cypro_Minoan"),
("cyrillic", "Cyrillic"),
("cyrl", "Cyrillic"),
("deseret", "Deseret"),
("deva", "Devanagari"),
("devanagari", "Devanagari"),
("diak", "Dives_Akuru"),
("divesakuru", "Dives_Akuru"),
("dogr", "Dogra"),
("dogra", "Dogra"),
("dsrt", "Deseret"),
("dupl", "Duployan"),
("duployan", "Duployan"),
("egyp", "Egyptian_Hieroglyphs"),
("egyptianhieroglyphs", "Egyptian_Hieroglyphs"),
("elba", "Elbasan"),
("elbasan", "Elbasan"),
("elym", "Elymaic"),
("elymaic", "Elymaic"),
("ethi", "Ethiopic"),
("ethiopic", "Ethiopic"),
("gara", "Garay"),
("garay", "Garay"),
("geor", "Georgian"),
("georgian", "Georgian"),
("glag", "Glagolitic"),
("glagolitic", "Glagolitic"),
("gong", "Gunjala_Gondi"),
("gonm", "Masaram_Gondi"),
("goth", "Gothic"),
("gothic", "Gothic"),
("gran", "Grantha"),
("grantha", "Grantha"),
("greek", "Greek"),
("grek", "Greek"),
("gujarati", "Gujarati"),
("gujr", "Gujarati"),
("gukh", "Gurung_Khema"),
("gunjalagondi", "Gunjala_Gondi"),
("gurmukhi", "Gurmukhi"),
("guru", "Gurmukhi"),
("gurungkhema", "Gurung_Khema"),
("han", "Han"),
("hang", "Hangul"),
("hangul", "Hangul"),
("hani", "Han"),
("hanifirohingya", "Hanifi_Rohingya"),
("hano", "Hanunoo"),
("hanunoo", "Hanunoo"),
("hatr", "Hatran"),
("hatran", "Hatran"),
("hebr", "Hebrew"),
("hebrew", "Hebrew"),
("hira", "Hiragana"),
("hiragana", "Hiragana"),
("hluw", "Anatolian_Hieroglyphs"),
("hmng", "Pahawh_Hmong"),
("hmnp", "Nyiakeng_Puachue_Hmong"),
("hrkt", "Katakana_Or_Hiragana"),
("hung", "Old_Hungarian"),
("imperialaramaic", "Imperial_Aramaic"),
("inherited", "Inherited"),
("inscriptionalpahlavi", "Inscriptional_Pahlavi"),
("inscriptionalparthian", "Inscriptional_Parthian"),
("ital", "Old_Italic"),
("java", "Javanese"),
("javanese", "Javanese"),
("kaithi", "Kaithi"),
("kali", "Kayah_Li"),
("kana", "Katakana"),
("kannada", "Kannada"),
("katakana", "Katakana"),
("katakanaorhiragana", "Katakana_Or_Hiragana"),
("kawi", "Kawi"),
("kayahli", "Kayah_Li"),
("khar", "Kharoshthi"),
("kharoshthi", "Kharoshthi"),
("khitansmallscript", "Khitan_Small_Script"),
("khmer", "Khmer"),
("khmr", "Khmer"),
("khoj", "Khojki"),
("khojki", "Khojki"),
("khudawadi", "Khudawadi"),
("kiratrai", "Kirat_Rai"),
("kits", "Khitan_Small_Script"),
("knda", "Kannada"),
("krai", "Kirat_Rai"),
("kthi", "Kaithi"),
("lana", "Tai_Tham"),
("lao", "Lao"),
("laoo", "Lao"),
("latin", "Latin"),
("latn", "Latin"),
("lepc", "Lepcha"),
("lepcha", "Lepcha"),
("limb", "Limbu"),
("limbu", "Limbu"),
("lina", "Linear_A"),
("linb", "Linear_B"),
("lineara", "Linear_A"),
("linearb", "Linear_B"),
("lisu", "Lisu"),
("lyci", "Lycian"),
("lycian", "Lycian"),
("lydi", "Lydian"),
("lydian", "Lydian"),
("mahajani", "Mahajani"),
("mahj", "Mahajani"),
("maka", "Makasar"),
("makasar", "Makasar"),
("malayalam", "Malayalam"),
("mand", "Mandaic"),
("mandaic", "Mandaic"),
("mani", "Manichaean"),
("manichaean", "Manichaean"),
("marc", "Marchen"),
("marchen", "Marchen"),
("masaramgondi", "Masaram_Gondi"),
("medefaidrin", "Medefaidrin"),
("medf", "Medefaidrin"),
("meeteimayek", "Meetei_Mayek"),
("mend", "Mende_Kikakui"),
("mendekikakui", "Mende_Kikakui"),
("merc", "Meroitic_Cursive"),
("mero", "Meroitic_Hieroglyphs"),
("meroiticcursive", "Meroitic_Cursive"),
("meroitichieroglyphs", "Meroitic_Hieroglyphs"),
("miao", "Miao"),
("mlym", "Malayalam"),
("modi", "Modi"),
("mong", "Mongolian"),
("mongolian", "Mongolian"),
("mro", "Mro"),
("mroo", "Mro"),
("mtei", "Meetei_Mayek"),
("mult", "Multani"),
("multani", "Multani"),
("myanmar", "Myanmar"),
("mymr", "Myanmar"),
("nabataean", "Nabataean"),
("nagm", "Nag_Mundari"),
("nagmundari", "Nag_Mundari"),
("nand", "Nandinagari"),
("nandinagari", "Nandinagari"),
("narb", "Old_North_Arabian"),
("nbat", "Nabataean"),
("newa", "Newa"),
("newtailue", "New_Tai_Lue"),
("nko", "Nko"),
("nkoo", "Nko"),
("nshu", "Nushu"),
("nushu", "Nushu"),
("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"),
("ogam", "Ogham"),
("ogham", "Ogham"),
("olchiki", "Ol_Chiki"),
("olck", "Ol_Chiki"),
("oldhungarian", "Old_Hungarian"),
("olditalic", "Old_Italic"),
("oldnortharabian", "Old_North_Arabian"),
("oldpermic", "Old_Permic"),
("oldpersian", "Old_Persian"),
("oldsogdian", "Old_Sogdian"),
("oldsoutharabian", "Old_South_Arabian"),
("oldturkic", "Old_Turkic"),
("olduyghur", "Old_Uyghur"),
("olonal", "Ol_Onal"),
("onao", "Ol_Onal"),
("oriya", "Oriya"),
("orkh", "Old_Turkic"),
("orya", "Oriya"),
("osage", "Osage"),
("osge", "Osage"),
("osma", "Osmanya"),
("osmanya", "Osmanya"),
("ougr", "Old_Uyghur"),
("pahawhhmong", "Pahawh_Hmong"),
("palm", "Palmyrene"),
("palmyrene", "Palmyrene"),
("pauc", "Pau_Cin_Hau"),
("paucinhau", "Pau_Cin_Hau"),
("perm", "Old_Permic"),
("phag", "Phags_Pa"),
("phagspa", "Phags_Pa"),
("phli", "Inscriptional_Pahlavi"),
("phlp", "Psalter_Pahlavi"),
("phnx", "Phoenician"),
("phoenician", "Phoenician"),
("plrd", "Miao"),
("prti", "Inscriptional_Parthian"),
("psalterpahlavi", "Psalter_Pahlavi"),
("qaac", "Coptic"),
("qaai", "Inherited"),
("rejang", "Rejang"),
("rjng", "Rejang"),
("rohg", "Hanifi_Rohingya"),
("runic", "Runic"),
("runr", "Runic"),
("samaritan", "Samaritan"),
("samr", "Samaritan"),
("sarb", "Old_South_Arabian"),
("saur", "Saurashtra"),
("saurashtra", "Saurashtra"),
("sgnw", "SignWriting"),
("sharada", "Sharada"),
("shavian", "Shavian"),
("shaw", "Shavian"),
("shrd", "Sharada"),
("sidd", "Siddham"),
("siddham", "Siddham"),
("signwriting", "SignWriting"),
("sind", "Khudawadi"),
("sinh", "Sinhala"),
("sinhala", "Sinhala"),
("sogd", "Sogdian"),
("sogdian", "Sogdian"),
("sogo", "Old_Sogdian"),
("sora", "Sora_Sompeng"),
("sorasompeng", "Sora_Sompeng"),
("soyo", "Soyombo"),
("soyombo", "Soyombo"),
("sund", "Sundanese"),
("sundanese", "Sundanese"),
("sunu", "Sunuwar"),
("sunuwar", "Sunuwar"),
("sylo", "Syloti_Nagri"),
("sylotinagri", "Syloti_Nagri"),
("syrc", "Syriac"),
("syriac", "Syriac"),
("tagalog", "Tagalog"),
("tagb", "Tagbanwa"),
("tagbanwa", "Tagbanwa"),
("taile", "Tai_Le"),
("taitham", "Tai_Tham"),
("taiviet", "Tai_Viet"),
("takr", "Takri"),
("takri", "Takri"),
("tale", "Tai_Le"),
("talu", "New_Tai_Lue"),
("tamil", "Tamil"),
("taml", "Tamil"),
("tang", "Tangut"),
("tangsa", "Tangsa"),
("tangut", "Tangut"),
("tavt", "Tai_Viet"),
("telu", "Telugu"),
("telugu", "Telugu"),
("tfng", "Tifinagh"),
("tglg", "Tagalog"),
("thaa", "Thaana"),
("thaana", "Thaana"),
("thai", "Thai"),
("tibetan", "Tibetan"),
("tibt", "Tibetan"),
("tifinagh", "Tifinagh"),
("tirh", "Tirhuta"),
("tirhuta", "Tirhuta"),
("tnsa", "Tangsa"),
("todhri", "Todhri"),
("todr", "Todhri"),
("toto", "Toto"),
("tulutigalari", "Tulu_Tigalari"),
("tutg", "Tulu_Tigalari"),
("ugar", "Ugaritic"),
("ugaritic", "Ugaritic"),
("unknown", "Unknown"),
("vai", "Vai"),
("vaii", "Vai"),
("vith", "Vithkuqi"),
("vithkuqi", "Vithkuqi"),
("wancho", "Wancho"),
("wara", "Warang_Citi"),
("warangciti", "Warang_Citi"),
("wcho", "Wancho"),
("xpeo", "Old_Persian"),
("xsux", "Cuneiform"),
("yezi", "Yezidi"),
("yezidi", "Yezidi"),
("yi", "Yi"),
("yiii", "Yi"),
("zanabazarsquare", "Zanabazar_Square"),
("zanb", "Zanabazar_Square"),
("zinh", "Inherited"),
("zyyy", "Common"),
("zzzz", "Unknown"),
],
),
(
"Script_Extensions",
&[
("adlam", "Adlam"),
("adlm", "Adlam"),
("aghb", "Caucasian_Albanian"),
("ahom", "Ahom"),
("anatolianhieroglyphs", "Anatolian_Hieroglyphs"),
("arab", "Arabic"),
("arabic", "Arabic"),
("armenian", "Armenian"),
("armi", "Imperial_Aramaic"),
("armn", "Armenian"),
("avestan", "Avestan"),
("avst", "Avestan"),
("bali", "Balinese"),
("balinese", "Balinese"),
("bamu", "Bamum"),
("bamum", "Bamum"),
("bass", "Bassa_Vah"),
("bassavah", "Bassa_Vah"),
("batak", "Batak"),
("batk", "Batak"),
("beng", "Bengali"),
("bengali", "Bengali"),
("bhaiksuki", "Bhaiksuki"),
("bhks", "Bhaiksuki"),
("bopo", "Bopomofo"),
("bopomofo", "Bopomofo"),
("brah", "Brahmi"),
("brahmi", "Brahmi"),
("brai", "Braille"),
("braille", "Braille"),
("bugi", "Buginese"),
("buginese", "Buginese"),
("buhd", "Buhid"),
("buhid", "Buhid"),
("cakm", "Chakma"),
("canadianaboriginal", "Canadian_Aboriginal"),
("cans", "Canadian_Aboriginal"),
("cari", "Carian"),
("carian", "Carian"),
("caucasianalbanian", "Caucasian_Albanian"),
("chakma", "Chakma"),
("cham", "Cham"),
("cher", "Cherokee"),
("cherokee", "Cherokee"),
("chorasmian", "Chorasmian"),
("chrs", "Chorasmian"),
("common", "Common"),
("copt", "Coptic"),
("coptic", "Coptic"),
("cpmn", "Cypro_Minoan"),
("cprt", "Cypriot"),
("cuneiform", "Cuneiform"),
("cypriot", "Cypriot"),
("cyprominoan", "Cypro_Minoan"),
("cyrillic", "Cyrillic"),
("cyrl", "Cyrillic"),
("deseret", "Deseret"),
("deva", "Devanagari"),
("devanagari", "Devanagari"),
("diak", "Dives_Akuru"),
("divesakuru", "Dives_Akuru"),
("dogr", "Dogra"),
("dogra", "Dogra"),
("dsrt", "Deseret"),
("dupl", "Duployan"),
("duployan", "Duployan"),
("egyp", "Egyptian_Hieroglyphs"),
("egyptianhieroglyphs", "Egyptian_Hieroglyphs"),
("elba", "Elbasan"),
("elbasan", "Elbasan"),
("elym", "Elymaic"),
("elymaic", "Elymaic"),
("ethi", "Ethiopic"),
("ethiopic", "Ethiopic"),
("gara", "Garay"),
("garay", "Garay"),
("geor", "Georgian"),
("georgian", "Georgian"),
("glag", "Glagolitic"),
("glagolitic", "Glagolitic"),
("gong", "Gunjala_Gondi"),
("gonm", "Masaram_Gondi"),
("goth", "Gothic"),
("gothic", "Gothic"),
("gran", "Grantha"),
("grantha", "Grantha"),
("greek", "Greek"),
("grek", "Greek"),
("gujarati", "Gujarati"),
("gujr", "Gujarati"),
("gukh", "Gurung_Khema"),
("gunjalagondi", "Gunjala_Gondi"),
("gurmukhi", "Gurmukhi"),
("guru", "Gurmukhi"),
("gurungkhema", "Gurung_Khema"),
("han", "Han"),
("hang", "Hangul"),
("hangul", "Hangul"),
("hani", "Han"),
("hanifirohingya", "Hanifi_Rohingya"),
("hano", "Hanunoo"),
("hanunoo", "Hanunoo"),
("hatr", "Hatran"),
("hatran", "Hatran"),
("hebr", "Hebrew"),
("hebrew", "Hebrew"),
("hira", "Hiragana"),
("hiragana", "Hiragana"),
("hluw", "Anatolian_Hieroglyphs"),
("hmng", "Pahawh_Hmong"),
("hmnp", "Nyiakeng_Puachue_Hmong"),
("hrkt", "Katakana_Or_Hiragana"),
("hung", "Old_Hungarian"),
("imperialaramaic", "Imperial_Aramaic"),
("inherited", "Inherited"),
("inscriptionalpahlavi", "Inscriptional_Pahlavi"),
("inscriptionalparthian", "Inscriptional_Parthian"),
("ital", "Old_Italic"),
("java", "Javanese"),
("javanese", "Javanese"),
("kaithi", "Kaithi"),
("kali", "Kayah_Li"),
("kana", "Katakana"),
("kannada", "Kannada"),
("katakana", "Katakana"),
("katakanaorhiragana", "Katakana_Or_Hiragana"),
("kawi", "Kawi"),
("kayahli", "Kayah_Li"),
("khar", "Kharoshthi"),
("kharoshthi", "Kharoshthi"),
("khitansmallscript", "Khitan_Small_Script"),
("khmer", "Khmer"),
("khmr", "Khmer"),
("khoj", "Khojki"),
("khojki", "Khojki"),
("khudawadi", "Khudawadi"),
("kiratrai", "Kirat_Rai"),
("kits", "Khitan_Small_Script"),
("knda", "Kannada"),
("krai", "Kirat_Rai"),
("kthi", "Kaithi"),
("lana", "Tai_Tham"),
("lao", "Lao"),
("laoo", "Lao"),
("latin", "Latin"),
("latn", "Latin"),
("lepc", "Lepcha"),
("lepcha", "Lepcha"),
("limb", "Limbu"),
("limbu", "Limbu"),
("lina", "Linear_A"),
("linb", "Linear_B"),
("lineara", "Linear_A"),
("linearb", "Linear_B"),
("lisu", "Lisu"),
("lyci", "Lycian"),
("lycian", "Lycian"),
("lydi", "Lydian"),
("lydian", "Lydian"),
("mahajani", "Mahajani"),
("mahj", "Mahajani"),
("maka", "Makasar"),
("makasar", "Makasar"),
("malayalam", "Malayalam"),
("mand", "Mandaic"),
("mandaic", "Mandaic"),
("mani", "Manichaean"),
("manichaean", "Manichaean"),
("marc", "Marchen"),
("marchen", "Marchen"),
("masaramgondi", "Masaram_Gondi"),
("medefaidrin", "Medefaidrin"),
("medf", "Medefaidrin"),
("meeteimayek", "Meetei_Mayek"),
("mend", "Mende_Kikakui"),
("mendekikakui", "Mende_Kikakui"),
("merc", "Meroitic_Cursive"),
("mero", "Meroitic_Hieroglyphs"),
("meroiticcursive", "Meroitic_Cursive"),
("meroitichieroglyphs", "Meroitic_Hieroglyphs"),
("miao", "Miao"),
("mlym", "Malayalam"),
("modi", "Modi"),
("mong", "Mongolian"),
("mongolian", "Mongolian"),
("mro", "Mro"),
("mroo", "Mro"),
("mtei", "Meetei_Mayek"),
("mult", "Multani"),
("multani", "Multani"),
("myanmar", "Myanmar"),
("mymr", "Myanmar"),
("nabataean", "Nabataean"),
("nagm", "Nag_Mundari"),
("nagmundari", "Nag_Mundari"),
("nand", "Nandinagari"),
("nandinagari", "Nandinagari"),
("narb", "Old_North_Arabian"),
("nbat", "Nabataean"),
("newa", "Newa"),
("newtailue", "New_Tai_Lue"),
("nko", "Nko"),
("nkoo", "Nko"),
("nshu", "Nushu"),
("nushu", "Nushu"),
("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"),
("ogam", "Ogham"),
("ogham", "Ogham"),
("olchiki", "Ol_Chiki"),
("olck", "Ol_Chiki"),
("oldhungarian", "Old_Hungarian"),
("olditalic", "Old_Italic"),
("oldnortharabian", "Old_North_Arabian"),
("oldpermic", "Old_Permic"),
("oldpersian", "Old_Persian"),
("oldsogdian", "Old_Sogdian"),
("oldsoutharabian", "Old_South_Arabian"),
("oldturkic", "Old_Turkic"),
("olduyghur", "Old_Uyghur"),
("olonal", "Ol_Onal"),
("onao", "Ol_Onal"),
("oriya", "Oriya"),
("orkh", "Old_Turkic"),
("orya", "Oriya"),
("osage", "Osage"),
("osge", "Osage"),
("osma", "Osmanya"),
("osmanya", "Osmanya"),
("ougr", "Old_Uyghur"),
("pahawhhmong", "Pahawh_Hmong"),
("palm", "Palmyrene"),
("palmyrene", "Palmyrene"),
("pauc", "Pau_Cin_Hau"),
("paucinhau", "Pau_Cin_Hau"),
("perm", "Old_Permic"),
("phag", "Phags_Pa"),
("phagspa", "Phags_Pa"),
("phli", "Inscriptional_Pahlavi"),
("phlp", "Psalter_Pahlavi"),
("phnx", "Phoenician"),
("phoenician", "Phoenician"),
("plrd", "Miao"),
("prti", "Inscriptional_Parthian"),
("psalterpahlavi", "Psalter_Pahlavi"),
("qaac", "Coptic"),
("qaai", "Inherited"),
("rejang", "Rejang"),
("rjng", "Rejang"),
("rohg", "Hanifi_Rohingya"),
("runic", "Runic"),
("runr", "Runic"),
("samaritan", "Samaritan"),
("samr", "Samaritan"),
("sarb", "Old_South_Arabian"),
("saur", "Saurashtra"),
("saurashtra", "Saurashtra"),
("sgnw", "SignWriting"),
("sharada", "Sharada"),
("shavian", "Shavian"),
("shaw", "Shavian"),
("shrd", "Sharada"),
("sidd", "Siddham"),
("siddham", "Siddham"),
("signwriting", "SignWriting"),
("sind", "Khudawadi"),
("sinh", "Sinhala"),
("sinhala", "Sinhala"),
("sogd", "Sogdian"),
("sogdian", "Sogdian"),
("sogo", "Old_Sogdian"),
("sora", "Sora_Sompeng"),
("sorasompeng", "Sora_Sompeng"),
("soyo", "Soyombo"),
("soyombo", "Soyombo"),
("sund", "Sundanese"),
("sundanese", "Sundanese"),
("sunu", "Sunuwar"),
("sunuwar", "Sunuwar"),
("sylo", "Syloti_Nagri"),
("sylotinagri", "Syloti_Nagri"),
("syrc", "Syriac"),
("syriac", "Syriac"),
("tagalog", "Tagalog"),
("tagb", "Tagbanwa"),
("tagbanwa", "Tagbanwa"),
("taile", "Tai_Le"),
("taitham", "Tai_Tham"),
("taiviet", "Tai_Viet"),
("takr", "Takri"),
("takri", "Takri"),
("tale", "Tai_Le"),
("talu", "New_Tai_Lue"),
("tamil", "Tamil"),
("taml", "Tamil"),
("tang", "Tangut"),
("tangsa", "Tangsa"),
("tangut", "Tangut"),
("tavt", "Tai_Viet"),
("telu", "Telugu"),
("telugu", "Telugu"),
("tfng", "Tifinagh"),
("tglg", "Tagalog"),
("thaa", "Thaana"),
("thaana", "Thaana"),
("thai", "Thai"),
("tibetan", "Tibetan"),
("tibt", "Tibetan"),
("tifinagh", "Tifinagh"),
("tirh", "Tirhuta"),
("tirhuta", "Tirhuta"),
("tnsa", "Tangsa"),
("todhri", "Todhri"),
("todr", "Todhri"),
("toto", "Toto"),
("tulutigalari", "Tulu_Tigalari"),
("tutg", "Tulu_Tigalari"),
("ugar", "Ugaritic"),
("ugaritic", "Ugaritic"),
("unknown", "Unknown"),
("vai", "Vai"),
("vaii", "Vai"),
("vith", "Vithkuqi"),
("vithkuqi", "Vithkuqi"),
("wancho", "Wancho"),
("wara", "Warang_Citi"),
("warangciti", "Warang_Citi"),
("wcho", "Wancho"),
("xpeo", "Old_Persian"),
("xsux", "Cuneiform"),
("yezi", "Yezidi"),
("yezidi", "Yezidi"),
("yi", "Yi"),
("yiii", "Yi"),
("zanabazarsquare", "Zanabazar_Square"),
("zanb", "Zanabazar_Square"),
("zinh", "Inherited"),
("zyyy", "Common"),
("zzzz", "Unknown"),
],
),
(
"Sentence_Break",
&[
("at", "ATerm"),
("aterm", "ATerm"),
("cl", "Close"),
("close", "Close"),
("cr", "CR"),
("ex", "Extend"),
("extend", "Extend"),
("fo", "Format"),
("format", "Format"),
("le", "OLetter"),
("lf", "LF"),
("lo", "Lower"),
("lower", "Lower"),
("nu", "Numeric"),
("numeric", "Numeric"),
("oletter", "OLetter"),
("other", "Other"),
("sc", "SContinue"),
("scontinue", "SContinue"),
("se", "Sep"),
("sep", "Sep"),
("sp", "Sp"),
("st", "STerm"),
("sterm", "STerm"),
("up", "Upper"),
("upper", "Upper"),
("xx", "Other"),
],
),
(
"Word_Break",
&[
("aletter", "ALetter"),
("cr", "CR"),
("doublequote", "Double_Quote"),
("dq", "Double_Quote"),
("eb", "E_Base"),
("ebase", "E_Base"),
("ebasegaz", "E_Base_GAZ"),
("ebg", "E_Base_GAZ"),
("em", "E_Modifier"),
("emodifier", "E_Modifier"),
("ex", "ExtendNumLet"),
("extend", "Extend"),
("extendnumlet", "ExtendNumLet"),
("fo", "Format"),
("format", "Format"),
("gaz", "Glue_After_Zwj"),
("glueafterzwj", "Glue_After_Zwj"),
("hebrewletter", "Hebrew_Letter"),
("hl", "Hebrew_Letter"),
("ka", "Katakana"),
("katakana", "Katakana"),
("le", "ALetter"),
("lf", "LF"),
("mb", "MidNumLet"),
("midletter", "MidLetter"),
("midnum", "MidNum"),
("midnumlet", "MidNumLet"),
("ml", "MidLetter"),
("mn", "MidNum"),
("newline", "Newline"),
("nl", "Newline"),
("nu", "Numeric"),
("numeric", "Numeric"),
("other", "Other"),
("regionalindicator", "Regional_Indicator"),
("ri", "Regional_Indicator"),
("singlequote", "Single_Quote"),
("sq", "Single_Quote"),
("wsegspace", "WSegSpace"),
("xx", "Other"),
("zwj", "ZWJ"),
],
),
];

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

592
vendor/regex-syntax/src/utf8.rs vendored Normal file
View File

@@ -0,0 +1,592 @@
/*!
Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
This sub-module is useful for constructing byte based automatons that need
to embed UTF-8 decoding. The most common use of this module is in conjunction
with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type.
See the documentation on the `Utf8Sequences` iterator for more details and
an example.
# Wait, what is this?
This is simplest to explain with an example. Let's say you wanted to test
whether a particular byte sequence was a Cyrillic character. One possible
scalar value range is `[0400-04FF]`. The set of allowed bytes for this
range can be expressed as a sequence of byte ranges:
```text
[D0-D3][80-BF]
```
This is simple enough: simply encode the boundaries, `0400` encodes to
`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
However, what if you wanted to add the Cyrillic Supplementary characters to
your range? Your range might then become `[0400-052F]`. The same procedure
as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
this isn't quite correct because this range doesn't capture many characters,
for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
Instead, you need multiple sequences of byte ranges:
```text
[D0-D3][80-BF] # matches codepoints 0400-04FF
[D4][80-AF] # matches codepoints 0500-052F
```
This gets even more complicated if you want bigger ranges, particularly if
they naively contain surrogate codepoints. For example, the sequence of byte
ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
```text
[0-7F]
[C2-DF][80-BF]
[E0][A0-BF][80-BF]
[E1-EC][80-BF][80-BF]
[ED][80-9F][80-BF]
[EE-EF][80-BF][80-BF]
```
Note that the byte ranges above will *not* match any erroneous encoding of
UTF-8, including encodings of surrogate codepoints.
And, of course, for all of Unicode (`[000000-10FFFF]`):
```text
[0-7F]
[C2-DF][80-BF]
[E0][A0-BF][80-BF]
[E1-EC][80-BF][80-BF]
[ED][80-9F][80-BF]
[EE-EF][80-BF][80-BF]
[F0][90-BF][80-BF][80-BF]
[F1-F3][80-BF][80-BF][80-BF]
[F4][80-8F][80-BF][80-BF]
```
This module automates the process of creating these byte ranges from ranges of
Unicode scalar values.
# Lineage
I got the idea and general implementation strategy from Russ Cox in his
[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
I also got the idea from
[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
which uses it for executing automata on their term index.
*/
use core::{char, fmt, iter::FusedIterator, slice};
use alloc::{vec, vec::Vec};
/// The maximum number of bytes in the UTF-8 encoding of any Unicode scalar
/// value. Used to size the per-endpoint encoding buffers in `next`.
const MAX_UTF8_BYTES: usize = 4;
/// Utf8Sequence represents a sequence of byte ranges.
///
/// To match a Utf8Sequence, a candidate byte sequence must match each
/// successive range.
///
/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
///
/// Each variant holds one byte range per byte of the UTF-8 encodings it
/// matches; UTF-8 encodings are between one and four bytes long.
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
pub enum Utf8Sequence {
    /// One byte range.
    One(Utf8Range),
    /// Two successive byte ranges.
    Two([Utf8Range; 2]),
    /// Three successive byte ranges.
    Three([Utf8Range; 3]),
    /// Four successive byte ranges.
    Four([Utf8Range; 4]),
}
impl Utf8Sequence {
/// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
/// range.
///
/// This assumes that `start` and `end` have the same length.
fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
assert_eq!(start.len(), end.len());
match start.len() {
2 => Utf8Sequence::Two([
Utf8Range::new(start[0], end[0]),
Utf8Range::new(start[1], end[1]),
]),
3 => Utf8Sequence::Three([
Utf8Range::new(start[0], end[0]),
Utf8Range::new(start[1], end[1]),
Utf8Range::new(start[2], end[2]),
]),
4 => Utf8Sequence::Four([
Utf8Range::new(start[0], end[0]),
Utf8Range::new(start[1], end[1]),
Utf8Range::new(start[2], end[2]),
Utf8Range::new(start[3], end[3]),
]),
n => unreachable!("invalid encoded length: {}", n),
}
}
/// Returns the underlying sequence of byte ranges as a slice.
pub fn as_slice(&self) -> &[Utf8Range] {
use self::Utf8Sequence::*;
match *self {
One(ref r) => slice::from_ref(r),
Two(ref r) => &r[..],
Three(ref r) => &r[..],
Four(ref r) => &r[..],
}
}
/// Returns the number of byte ranges in this sequence.
///
/// The length is guaranteed to be in the closed interval `[1, 4]`.
pub fn len(&self) -> usize {
self.as_slice().len()
}
/// Reverses the ranges in this sequence.
///
/// For example, if this corresponds to the following sequence:
///
/// ```text
/// [D0-D3][80-BF]
/// ```
///
/// Then after reversal, it will be
///
/// ```text
/// [80-BF][D0-D3]
/// ```
///
/// This is useful when one is constructing a UTF-8 automaton to match
/// character classes in reverse.
pub fn reverse(&mut self) {
match *self {
Utf8Sequence::One(_) => {}
Utf8Sequence::Two(ref mut x) => x.reverse(),
Utf8Sequence::Three(ref mut x) => x.reverse(),
Utf8Sequence::Four(ref mut x) => x.reverse(),
}
}
/// Returns true if and only if a prefix of `bytes` matches this sequence
/// of byte ranges.
pub fn matches(&self, bytes: &[u8]) -> bool {
if bytes.len() < self.len() {
return false;
}
for (&b, r) in bytes.iter().zip(self) {
if !r.matches(b) {
return false;
}
}
true
}
}
// Lets a `&Utf8Sequence` be iterated (e.g. `zip`ped in `matches`), yielding
// each of its byte ranges in order via `as_slice`.
impl<'a> IntoIterator for &'a Utf8Sequence {
    type IntoIter = slice::Iter<'a, Utf8Range>;
    type Item = &'a Utf8Range;
    fn into_iter(self) -> Self::IntoIter {
        self.as_slice().iter()
    }
}
impl fmt::Debug for Utf8Sequence {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use self::Utf8Sequence::*;
match *self {
One(ref r) => write!(f, "{:?}", r),
Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
Four(ref r) => {
write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
}
}
}
}
/// A single inclusive range of UTF-8 bytes.
///
/// A byte `b` is in the range if and only if `start <= b <= end`.
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
pub struct Utf8Range {
    /// Start of byte range (inclusive).
    pub start: u8,
    /// End of byte range (inclusive).
    pub end: u8,
}
impl Utf8Range {
    /// Creates a byte range spanning `start` through `end`, inclusive.
    fn new(start: u8, end: u8) -> Self {
        Utf8Range { start, end }
    }
    /// Returns true if and only if the given byte is in this range.
    pub fn matches(&self, b: u8) -> bool {
        (self.start..=self.end).contains(&b)
    }
}
impl fmt::Debug for Utf8Range {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Utf8Range { start, end } = *self;
        // Render a degenerate range as a single byte for readability.
        if start == end {
            write!(f, "[{:X}]", start)
        } else {
            write!(f, "[{:X}-{:X}]", start, end)
        }
    }
}
/// An iterator over ranges of matching UTF-8 byte sequences.
///
/// The iteration represents an alternation of comprehensive byte sequences
/// that match precisely the set of UTF-8 encoded scalar values.
///
/// A byte sequence corresponds to one of the scalar values in the range given
/// if and only if it completely matches exactly one of the sequences of byte
/// ranges produced by this iterator.
///
/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
/// sequences will match the same bytes.
///
/// # Example
///
/// This shows how to match an arbitrary byte sequence against a range of
/// scalar values.
///
/// ```rust
/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
///
/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
/// for range in seqs {
/// if range.matches(bytes) {
/// return true;
/// }
/// }
/// false
/// }
///
/// // Test the basic multilingual plane.
/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
///
/// // UTF-8 encoding of 'a'.
/// assert!(matches(&seqs, &[0x61]));
/// // UTF-8 encoding of '☃' (`\u{2603}`).
/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
/// // which is invalid UTF-8, and therefore fails, despite the fact that
/// // the corresponding codepoint (0xD800) falls in the range given.
/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
/// // And fails against plain old invalid UTF-8.
/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
/// ```
///
/// If this example seems circuitous, that's because it is! It's meant to be
/// illustrative. In practice, you could just try to decode your byte sequence
/// and compare it with the scalar value range directly. However, this is not
/// always possible (for example, in a byte based automaton).
#[derive(Debug)]
pub struct Utf8Sequences {
    /// Stack of scalar value ranges still to be converted. `next` pops a
    /// range and either yields a sequence for it or pushes sub-ranges back.
    range_stack: Vec<ScalarRange>,
}
impl Utf8Sequences {
    /// Creates a new iterator over UTF-8 byte ranges for the (inclusive)
    /// scalar value range given.
    pub fn new(start: char, end: char) -> Self {
        let mut seqs = Utf8Sequences { range_stack: Vec::with_capacity(1) };
        seqs.push(u32::from(start), u32::from(end));
        seqs
    }
    /// Resets this iterator to the scalar value range given.
    ///
    /// Any existing state is cleared, but resources may be reused.
    ///
    /// N.B. Benchmarks say that this method is dubious.
    #[doc(hidden)]
    pub fn reset(&mut self, start: char, end: char) {
        self.range_stack.clear();
        self.push(u32::from(start), u32::from(end));
    }
    /// Pushes the scalar value range `[start, end]` onto the work stack.
    fn push(&mut self, start: u32, end: u32) {
        self.range_stack.push(ScalarRange { start, end });
    }
}
/// A range of Unicode scalar values, with both endpoints inclusive.
///
/// Intermediate ranges produced while iterating may be empty (i.e., have
/// `start > end`); see `is_valid`.
struct ScalarRange {
    /// Start of the range, inclusive.
    start: u32,
    /// End of the range, inclusive.
    end: u32,
}
impl fmt::Debug for ScalarRange {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
}
}
impl Iterator for Utf8Sequences {
    type Item = Utf8Sequence;
    fn next(&mut self) -> Option<Self::Item> {
        // Pop a pending scalar value range and whittle it down until it can
        // be emitted as a single sequence of byte ranges. Any remainder is
        // pushed back onto the stack for subsequent calls.
        'TOP: while let Some(mut r) = self.range_stack.pop() {
            'INNER: loop {
                // Carve out the surrogate gap [D800, DFFF]: keep the low
                // half now and defer the high half to a later iteration.
                if let Some((r1, r2)) = r.split() {
                    self.push(r2.start, r2.end);
                    r.start = r1.start;
                    r.end = r1.end;
                    continue 'INNER;
                }
                // Splitting can produce empty ranges; discard them.
                if !r.is_valid() {
                    continue 'TOP;
                }
                // Split at encoded-length boundaries so that `r.start` and
                // `r.end` encode to the same number of UTF-8 bytes.
                for i in 1..MAX_UTF8_BYTES {
                    let max = max_scalar_value(i);
                    if r.start <= max && max < r.end {
                        self.push(max + 1, r.end);
                        r.end = max;
                        continue 'INNER;
                    }
                }
                // An all-ASCII range maps to a single one-byte range.
                if let Some(ascii_range) = r.as_ascii() {
                    return Some(Utf8Sequence::One(ascii_range));
                }
                // If the endpoints differ above their low `6 * i` bits, the
                // trailing continuation bytes must each span their full
                // [80-BF] range; split off any misaligned head or tail so
                // the eventual per-byte ranges are exact.
                for i in 1..MAX_UTF8_BYTES {
                    let m = (1 << (6 * i)) - 1;
                    if (r.start & !m) != (r.end & !m) {
                        if (r.start & m) != 0 {
                            self.push((r.start | m) + 1, r.end);
                            r.end = r.start | m;
                            continue 'INNER;
                        }
                        if (r.end & m) != m {
                            self.push(r.end & !m, r.end);
                            r.end = (r.end & !m) - 1;
                            continue 'INNER;
                        }
                    }
                }
                // The endpoints now encode to the same length and are
                // aligned, so pairing up their encoded bytes yields an
                // exact sequence of byte ranges.
                let mut start = [0; MAX_UTF8_BYTES];
                let mut end = [0; MAX_UTF8_BYTES];
                let n = r.encode(&mut start, &mut end);
                return Some(Utf8Sequence::from_encoded_range(
                    &start[0..n],
                    &end[0..n],
                ));
            }
        }
        None
    }
}
// `next` returns `None` exactly when `range_stack` is empty, and nothing in
// `next` refills an empty stack, so the iterator is fused.
impl FusedIterator for Utf8Sequences {}
impl ScalarRange {
    /// Splits this range around the surrogate codepoint gap, if it straddles
    /// it.
    ///
    /// Either or both of the returned ranges may be invalid (empty).
    fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
        if self.start >= 0xE000 || self.end <= 0xD7FF {
            return None;
        }
        Some((
            ScalarRange { start: self.start, end: 0xD7FF },
            ScalarRange { start: 0xE000, end: self.end },
        ))
    }
    /// Returns true if and only if this is a non-empty range, i.e.,
    /// `start <= end`.
    fn is_valid(&self) -> bool {
        self.start <= self.end
    }
    /// Converts this range to a single byte range when every scalar value in
    /// it can be encoded as one UTF-8 byte; returns `None` otherwise.
    fn as_ascii(&self) -> Option<Utf8Range> {
        if !self.is_ascii() {
            return None;
        }
        let start = u8::try_from(self.start).unwrap();
        let end = u8::try_from(self.end).unwrap();
        Some(Utf8Range::new(start, end))
    }
    /// Returns true when this range is valid and entirely ASCII (every
    /// scalar value encodes to a single byte).
    fn is_ascii(&self) -> bool {
        self.is_valid() && self.end <= 0x7F
    }
    /// Writes the UTF-8 encodings of this range's endpoints into `start` and
    /// `end`, returning the (common) number of bytes written.
    ///
    /// The slices should have room for at least `MAX_UTF8_BYTES`.
    fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
        let s = char::from_u32(self.start).unwrap().encode_utf8(start);
        let e = char::from_u32(self.end).unwrap().encode_utf8(end);
        assert_eq!(s.len(), e.len());
        s.len()
    }
}
/// Returns the largest Unicode scalar value representable in `nbytes` of
/// UTF-8.
///
/// # Panics
///
/// Panics when `nbytes` is not in `1..=4`.
fn max_scalar_value(nbytes: usize) -> u32 {
    // Maximum scalar value per encoded length, indexed by `nbytes - 1`.
    const MAXES: [u32; 4] = [0x007F, 0x07FF, 0xFFFF, 0x0010_FFFF];
    if (1..=4).contains(&nbytes) {
        MAXES[nbytes - 1]
    } else {
        unreachable!("invalid UTF-8 byte sequence size")
    }
}
#[cfg(test)]
mod tests {
    use core::char;
    use alloc::{vec, vec::Vec};
    use crate::utf8::{Utf8Range, Utf8Sequences};
    /// Shorthand constructor for a `Utf8Range`.
    fn rutf8(s: u8, e: u8) -> Utf8Range {
        Utf8Range::new(s, e)
    }
    /// Asserts that no sequence generated for `[start, end]` matches the
    /// (invalid) 3-byte UTF-8 encoding of any surrogate codepoint.
    fn never_accepts_surrogate_codepoints(start: char, end: char) {
        for cp in 0xD800..0xE000 {
            let buf = encode_surrogate(cp);
            for r in Utf8Sequences::new(start, end) {
                if r.matches(&buf) {
                    panic!(
                        "Sequence ({:X}, {:X}) contains range {:?}, \
                        which matches surrogate code point {:X} \
                        with encoded bytes {:?}",
                        u32::from(start),
                        u32::from(end),
                        r,
                        cp,
                        buf,
                    );
                }
            }
        }
    }
    // Exercise ranges that straddle or abut the surrogate gap.
    #[test]
    fn codepoints_no_surrogates() {
        never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
        never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
        never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
    }
    #[test]
    fn single_codepoint_one_sequence() {
        // Tests that every range of scalar values that contains a single
        // scalar value is recognized by one sequence of byte ranges.
        for i in 0x0..=0x0010_FFFF {
            let c = match char::from_u32(i) {
                None => continue,
                Some(c) => c,
            };
            let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
            assert_eq!(seqs.len(), 1);
        }
    }
    // The expected byte-range sequences for the Basic Multilingual Plane,
    // matching the table in the module documentation.
    #[test]
    fn bmp() {
        use crate::utf8::Utf8Sequence::*;
        let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
        assert_eq!(
            seqs,
            vec![
                One(rutf8(0x0, 0x7F)),
                Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
                Three([
                    rutf8(0xE0, 0xE0),
                    rutf8(0xA0, 0xBF),
                    rutf8(0x80, 0xBF)
                ]),
                Three([
                    rutf8(0xE1, 0xEC),
                    rutf8(0x80, 0xBF),
                    rutf8(0x80, 0xBF)
                ]),
                Three([
                    rutf8(0xED, 0xED),
                    rutf8(0x80, 0x9F),
                    rutf8(0x80, 0xBF)
                ]),
                Three([
                    rutf8(0xEE, 0xEF),
                    rutf8(0x80, 0xBF),
                    rutf8(0x80, 0xBF)
                ]),
            ]
        );
    }
    // `reverse` should flip the order of the byte ranges for every arity.
    #[test]
    fn reverse() {
        use crate::utf8::Utf8Sequence::*;
        let mut s = One(rutf8(0xA, 0xB));
        s.reverse();
        assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
        let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
        s.reverse();
        assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
        let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
        s.reverse();
        assert_eq!(
            s.as_slice(),
            &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
        );
        let mut s = Four([
            rutf8(0xA, 0xB),
            rutf8(0xB, 0xC),
            rutf8(0xC, 0xD),
            rutf8(0xD, 0xE),
        ]);
        s.reverse();
        assert_eq!(
            s.as_slice(),
            &[
                rutf8(0xD, 0xE),
                rutf8(0xC, 0xD),
                rutf8(0xB, 0xC),
                rutf8(0xA, 0xB)
            ]
        );
    }
    /// Encodes the surrogate `cp` with the ordinary 3-byte UTF-8 bit
    /// pattern. Surrogates are not scalar values, so this must be done by
    /// hand with the tag/continuation bit masks below.
    fn encode_surrogate(cp: u32) -> [u8; 3] {
        const TAG_CONT: u8 = 0b1000_0000;
        const TAG_THREE_B: u8 = 0b1110_0000;
        assert!(0xD800 <= cp && cp < 0xE000);
        let mut dst = [0; 3];
        dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B;
        dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT;
        dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT;
        dst
    }
}