Vendor dependencies for 0.3.0 release

This commit is contained in:
2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

View File

@@ -0,0 +1 @@
{"files":{"Cargo.lock":"6cb7822f0c5e6a847236fafab37349205f71eca17fc703893a9733d18ad0b379","Cargo.toml":"3760a36644cca75efbe8d22ac53de8b9b83ad4a06cc53b7886799aede8025d13","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"b2484aa7e66fb92d1378e9a7ce7605af18f77cb12c179866eaf92ba28cfec1d9","benches/bench.rs":"d2b6ae5b939abd6093064f144b981b7739d7f474ec0698a1268052fc92406635","src/ast/mod.rs":"21cda9fe9e1810b285cb8f8a2aa5eeaff6c38e256ceed036b68c66fb6b0124d6","src/ast/parse.rs":"89a3701a9a95fea692be925e97b7dcfc5af1ac41f20e8f054eafaeb391e8dec2","src/ast/print.rs":"99cb69ece252ef31e0be177fb3364797eb30b785f936532b8dcd8106e7be0738","src/ast/visitor.rs":"f0fdf758801fe70e6b299b73ab63196e814af95ef6eccad7ef4f72075743fcf6","src/debug.rs":"7a16cca02be9715fdc8c26a32279465774623cd12fab1ec59ac25a6e3047817f","src/either.rs":"1758e3edd056884eccadd995708d1e374ba9aa65846bd0e13b1aae852607c560","src/error.rs":"6bf241009184f3249e7bf2b80d156541847ebe0405287aeb146272376cd4c345","src/hir/interval.rs":"74d75837d24ab9a3cff33b375b70694cdd3b9a4610c799137533f365755ba604","src/hir/literal.rs":"61e9f54103c671694dd017c23c5c9263e032735921ef77527940e83b29ced540","src/hir/mod.rs":"13ee5b65fac1f2c9780ce48a500b1e9d198cb0bc07c0d7f4a4391aab87424563","src/hir/print.rs":"ad51c515c933bfd67d307ba3d7e6ac59c9c5903b4f393a9f9a4785c92b88348d","src/hir/translate.rs":"1014a4aee20f4d93a391fa70c55e7ad7d0f277f9741246066dd1e192926c5b16","src/hir/visitor.rs":"71ca9c93aa48a5ed445399659fa6455093a1bbd9ef44b66bc7095c1b08b2ec1f","src/lib.rs":"564dca9e8fc64fedd7ae2b940080d9f2acc9264b73fd6e4322b654dce86d3dbf","src/parser.rs":"6b2f4f27e3331a01a25b87c89368dd2e54396bd425dac57941f9c1ebfd238ac8","src/rank.rs":"ff3d58b0cc5ffa69e2e8c56fc7d9ef41dd399d59a639a253a51551b858cb5bbd","src/unicode.rs":"b2084dcbd4331501b9a895fd7e7575d93ff96eb661c6e6adbc8c66bb72685cde","src/unicode_tables/LICENSE-UNICODE":"74db5baf44a41b1000
312c673544b3374e4198af5605c7f9080a402cec42cfa3","src/unicode_tables/age.rs":"71b7cf52acdb4aa98b44145303b8efbfa94913235493521941ef1e0092a0ffe2","src/unicode_tables/case_folding_simple.rs":"7622c7f7f03ac0dc2f2bcd51c81a217d64de0cc912f62f1add5f676603a02456","src/unicode_tables/general_category.rs":"9488e3721f7c2ae20e1b77fcff9a59b4ed8f22954b8645ea6d8592eac1856423","src/unicode_tables/grapheme_cluster_break.rs":"0dd9d66bad598f4ec3451b6699f05c17c52079e37d463baf6385bbe51aa218f1","src/unicode_tables/mod.rs":"26c837099cd934c8062e24bc9a0aaecf15fe1de03f9c6da3f3e1e5ac3ca24bee","src/unicode_tables/perl_decimal.rs":"6a59143db81a0bcaf0e8d0af265e711d1a6472e1f091ee9ee4377da5d5d0cd1f","src/unicode_tables/perl_space.rs":"ec9bb22ed7e99feef292249c7e6f4673ee0af9635d4d158f93923494c14cd5ed","src/unicode_tables/perl_word.rs":"30f073baae28ea34c373c7778c00f20c1621c3e644404eff031f7d1cc8e9c9e2","src/unicode_tables/property_bool.rs":"66cf5bd2a1438bf9694152f077a285cf014fbd50b9dd63a97233b2ea61d64962","src/unicode_tables/property_names.rs":"8c93985d1bcb01735667a3c4cb92f7e260d267326bde9d7f048bc77cd7e07855","src/unicode_tables/property_values.rs":"ef9131ce0a575c7327ec6d466aafd8b7c25600d80c232b5a4110bbf0a5a59136","src/unicode_tables/script.rs":"41bd424f1e3a03290cf4995ced678dcf24c94b38c905c62f6819bf67e098a2ec","src/unicode_tables/script_extension.rs":"a314099ddbf50a07fe350bb0835bf2fe494ed5ad278b30e171e21506eb557906","src/unicode_tables/sentence_break.rs":"be84fbe8c5c67e761b16fe6c27f16664dbb145357835cd6b92bc2a4a4c52ee79","src/unicode_tables/word_break.rs":"c551681ad49ec28c7ae32bab1371945821c736ca8f0de410cb89f28066ec2ecf","src/utf8.rs":"33657f668361b6648d74c92d3d59eab97e3747d785760f47e4d71c13af07bfba","test":"c7de5fbc0010d9b5b758cd49956375a64b88601c068167fd366808950257f108"},"package":"caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001"}

65
vendor/regex-syntax/Cargo.lock generated vendored Normal file
View File

@@ -0,0 +1,65 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "arbitrary"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223"
dependencies = [
"derive_arbitrary",
]
[[package]]
name = "derive_arbitrary"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "proc-macro2"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]]
name = "regex-syntax"
version = "0.8.6"
dependencies = [
"arbitrary",
]
[[package]]
name = "syn"
version = "2.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"

76
vendor/regex-syntax/Cargo.toml vendored Normal file
View File

@@ -0,0 +1,76 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.65"
name = "regex-syntax"
version = "0.8.6"
authors = [
"The Rust Project Developers",
"Andrew Gallant <jamslam@gmail.com>",
]
build = false
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "A regular expression parser."
homepage = "https://github.com/rust-lang/regex/tree/master/regex-syntax"
documentation = "https://docs.rs/regex-syntax"
readme = "README.md"
license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-lang/regex"
[package.metadata.docs.rs]
all-features = true
rustdoc-args = [
"--cfg",
"docsrs",
]
[features]
arbitrary = ["dep:arbitrary"]
default = [
"std",
"unicode",
]
std = []
unicode = [
"unicode-age",
"unicode-bool",
"unicode-case",
"unicode-gencat",
"unicode-perl",
"unicode-script",
"unicode-segment",
]
unicode-age = []
unicode-bool = []
unicode-case = []
unicode-gencat = []
unicode-perl = []
unicode-script = []
unicode-segment = []
[lib]
name = "regex_syntax"
path = "src/lib.rs"
[[bench]]
name = "bench"
path = "benches/bench.rs"
[dependencies.arbitrary]
version = "1.3.0"
features = ["derive"]
optional = true

201
vendor/regex-syntax/LICENSE-APACHE vendored Normal file
View File

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

25
vendor/regex-syntax/LICENSE-MIT vendored Normal file
View File

@@ -0,0 +1,25 @@
Copyright (c) 2014 The Rust Project Developers
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

96
vendor/regex-syntax/README.md vendored Normal file
View File

@@ -0,0 +1,96 @@
regex-syntax
============
This crate provides a robust regular expression parser.
[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions)
[![Crates.io](https://img.shields.io/crates/v/regex-syntax.svg)](https://crates.io/crates/regex-syntax)
### Documentation
https://docs.rs/regex-syntax
### Overview
There are two primary types exported by this crate: `Ast` and `Hir`. The former
is a faithful abstract syntax of a regular expression, and can convert regular
expressions back to their concrete syntax while mostly preserving its original
form. The latter type is a high level intermediate representation of a regular
expression that is amenable to analysis and compilation into byte codes or
automata. An `Hir` achieves this by drastically simplifying the syntactic
structure of the regular expression. While an `Hir` can be converted back to
its equivalent concrete syntax, the result is unlikely to resemble the original
concrete syntax that produced the `Hir`.
### Example
This example shows how to parse a pattern string into its HIR:
```rust
use regex_syntax::{hir::Hir, parse};
let hir = parse("a|b").unwrap();
assert_eq!(hir, Hir::alternation(vec![
Hir::literal("a".as_bytes()),
Hir::literal("b".as_bytes()),
]));
```
### Safety
This crate has no `unsafe` code and sets `forbid(unsafe_code)`. While it's
possible this crate could use `unsafe` code in the future, the standard
for doing so is extremely high. In general, most code in this crate is not
performance critical, since it tends to be dwarfed by the time it takes to
compile a regular expression into an automaton. Therefore, there is little need
for extreme optimization, and therefore, use of `unsafe`.
The standard for using `unsafe` in this crate is extremely high because this
crate is intended to be reasonably safe to use with user supplied regular
expressions. Therefore, while there may be bugs in the regex parser itself,
they should _never_ result in memory unsafety unless there is either a bug
in the compiler or the standard library. (Since `regex-syntax` has zero
dependencies.)
### Crate features
By default, this crate bundles a fairly large amount of Unicode data tables
(a source size of ~750KB). Because of their large size, one can disable some
or all of these data tables. If a regular expression attempts to use Unicode
data that is not available, then an error will occur when translating the `Ast`
to the `Hir`.
The full set of features one can disable are
[in the "Crate features" section of the documentation](https://docs.rs/regex-syntax/*/#crate-features).
### Testing
Simply running `cargo test` will give you very good coverage. However, because
of the large number of features exposed by this crate, a `test` script is
included in this directory which will test several feature combinations. This
is the same script that is run in CI.
### Motivation
The primary purpose of this crate is to provide the parser used by `regex`.
Specifically, this crate is treated as an implementation detail of the `regex` crate,
and is primarily developed for the needs of `regex`.
Since this crate is an implementation detail of `regex`, it may experience
breaking change releases at a different cadence from `regex`. This is only
possible because this crate is _not_ a public dependency of `regex`.
Another consequence of this de-coupling is that there is no direct way to
compile a `regex::Regex` from a `regex_syntax::hir::Hir`. Instead, one must
first convert the `Hir` to a string (via its `std::fmt::Display`) and then
compile that via `Regex::new`. While this does repeat some work, compilation
typically takes much longer than parsing.
Stated differently, the coupling between `regex` and `regex-syntax` exists only
at the level of the concrete syntax.

63
vendor/regex-syntax/benches/bench.rs vendored Normal file
View File

@@ -0,0 +1,63 @@
//! Micro-benchmarks for the regex-syntax parser.
//!
//! These require a nightly toolchain because of `#![feature(test)]` and the
//! unstable `test` crate. Each benchmark re-parses a fixed pattern string on
//! every iteration, so the measured cost is a full parse from scratch
//! (including construction of a fresh `Parser`).
#![feature(test)]
extern crate test;
use regex_syntax::Parser;
use test::Bencher;
// A tiny pattern: anchors, one group, one alternation, one repetition.
#[bench]
fn parse_simple1(b: &mut Bencher) {
    b.iter(|| {
        let re = r"^bc(d|e)*$";
        Parser::new().parse(re).unwrap()
    });
}
// A small ASCII identifier-like pattern with bracketed classes and a word
// boundary.
#[bench]
fn parse_simple2(b: &mut Bencher) {
    b.iter(|| {
        let re = r"'[a-zA-Z_][a-zA-Z0-9_]*(')\b";
        Parser::new().parse(re).unwrap()
    });
}
// Mixes Unicode classes (`\p{..}`), Perl classes and the dot.
#[bench]
fn parse_small1(b: &mut Bencher) {
    b.iter(|| {
        let re = r"\p{L}|\p{N}|\s|.|\d";
        Parser::new().parse(re).unwrap()
    });
}
// Several Unicode script/property classes back to back.
#[bench]
fn parse_medium1(b: &mut Bencher) {
    b.iter(|| {
        let re = r"\pL\p{Greek}\p{Hiragana}\p{Alphabetic}\p{Hebrew}\p{Arabic}";
        Parser::new().parse(re).unwrap()
    });
}
// Perl character classes and their negated forms only.
#[bench]
fn parse_medium2(b: &mut Bencher) {
    b.iter(|| {
        let re = r"\s\S\w\W\d\D";
        Parser::new().parse(re).unwrap()
    });
}
// Exercises property name/value lookup with abbreviations, mixed casing,
// and `name:value` syntax (age, script, script-extension).
#[bench]
fn parse_medium3(b: &mut Bencher) {
    b.iter(|| {
        let re =
            r"\p{age:3.2}\p{hira}\p{scx:hira}\p{alphabetic}\p{sc:Greek}\pL";
        Parser::new().parse(re).unwrap()
    });
}
// A large bounded repetition of a Unicode class.
#[bench]
fn parse_huge(b: &mut Bencher) {
    b.iter(|| {
        let re = r"\p{L}{100}";
        Parser::new().parse(re).unwrap()
    });
}

1809
vendor/regex-syntax/src/ast/mod.rs vendored Normal file

File diff suppressed because it is too large Load Diff

6377
vendor/regex-syntax/src/ast/parse.rs vendored Normal file

File diff suppressed because it is too large Load Diff

577
vendor/regex-syntax/src/ast/print.rs vendored Normal file
View File

@@ -0,0 +1,577 @@
/*!
This module provides a regular expression printer for `Ast`.
*/
use core::fmt;
use crate::ast::{
self,
visitor::{self, Visitor},
Ast,
};
/// A builder for constructing a printer.
///
/// Note that since a printer doesn't have any configuration knobs, this type
/// remains unexported.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    // Zero-sized private field: prevents construction via a struct literal
    // outside this module and leaves room to add knobs later without a
    // breaking change.
    _priv: (),
}
impl Default for PrinterBuilder {
    fn default() -> PrinterBuilder {
        PrinterBuilder::new()
    }
}
impl PrinterBuilder {
    /// Create a new builder with the default (and currently only)
    /// configuration.
    fn new() -> PrinterBuilder {
        PrinterBuilder { _priv: () }
    }
    /// Build the configured `Printer`.
    fn build(&self) -> Printer {
        Printer { _priv: () }
    }
}
/// A printer for a regular expression abstract syntax tree.
///
/// A printer converts an abstract syntax tree (AST) to a regular expression
/// pattern string. This particular printer uses constant stack space and heap
/// space proportional to the size of the AST.
///
/// This printer will not necessarily preserve the original formatting of the
/// regular expression pattern string. For example, all whitespace and comments
/// are ignored.
#[derive(Debug)]
pub struct Printer {
    // Zero-sized private field; see `PrinterBuilder::_priv`.
    _priv: (),
}
impl Printer {
    /// Create a new printer.
    pub fn new() -> Printer {
        PrinterBuilder::new().build()
    }
    /// Print the given `Ast` to the given writer. The writer must implement
    /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
    /// here are a `fmt::Formatter` (which is available in `fmt::Display`
    /// implementations) or a `&mut String`.
    pub fn print<W: fmt::Write>(&mut self, ast: &Ast, wtr: W) -> fmt::Result {
        // Delegate the traversal to the generic AST visitor; `Writer` emits
        // the concrete syntax for each node as it is visited.
        visitor::visit(ast, Writer { wtr })
    }
}
/// The printer's visitor: writes the concrete syntax for each AST node to
/// the wrapped `fmt::Write` implementation as the tree is traversed.
#[derive(Debug)]
struct Writer<W> {
    // Destination for the printed pattern (e.g. a `&mut String` or a
    // `fmt::Formatter`).
    wtr: W,
}
impl<W: fmt::Write> Visitor for Writer<W> {
    type Output = ();
    type Err = fmt::Error;
    fn finish(self) -> fmt::Result {
        // All output is written incrementally during the visit, so there is
        // nothing left to do when the traversal completes.
        Ok(())
    }
    /// Called before a node's children are visited: emit opening delimiters
    /// for the two composite constructs that have them (groups and
    /// bracketed character classes).
    fn visit_pre(&mut self, ast: &Ast) -> fmt::Result {
        match *ast {
            Ast::Group(ref x) => self.fmt_group_pre(x),
            Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x),
            _ => Ok(()),
        }
    }
    /// Called after a node's children are visited: emit the syntax for leaf
    /// nodes and the closing delimiters matching `visit_pre`. Alternations
    /// and concatenations produce no syntax of their own here (their
    /// children, plus `visit_alternation_in`, carry all the output).
    fn visit_post(&mut self, ast: &Ast) -> fmt::Result {
        match *ast {
            Ast::Empty(_) => Ok(()),
            Ast::Flags(ref x) => self.fmt_set_flags(x),
            Ast::Literal(ref x) => self.fmt_literal(x),
            Ast::Dot(_) => self.wtr.write_str("."),
            Ast::Assertion(ref x) => self.fmt_assertion(x),
            Ast::ClassPerl(ref x) => self.fmt_class_perl(x),
            Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x),
            Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x),
            Ast::Repetition(ref x) => self.fmt_repetition(x),
            Ast::Group(ref x) => self.fmt_group_post(x),
            Ast::Alternation(_) => Ok(()),
            Ast::Concat(_) => Ok(()),
        }
    }
    /// Emit the `|` separating two branches of an alternation.
    fn visit_alternation_in(&mut self) -> fmt::Result {
        self.wtr.write_str("|")
    }
    /// Emit the opening `[`/`[^` of a nested bracketed class; every other
    /// kind of set item is handled entirely in `visit_class_set_item_post`.
    fn visit_class_set_item_pre(
        &mut self,
        ast: &ast::ClassSetItem,
    ) -> Result<(), Self::Err> {
        match *ast {
            ast::ClassSetItem::Bracketed(ref x) => {
                self.fmt_class_bracketed_pre(x)
            }
            _ => Ok(()),
        }
    }
    /// Emit the syntax for a character-class set item, including the closing
    /// `]` of a nested bracketed class.
    fn visit_class_set_item_post(
        &mut self,
        ast: &ast::ClassSetItem,
    ) -> Result<(), Self::Err> {
        use crate::ast::ClassSetItem::*;
        match *ast {
            Empty(_) => Ok(()),
            Literal(ref x) => self.fmt_literal(x),
            Range(ref x) => {
                // A range prints as `start-end`, e.g. `a-z`.
                self.fmt_literal(&x.start)?;
                self.wtr.write_str("-")?;
                self.fmt_literal(&x.end)?;
                Ok(())
            }
            Ascii(ref x) => self.fmt_class_ascii(x),
            Unicode(ref x) => self.fmt_class_unicode(x),
            Perl(ref x) => self.fmt_class_perl(x),
            Bracketed(ref x) => self.fmt_class_bracketed_post(x),
            Union(_) => Ok(()),
        }
    }
    /// Emit the operator (`&&`, `--` or `~~`) between the two operands of a
    /// class-set binary operation.
    fn visit_class_set_binary_op_in(
        &mut self,
        ast: &ast::ClassSetBinaryOp,
    ) -> Result<(), Self::Err> {
        self.fmt_class_set_binary_op_kind(&ast.kind)
    }
}
impl<W: fmt::Write> Writer<W> {
fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result {
use crate::ast::GroupKind::*;
match ast.kind {
CaptureIndex(_) => self.wtr.write_str("("),
CaptureName { ref name, starts_with_p } => {
let start = if starts_with_p { "(?P<" } else { "(?<" };
self.wtr.write_str(start)?;
self.wtr.write_str(&name.name)?;
self.wtr.write_str(">")?;
Ok(())
}
NonCapturing(ref flags) => {
self.wtr.write_str("(?")?;
self.fmt_flags(flags)?;
self.wtr.write_str(":")?;
Ok(())
}
}
}
fn fmt_group_post(&mut self, _ast: &ast::Group) -> fmt::Result {
self.wtr.write_str(")")
}
fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result {
use crate::ast::RepetitionKind::*;
match ast.op.kind {
ZeroOrOne if ast.greedy => self.wtr.write_str("?"),
ZeroOrOne => self.wtr.write_str("??"),
ZeroOrMore if ast.greedy => self.wtr.write_str("*"),
ZeroOrMore => self.wtr.write_str("*?"),
OneOrMore if ast.greedy => self.wtr.write_str("+"),
OneOrMore => self.wtr.write_str("+?"),
Range(ref x) => {
self.fmt_repetition_range(x)?;
if !ast.greedy {
self.wtr.write_str("?")?;
}
Ok(())
}
}
}
fn fmt_repetition_range(
&mut self,
ast: &ast::RepetitionRange,
) -> fmt::Result {
use crate::ast::RepetitionRange::*;
match *ast {
Exactly(x) => write!(self.wtr, "{{{}}}", x),
AtLeast(x) => write!(self.wtr, "{{{},}}", x),
Bounded(x, y) => write!(self.wtr, "{{{},{}}}", x, y),
}
}
fn fmt_literal(&mut self, ast: &ast::Literal) -> fmt::Result {
use crate::ast::LiteralKind::*;
match ast.kind {
Verbatim => self.wtr.write_char(ast.c),
Meta | Superfluous => write!(self.wtr, r"\{}", ast.c),
Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)),
HexFixed(ast::HexLiteralKind::X) => {
write!(self.wtr, r"\x{:02X}", u32::from(ast.c))
}
HexFixed(ast::HexLiteralKind::UnicodeShort) => {
write!(self.wtr, r"\u{:04X}", u32::from(ast.c))
}
HexFixed(ast::HexLiteralKind::UnicodeLong) => {
write!(self.wtr, r"\U{:08X}", u32::from(ast.c))
}
HexBrace(ast::HexLiteralKind::X) => {
write!(self.wtr, r"\x{{{:X}}}", u32::from(ast.c))
}
HexBrace(ast::HexLiteralKind::UnicodeShort) => {
write!(self.wtr, r"\u{{{:X}}}", u32::from(ast.c))
}
HexBrace(ast::HexLiteralKind::UnicodeLong) => {
write!(self.wtr, r"\U{{{:X}}}", u32::from(ast.c))
}
Special(ast::SpecialLiteralKind::Bell) => {
self.wtr.write_str(r"\a")
}
Special(ast::SpecialLiteralKind::FormFeed) => {
self.wtr.write_str(r"\f")
}
Special(ast::SpecialLiteralKind::Tab) => self.wtr.write_str(r"\t"),
Special(ast::SpecialLiteralKind::LineFeed) => {
self.wtr.write_str(r"\n")
}
Special(ast::SpecialLiteralKind::CarriageReturn) => {
self.wtr.write_str(r"\r")
}
Special(ast::SpecialLiteralKind::VerticalTab) => {
self.wtr.write_str(r"\v")
}
Special(ast::SpecialLiteralKind::Space) => {
self.wtr.write_str(r"\ ")
}
}
}
fn fmt_assertion(&mut self, ast: &ast::Assertion) -> fmt::Result {
use crate::ast::AssertionKind::*;
match ast.kind {
StartLine => self.wtr.write_str("^"),
EndLine => self.wtr.write_str("$"),
StartText => self.wtr.write_str(r"\A"),
EndText => self.wtr.write_str(r"\z"),
WordBoundary => self.wtr.write_str(r"\b"),
NotWordBoundary => self.wtr.write_str(r"\B"),
WordBoundaryStart => self.wtr.write_str(r"\b{start}"),
WordBoundaryEnd => self.wtr.write_str(r"\b{end}"),
WordBoundaryStartAngle => self.wtr.write_str(r"\<"),
WordBoundaryEndAngle => self.wtr.write_str(r"\>"),
WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"),
WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"),
}
}
fn fmt_set_flags(&mut self, ast: &ast::SetFlags) -> fmt::Result {
self.wtr.write_str("(?")?;
self.fmt_flags(&ast.flags)?;
self.wtr.write_str(")")?;
Ok(())
}
fn fmt_flags(&mut self, ast: &ast::Flags) -> fmt::Result {
use crate::ast::{Flag, FlagsItemKind};
for item in &ast.items {
match item.kind {
FlagsItemKind::Negation => self.wtr.write_str("-"),
FlagsItemKind::Flag(ref flag) => match *flag {
Flag::CaseInsensitive => self.wtr.write_str("i"),
Flag::MultiLine => self.wtr.write_str("m"),
Flag::DotMatchesNewLine => self.wtr.write_str("s"),
Flag::SwapGreed => self.wtr.write_str("U"),
Flag::Unicode => self.wtr.write_str("u"),
Flag::CRLF => self.wtr.write_str("R"),
Flag::IgnoreWhitespace => self.wtr.write_str("x"),
},
}?;
}
Ok(())
}
fn fmt_class_bracketed_pre(
&mut self,
ast: &ast::ClassBracketed,
) -> fmt::Result {
if ast.negated {
self.wtr.write_str("[^")
} else {
self.wtr.write_str("[")
}
}
fn fmt_class_bracketed_post(
    &mut self,
    _ast: &ast::ClassBracketed,
) -> fmt::Result {
    // Every bracketed class, negated or not, closes with a single `]`.
    write!(self.wtr, "]")
}
fn fmt_class_set_binary_op_kind(
    &mut self,
    ast: &ast::ClassSetBinaryOpKind,
) -> fmt::Result {
    use crate::ast::ClassSetBinaryOpKind::*;

    // Each class set operator is written as doubled punctuation.
    let op = match *ast {
        Intersection => "&&",
        Difference => "--",
        SymmetricDifference => "~~",
    };
    self.wtr.write_str(op)
}
fn fmt_class_perl(&mut self, ast: &ast::ClassPerl) -> fmt::Result {
    use crate::ast::ClassPerlKind::*;

    // A negated Perl class uses the upper-case letter of the same escape.
    let syntax = match ast.kind {
        Digit => {
            if ast.negated {
                r"\D"
            } else {
                r"\d"
            }
        }
        Space => {
            if ast.negated {
                r"\S"
            } else {
                r"\s"
            }
        }
        Word => {
            if ast.negated {
                r"\W"
            } else {
                r"\w"
            }
        }
    };
    self.wtr.write_str(syntax)
}
fn fmt_class_ascii(&mut self, ast: &ast::ClassAscii) -> fmt::Result {
    use crate::ast::ClassAsciiKind::*;

    // Every POSIX class prints as `[:name:]`, with a `^` inserted after
    // the first colon when negated. Map the kind to its name, then
    // assemble the full syntax in a single write.
    let name = match ast.kind {
        Alnum => "alnum",
        Alpha => "alpha",
        Ascii => "ascii",
        Blank => "blank",
        Cntrl => "cntrl",
        Digit => "digit",
        Graph => "graph",
        Lower => "lower",
        Print => "print",
        Punct => "punct",
        Space => "space",
        Upper => "upper",
        Word => "word",
        Xdigit => "xdigit",
    };
    if ast.negated {
        write!(self.wtr, "[:^{}:]", name)
    } else {
        write!(self.wtr, "[:{}:]", name)
    }
}
fn fmt_class_unicode(&mut self, ast: &ast::ClassUnicode) -> fmt::Result {
    use crate::ast::ClassUnicodeKind::*;
    use crate::ast::ClassUnicodeOpKind::*;

    // `\p` selects the class and `\P` selects its complement.
    let prefix = if ast.negated { r"\P" } else { r"\p" };
    self.wtr.write_str(prefix)?;
    match ast.kind {
        OneLetter(c) => self.wtr.write_char(c),
        Named(ref x) => write!(self.wtr, "{{{}}}", x),
        NamedValue { ref op, ref name, ref value } => {
            // The operator kind determines the separator between the
            // property name and its value.
            let sep = match *op {
                Equal => "=",
                Colon => ":",
                NotEqual => "!=",
            };
            write!(self.wtr, "{{{}{}{}}}", name, sep, value)
        }
    }
}
}
#[cfg(test)]
mod tests {
    use alloc::string::String;

    use crate::ast::parse::ParserBuilder;

    use super::*;

    /// Parse `given`, print the resulting AST, and assert that the printed
    /// form is byte-for-byte identical to the input. This is the printer's
    /// core contract: printing an AST reproduces the original pattern.
    fn roundtrip(given: &str) {
        roundtrip_with(|b| b, given);
    }

    /// Like `roundtrip`, but lets the caller configure the parser first
    /// (e.g., to enable octal escape support).
    fn roundtrip_with<F>(mut f: F, given: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let ast = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&ast, &mut dst).unwrap();
        assert_eq!(given, dst);
    }

    #[test]
    fn print_literal() {
        roundtrip("a");
        roundtrip(r"\[");
        // Octal escapes only round-trip when octal parsing is enabled.
        roundtrip_with(|b| b.octal(true), r"\141");
        roundtrip(r"\x61");
        roundtrip(r"\x7F");
        roundtrip(r"\u0061");
        roundtrip(r"\U00000061");
        roundtrip(r"\x{61}");
        roundtrip(r"\x{7F}");
        roundtrip(r"\u{61}");
        roundtrip(r"\U{61}");

        roundtrip(r"\a");
        roundtrip(r"\f");
        roundtrip(r"\t");
        roundtrip(r"\n");
        roundtrip(r"\r");
        roundtrip(r"\v");
        // An escaped space is only a literal in ignore-whitespace mode.
        roundtrip(r"(?x)\ ");
    }

    #[test]
    fn print_dot() {
        roundtrip(".");
    }

    #[test]
    fn print_concat() {
        roundtrip("ab");
        roundtrip("abcde");
        roundtrip("a(bcd)ef");
    }

    #[test]
    fn print_alternation() {
        roundtrip("a|b");
        roundtrip("a|b|c|d|e");
        // Empty branches at the edges must be preserved too.
        roundtrip("|a|b|c|d|e");
        roundtrip("|a|b|c|d|e|");
        roundtrip("a(b|c|d)|e|f");
    }

    #[test]
    fn print_assertion() {
        roundtrip(r"^");
        roundtrip(r"$");
        roundtrip(r"\A");
        roundtrip(r"\z");
        roundtrip(r"\b");
        roundtrip(r"\B");
    }

    #[test]
    fn print_repetition() {
        roundtrip("a?");
        roundtrip("a??");
        roundtrip("a*");
        roundtrip("a*?");
        roundtrip("a+");
        roundtrip("a+?");
        roundtrip("a{5}");
        roundtrip("a{5}?");
        roundtrip("a{5,}");
        roundtrip("a{5,}?");
        roundtrip("a{5,10}");
        roundtrip("a{5,10}?");
    }

    #[test]
    fn print_flags() {
        roundtrip("(?i)");
        roundtrip("(?-i)");
        roundtrip("(?s-i)");
        roundtrip("(?-si)");
        roundtrip("(?siUmux)");
    }

    #[test]
    fn print_group() {
        roundtrip("(?i:a)");
        roundtrip("(?P<foo>a)");
        roundtrip("(?<foo>a)");
        roundtrip("(a)");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[abc]");
        roundtrip(r"[a-z]");
        roundtrip(r"[^a-z]");
        roundtrip(r"[a-z0-9]");
        roundtrip(r"[-a-z0-9]");
        roundtrip(r"[-a-z0-9]");
        roundtrip(r"[a-z0-9---]");
        // Set operations: intersection, difference, symmetric difference.
        roundtrip(r"[a-z&&m-n]");
        roundtrip(r"[[a-z&&m-n]]");
        roundtrip(r"[a-z--m-n]");
        roundtrip(r"[a-z~~m-n]");
        roundtrip(r"[a-z[0-9]]");
        roundtrip(r"[a-z[^0-9]]");

        roundtrip(r"\d");
        roundtrip(r"\D");
        roundtrip(r"\s");
        roundtrip(r"\S");
        roundtrip(r"\w");
        roundtrip(r"\W");

        roundtrip(r"[[:alnum:]]");
        roundtrip(r"[[:^alnum:]]");
        roundtrip(r"[[:alpha:]]");
        roundtrip(r"[[:^alpha:]]");
        roundtrip(r"[[:ascii:]]");
        roundtrip(r"[[:^ascii:]]");
        roundtrip(r"[[:blank:]]");
        roundtrip(r"[[:^blank:]]");
        roundtrip(r"[[:cntrl:]]");
        roundtrip(r"[[:^cntrl:]]");
        roundtrip(r"[[:digit:]]");
        roundtrip(r"[[:^digit:]]");
        roundtrip(r"[[:graph:]]");
        roundtrip(r"[[:^graph:]]");
        roundtrip(r"[[:lower:]]");
        roundtrip(r"[[:^lower:]]");
        roundtrip(r"[[:print:]]");
        roundtrip(r"[[:^print:]]");
        roundtrip(r"[[:punct:]]");
        roundtrip(r"[[:^punct:]]");
        roundtrip(r"[[:space:]]");
        roundtrip(r"[[:^space:]]");
        roundtrip(r"[[:upper:]]");
        roundtrip(r"[[:^upper:]]");
        roundtrip(r"[[:word:]]");
        roundtrip(r"[[:^word:]]");
        roundtrip(r"[[:xdigit:]]");
        roundtrip(r"[[:^xdigit:]]");

        roundtrip(r"\pL");
        roundtrip(r"\PL");
        roundtrip(r"\p{L}");
        roundtrip(r"\P{L}");
        roundtrip(r"\p{X=Y}");
        roundtrip(r"\P{X=Y}");
        roundtrip(r"\p{X:Y}");
        roundtrip(r"\P{X:Y}");
        roundtrip(r"\p{X!=Y}");
        roundtrip(r"\P{X!=Y}");
    }
}

522
vendor/regex-syntax/src/ast/visitor.rs vendored Normal file
View File

@@ -0,0 +1,522 @@
use alloc::{vec, vec::Vec};
use crate::ast::{self, Ast};
/// A trait for visiting an abstract syntax tree (AST) in depth first order.
///
/// The principle aim of this trait is to enable callers to perform case
/// analysis on an abstract syntax tree without necessarily using recursion.
/// In particular, this permits callers to do case analysis with constant stack
/// usage, which can be important since the size of an abstract syntax tree
/// may be proportional to end user input.
///
/// Typical usage of this trait involves providing an implementation and then
/// running it using the [`visit`] function.
///
/// Note that the abstract syntax tree for a regular expression is quite
/// complex. Unless you specifically need it, you might be able to use the much
/// simpler [high-level intermediate representation](crate::hir::Hir) and its
/// [corresponding `Visitor` trait](crate::hir::Visitor) instead.
pub trait Visitor {
    /// The result of visiting an AST.
    type Output;
    /// An error that visiting an AST might return.
    type Err;

    /// All implementors of `Visitor` must provide a `finish` method, which
    /// yields the result of visiting the AST or an error.
    fn finish(self) -> Result<Self::Output, Self::Err>;

    /// This method is called before beginning traversal of the AST.
    ///
    /// The default implementation does nothing.
    fn start(&mut self) {}

    /// This method is called on an `Ast` before descending into child `Ast`
    /// nodes.
    ///
    /// The default implementation does nothing.
    fn visit_pre(&mut self, _ast: &Ast) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on an `Ast` after descending all of its child
    /// `Ast` nodes.
    ///
    /// The default implementation does nothing.
    fn visit_post(&mut self, _ast: &Ast) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called between child nodes of an
    /// [`Alternation`](ast::Alternation).
    ///
    /// The default implementation does nothing.
    fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called between child nodes of a concatenation.
    ///
    /// The default implementation does nothing.
    fn visit_concat_in(&mut self) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on every [`ClassSetItem`](ast::ClassSetItem)
    /// before descending into child nodes.
    ///
    /// The default implementation does nothing.
    fn visit_class_set_item_pre(
        &mut self,
        _ast: &ast::ClassSetItem,
    ) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on every [`ClassSetItem`](ast::ClassSetItem)
    /// after descending into child nodes.
    ///
    /// The default implementation does nothing.
    fn visit_class_set_item_post(
        &mut self,
        _ast: &ast::ClassSetItem,
    ) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on every
    /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) before descending into
    /// child nodes.
    ///
    /// The default implementation does nothing.
    fn visit_class_set_binary_op_pre(
        &mut self,
        _ast: &ast::ClassSetBinaryOp,
    ) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on every
    /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) after descending into child
    /// nodes.
    ///
    /// The default implementation does nothing.
    fn visit_class_set_binary_op_post(
        &mut self,
        _ast: &ast::ClassSetBinaryOp,
    ) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called between the left hand and right hand child nodes
    /// of a [`ClassSetBinaryOp`](ast::ClassSetBinaryOp).
    ///
    /// The default implementation does nothing.
    fn visit_class_set_binary_op_in(
        &mut self,
        _ast: &ast::ClassSetBinaryOp,
    ) -> Result<(), Self::Err> {
        Ok(())
    }
}
/// Executes an implementation of `Visitor` in constant stack space.
///
/// This function will visit every node in the given `Ast` while calling the
/// appropriate methods provided by the [`Visitor`] trait.
///
/// The primary use case for this method is when one wants to perform case
/// analysis over an `Ast` without using a stack size proportional to the depth
/// of the `Ast`. Namely, this method will instead use constant stack size, but
/// will use heap space proportional to the size of the `Ast`. This may be
/// desirable in cases where the size of `Ast` is proportional to end user
/// input.
///
/// If the visitor returns an error at any point, then visiting is stopped and
/// the error is returned.
pub fn visit<V: Visitor>(ast: &Ast, visitor: V) -> Result<V::Output, V::Err> {
    // Delegate to the heap-based traversal, which keeps its own explicit
    // stacks instead of recursing.
    let mut heap = HeapVisitor::new();
    heap.visit(ast, visitor)
}
/// HeapVisitor visits every item in an `Ast` recursively using constant stack
/// size and a heap size proportional to the size of the `Ast`.
struct HeapVisitor<'a> {
    /// A stack of `Ast` nodes. This is roughly analogous to the call stack
    /// used in a typical recursive visitor.
    stack: Vec<(&'a Ast, Frame<'a>)>,
    /// Similar to the `Ast` stack above, but is used only for character
    /// classes. In particular, character classes embed their own mini
    /// recursive syntax.
    ///
    /// Both stacks are cleared at the start of every `visit` call, so a
    /// single `HeapVisitor` may be reused across traversals.
    stack_class: Vec<(ClassInduct<'a>, ClassFrame<'a>)>,
}
/// Represents a single stack frame while performing structural induction over
/// an `Ast`.
enum Frame<'a> {
    /// A stack frame allocated just before descending into a repetition
    /// operator's child node.
    Repetition(&'a ast::Repetition),
    /// A stack frame allocated just before descending into a group's child
    /// node.
    Group(&'a ast::Group),
    /// The stack frame used while visiting every child node of a concatenation
    /// of expressions.
    Concat {
        /// The child node we are currently visiting.
        head: &'a Ast,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [Ast],
    },
    /// The stack frame used while visiting every child node of an alternation
    /// of expressions.
    Alternation {
        /// The child node we are currently visiting.
        head: &'a Ast,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [Ast],
    },
}
/// Represents a single stack frame while performing structural induction over
/// a character class.
enum ClassFrame<'a> {
    /// The stack frame used while visiting every child node of a union of
    /// character class items.
    Union {
        /// The child node we are currently visiting.
        head: &'a ast::ClassSetItem,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [ast::ClassSetItem],
    },
    /// The stack frame used while visiting a binary class operation.
    Binary { op: &'a ast::ClassSetBinaryOp },
    /// A stack frame allocated just before descending into a binary operator's
    /// left hand child node.
    BinaryLHS {
        op: &'a ast::ClassSetBinaryOp,
        lhs: &'a ast::ClassSet,
        rhs: &'a ast::ClassSet,
    },
    /// A stack frame allocated just before descending into a binary operator's
    /// right hand child node.
    BinaryRHS { op: &'a ast::ClassSetBinaryOp, rhs: &'a ast::ClassSet },
}
/// A representation of the inductive step when performing structural induction
/// over a character class.
///
/// Note that there is no analogous explicit type for the inductive step for
/// `Ast` nodes because the inductive step is just an `Ast`. For character
/// classes, the inductive step can produce one of two possible child nodes:
/// an item or a binary operation. (An item cannot be a binary operation
/// because that would imply binary operations can be unioned in the concrete
/// syntax, which is not possible.)
enum ClassInduct<'a> {
    /// A single class set item (literal, range, nested class, union, ...).
    Item(&'a ast::ClassSetItem),
    /// A binary class set operation (intersection, difference, ...).
    BinaryOp(&'a ast::ClassSetBinaryOp),
}
impl<'a> HeapVisitor<'a> {
    /// Create a new heap visitor with empty stacks.
    fn new() -> HeapVisitor<'a> {
        HeapVisitor { stack: vec![], stack_class: vec![] }
    }

    /// Drive `visitor` over every node of `ast` in depth first order, using
    /// the explicit heap stacks instead of recursion. Returns whatever the
    /// visitor's `finish` produces, or the first error returned by any
    /// visitor callback.
    fn visit<V: Visitor>(
        &mut self,
        mut ast: &'a Ast,
        mut visitor: V,
    ) -> Result<V::Output, V::Err> {
        // Reset state in case this visitor is being reused.
        self.stack.clear();
        self.stack_class.clear();

        visitor.start();
        loop {
            visitor.visit_pre(ast)?;
            if let Some(x) = self.induct(ast, &mut visitor)? {
                let child = x.child();
                self.stack.push((ast, x));
                ast = child;
                continue;
            }
            // No induction means we have a base case, so we can post visit
            // it now.
            visitor.visit_post(ast)?;

            // At this point, we now try to pop our call stack until it is
            // either empty or we hit another inductive case.
            loop {
                let (post_ast, frame) = match self.stack.pop() {
                    None => return visitor.finish(),
                    Some((post_ast, frame)) => (post_ast, frame),
                };
                // If this is a concat/alternate, then we might have additional
                // inductive steps to process.
                if let Some(x) = self.pop(frame) {
                    match x {
                        Frame::Alternation { .. } => {
                            visitor.visit_alternation_in()?;
                        }
                        Frame::Concat { .. } => {
                            visitor.visit_concat_in()?;
                        }
                        _ => {}
                    }
                    ast = x.child();
                    self.stack.push((post_ast, x));
                    break;
                }
                // Otherwise, we've finished visiting all the child nodes for
                // this AST, so we can post visit it now.
                visitor.visit_post(post_ast)?;
            }
        }
    }

    /// Build a stack frame for the given AST if one is needed (which occurs if
    /// and only if there are child nodes in the AST). Otherwise, return None.
    ///
    /// If this visits a class, then the underlying visitor implementation may
    /// return an error which will be passed on here.
    fn induct<V: Visitor>(
        &mut self,
        ast: &'a Ast,
        visitor: &mut V,
    ) -> Result<Option<Frame<'a>>, V::Err> {
        Ok(match *ast {
            // Character classes are traversed eagerly, in full, by
            // `visit_class`; no frame is pushed for them.
            Ast::ClassBracketed(ref x) => {
                self.visit_class(x, visitor)?;
                None
            }
            Ast::Repetition(ref x) => Some(Frame::Repetition(x)),
            Ast::Group(ref x) => Some(Frame::Group(x)),
            Ast::Concat(ref x) if x.asts.is_empty() => None,
            Ast::Concat(ref x) => {
                Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] })
            }
            Ast::Alternation(ref x) if x.asts.is_empty() => None,
            Ast::Alternation(ref x) => Some(Frame::Alternation {
                head: &x.asts[0],
                tail: &x.asts[1..],
            }),
            _ => None,
        })
    }

    /// Pops the given frame. If the frame has an additional inductive step,
    /// then return it, otherwise return `None`.
    fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
        match induct {
            Frame::Repetition(_) => None,
            Frame::Group(_) => None,
            Frame::Concat { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(Frame::Concat { head: &tail[0], tail: &tail[1..] })
                }
            }
            Frame::Alternation { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(Frame::Alternation {
                        head: &tail[0],
                        tail: &tail[1..],
                    })
                }
            }
        }
    }

    /// Visit an entire bracketed character class using the dedicated class
    /// stack. This mirrors the structure of `visit` above, but for the
    /// mini recursive syntax embedded inside character classes.
    fn visit_class<V: Visitor>(
        &mut self,
        ast: &'a ast::ClassBracketed,
        visitor: &mut V,
    ) -> Result<(), V::Err> {
        let mut ast = ClassInduct::from_bracketed(ast);
        loop {
            self.visit_class_pre(&ast, visitor)?;
            if let Some(x) = self.induct_class(&ast) {
                let child = x.child();
                self.stack_class.push((ast, x));
                ast = child;
                continue;
            }
            self.visit_class_post(&ast, visitor)?;

            // At this point, we now try to pop our call stack until it is
            // either empty or we hit another inductive case.
            loop {
                let (post_ast, frame) = match self.stack_class.pop() {
                    None => return Ok(()),
                    Some((post_ast, frame)) => (post_ast, frame),
                };
                // If this is a union or a binary op, then we might have
                // additional inductive steps to process.
                if let Some(x) = self.pop_class(frame) {
                    if let ClassFrame::BinaryRHS { ref op, .. } = x {
                        visitor.visit_class_set_binary_op_in(op)?;
                    }
                    ast = x.child();
                    self.stack_class.push((post_ast, x));
                    break;
                }
                // Otherwise, we've finished visiting all the child nodes for
                // this class node, so we can post visit it now.
                self.visit_class_post(&post_ast, visitor)?;
            }
        }
    }

    /// Call the appropriate `Visitor` methods given an inductive step.
    fn visit_class_pre<V: Visitor>(
        &self,
        ast: &ClassInduct<'a>,
        visitor: &mut V,
    ) -> Result<(), V::Err> {
        match *ast {
            ClassInduct::Item(item) => {
                visitor.visit_class_set_item_pre(item)?;
            }
            ClassInduct::BinaryOp(op) => {
                visitor.visit_class_set_binary_op_pre(op)?;
            }
        }
        Ok(())
    }

    /// Call the appropriate `Visitor` methods given an inductive step.
    fn visit_class_post<V: Visitor>(
        &self,
        ast: &ClassInduct<'a>,
        visitor: &mut V,
    ) -> Result<(), V::Err> {
        match *ast {
            ClassInduct::Item(item) => {
                visitor.visit_class_set_item_post(item)?;
            }
            ClassInduct::BinaryOp(op) => {
                visitor.visit_class_set_binary_op_post(op)?;
            }
        }
        Ok(())
    }

    /// Build a stack frame for the given class node if one is needed (which
    /// occurs if and only if there are child nodes). Otherwise, return None.
    fn induct_class(&self, ast: &ClassInduct<'a>) -> Option<ClassFrame<'a>> {
        match *ast {
            ClassInduct::Item(&ast::ClassSetItem::Bracketed(ref x)) => {
                match x.kind {
                    ast::ClassSet::Item(ref item) => {
                        Some(ClassFrame::Union { head: item, tail: &[] })
                    }
                    ast::ClassSet::BinaryOp(ref op) => {
                        Some(ClassFrame::Binary { op })
                    }
                }
            }
            ClassInduct::Item(&ast::ClassSetItem::Union(ref x)) => {
                if x.items.is_empty() {
                    None
                } else {
                    Some(ClassFrame::Union {
                        head: &x.items[0],
                        tail: &x.items[1..],
                    })
                }
            }
            ClassInduct::BinaryOp(op) => {
                Some(ClassFrame::BinaryLHS { op, lhs: &op.lhs, rhs: &op.rhs })
            }
            _ => None,
        }
    }

    /// Pops the given frame. If the frame has an additional inductive step,
    /// then return it, otherwise return `None`.
    fn pop_class(&self, induct: ClassFrame<'a>) -> Option<ClassFrame<'a>> {
        match induct {
            ClassFrame::Union { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(ClassFrame::Union {
                        head: &tail[0],
                        tail: &tail[1..],
                    })
                }
            }
            ClassFrame::Binary { .. } => None,
            // After the LHS has been visited, descend into the RHS.
            ClassFrame::BinaryLHS { op, rhs, .. } => {
                Some(ClassFrame::BinaryRHS { op, rhs })
            }
            ClassFrame::BinaryRHS { .. } => None,
        }
    }
}
impl<'a> Frame<'a> {
    /// Perform the next inductive step on this frame and return the next
    /// child AST node to visit.
    fn child(&self) -> &'a Ast {
        match *self {
            Frame::Repetition(rep) => &rep.ast,
            Frame::Group(group) => &group.ast,
            // Both sequence frames point at their current head node.
            Frame::Concat { head, .. } | Frame::Alternation { head, .. } => {
                head
            }
        }
    }
}
impl<'a> ClassFrame<'a> {
    /// Perform the next inductive step on this frame and return the next
    /// child class node to visit.
    fn child(&self) -> ClassInduct<'a> {
        // The references stored in each variant are `Copy`, so they can be
        // bound by value here.
        match *self {
            ClassFrame::Union { head, .. } => ClassInduct::Item(head),
            ClassFrame::Binary { op, .. } => ClassInduct::BinaryOp(op),
            ClassFrame::BinaryLHS { lhs, .. } => ClassInduct::from_set(lhs),
            ClassFrame::BinaryRHS { rhs, .. } => ClassInduct::from_set(rhs),
        }
    }
}
impl<'a> ClassInduct<'a> {
    /// Build the first inductive step from the top of a bracketed class.
    fn from_bracketed(ast: &'a ast::ClassBracketed) -> ClassInduct<'a> {
        ClassInduct::from_set(&ast.kind)
    }

    /// Build an inductive step from an arbitrary class set node.
    fn from_set(ast: &'a ast::ClassSet) -> ClassInduct<'a> {
        match ast {
            ast::ClassSet::Item(item) => ClassInduct::Item(item),
            ast::ClassSet::BinaryOp(op) => ClassInduct::BinaryOp(op),
        }
    }
}
impl<'a> core::fmt::Debug for ClassFrame<'a> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Only the variant name is useful when debugging the traversal;
        // the borrowed payloads would just be noise.
        f.write_str(match *self {
            ClassFrame::Union { .. } => "Union",
            ClassFrame::Binary { .. } => "Binary",
            ClassFrame::BinaryLHS { .. } => "BinaryLHS",
            ClassFrame::BinaryRHS { .. } => "BinaryRHS",
        })
    }
}
impl<'a> core::fmt::Debug for ClassInduct<'a> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Render a short static label describing the inductive step,
        // including the kind of item or binary operation.
        f.write_str(match *self {
            ClassInduct::Item(it) => match *it {
                ast::ClassSetItem::Empty(_) => "Item(Empty)",
                ast::ClassSetItem::Literal(_) => "Item(Literal)",
                ast::ClassSetItem::Range(_) => "Item(Range)",
                ast::ClassSetItem::Ascii(_) => "Item(Ascii)",
                ast::ClassSetItem::Perl(_) => "Item(Perl)",
                ast::ClassSetItem::Unicode(_) => "Item(Unicode)",
                ast::ClassSetItem::Bracketed(_) => "Item(Bracketed)",
                ast::ClassSetItem::Union(_) => "Item(Union)",
            },
            ClassInduct::BinaryOp(it) => match it.kind {
                ast::ClassSetBinaryOpKind::Intersection => {
                    "BinaryOp(Intersection)"
                }
                ast::ClassSetBinaryOpKind::Difference => {
                    "BinaryOp(Difference)"
                }
                ast::ClassSetBinaryOpKind::SymmetricDifference => {
                    "BinaryOp(SymmetricDifference)"
                }
            },
        })
    }
}

107
vendor/regex-syntax/src/debug.rs vendored Normal file
View File

@@ -0,0 +1,107 @@
/// A type that wraps a single byte with a convenient fmt::Debug impl that
/// escapes the byte.
///
/// The escaping follows `core::ascii::escape_default`, except that hex
/// escapes are rendered with upper-case digits and an ASCII space is shown
/// quoted (see the `Debug` impl below).
pub(crate) struct Byte(pub(crate) u8);
impl core::fmt::Debug for Byte {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Special case ASCII space. It's too hard to read otherwise, so
        // put quotes around it.
        if self.0 == b' ' {
            return f.write_str("' '");
        }
        // 10 bytes is enough to cover any output from ascii::escape_default.
        let mut buf = [0u8; 10];
        let mut n = 0;
        for (i, b) in core::ascii::escape_default(self.0).enumerate() {
            // Capitalize the hex digits that follow the `\x` prefix, i.e.,
            // turn \xab into \xAB.
            buf[n] = if i >= 2 && b'a' <= b && b <= b'f' { b - 32 } else { b };
            n += 1;
        }
        f.write_str(core::str::from_utf8(&buf[..n]).unwrap())
    }
}
/// A type that provides a human readable debug impl for arbitrary bytes.
///
/// This generally works best when the bytes are presumed to be mostly UTF-8,
/// but will work for anything. Invalid UTF-8 bytes are rendered as `\xHH`
/// hex escapes.
///
/// N.B. This is copied nearly verbatim from regex-automata. Sigh.
pub(crate) struct Bytes<'a>(pub(crate) &'a [u8]);
impl<'a> core::fmt::Debug for Bytes<'a> {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(f, "\"")?;
        // This is a sad re-implementation of a similar impl found in bstr.
        // Decode the bytes one codepoint at a time, escaping anything that
        // is not printable.
        let mut bytes = self.0;
        while let Some(result) = utf8_decode(bytes) {
            let ch = match result {
                Ok(ch) => ch,
                Err(byte) => {
                    // Invalid UTF-8: emit the offending byte as a hex
                    // escape and resume decoding at the next byte.
                    write!(f, r"\x{:02x}", byte)?;
                    bytes = &bytes[1..];
                    continue;
                }
            };
            bytes = &bytes[ch.len_utf8()..];
            match ch {
                '\0' => write!(f, "\\0")?,
                // ASCII control characters except \0, \n, \r, \t
                '\x01'..='\x08'
                | '\x0b'
                | '\x0c'
                | '\x0e'..='\x19'
                | '\x7f' => {
                    write!(f, "\\x{:02x}", u32::from(ch))?;
                }
                // Everything else defers to char's own debug escaping,
                // which also handles \n, \r and \t.
                '\n' | '\r' | '\t' | _ => {
                    write!(f, "{}", ch.escape_debug())?;
                }
            }
        }
        write!(f, "\"")?;
        Ok(())
    }
}
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the given
/// byte slice, then the first byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
pub(crate) fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
    // Number of bytes in the UTF-8 sequence introduced by `first`, or
    // `None` if `first` cannot begin a sequence (it is a continuation
    // byte or an invalid leading byte).
    fn seq_len(first: u8) -> Option<usize> {
        match first {
            0x00..=0x7F => Some(1),
            0x80..=0xBF => None,
            0xC0..=0xDF => Some(2),
            0xE0..=0xEF => Some(3),
            0xF0..=0xF7 => Some(4),
            _ => None,
        }
    }

    let first = *bytes.first()?;
    let n = match seq_len(first) {
        // Not a valid leading byte.
        None => return Some(Err(first)),
        // The sequence is truncated by the end of the slice.
        Some(n) if n > bytes.len() => return Some(Err(first)),
        // ASCII fast path: the byte is the codepoint.
        Some(1) => return Some(Ok(char::from(first))),
        Some(n) => n,
    };
    // Let the standard library validate the multi-byte sequence (rejecting
    // overlong forms, surrogates and out-of-range values).
    match core::str::from_utf8(&bytes[..n]) {
        Ok(s) => Some(Ok(s.chars().next().unwrap())),
        Err(_) => Some(Err(first)),
    }
}

8
vendor/regex-syntax/src/either.rs vendored Normal file
View File

@@ -0,0 +1,8 @@
/// A simple binary sum type.
///
/// This is occasionally useful in an ad hoc fashion.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Either<Left, Right> {
    /// The left variant.
    Left(Left),
    /// The right variant.
    Right(Right),
}

311
vendor/regex-syntax/src/error.rs vendored Normal file
View File

@@ -0,0 +1,311 @@
use alloc::{
format,
string::{String, ToString},
vec,
vec::Vec,
};
use crate::{ast, hir};
/// This error type encompasses any error that can be returned by this crate.
///
/// This error type is marked as `non_exhaustive`. This means that adding a
/// new variant is not considered a breaking change.
// Marked non_exhaustive so that new error variants can be added without a
// semver-breaking change (see the type-level docs above).
#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Error {
    /// An error that occurred while translating concrete syntax into abstract
    /// syntax (AST).
    Parse(ast::Error),
    /// An error that occurred while translating abstract syntax into a high
    /// level intermediate representation (HIR).
    Translate(hir::Error),
}
impl From<ast::Error> for Error {
    // Wrap a parse (AST) error in the top-level error type.
    fn from(err: ast::Error) -> Error {
        Error::Parse(err)
    }
}
impl From<hir::Error> for Error {
    // Wrap a translation (HIR) error in the top-level error type.
    fn from(err: hir::Error) -> Error {
        Error::Translate(err)
    }
}
// `std::error::Error` has no required methods; this impl only exists when
// the `std` feature is enabled, since the trait lives in `std`.
#[cfg(feature = "std")]
impl std::error::Error for Error {}
impl core::fmt::Display for Error {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match *self {
Error::Parse(ref x) => x.fmt(f),
Error::Translate(ref x) => x.fmt(f),
}
}
}
/// A helper type for formatting nice error messages.
///
/// This type is responsible for reporting regex parse errors in a nice human
/// readable format. Most of its complexity is from interspersing notational
/// markers pointing out the position where an error occurred.
#[derive(Debug)]
pub struct Formatter<'e, E> {
    /// The original regex pattern in which the error occurred.
    pattern: &'e str,
    /// The error kind. It must impl fmt::Display.
    err: &'e E,
    /// The primary span of the error.
    span: &'e ast::Span,
    /// An auxiliary and optional span, in case the error needs to point to
    /// two locations (e.g., when reporting a duplicate capture group name).
    aux_span: Option<&'e ast::Span>,
}
impl<'e> From<&'e ast::Error> for Formatter<'e, ast::ErrorKind> {
    // Build an error formatter from a parse (AST) error, borrowing the
    // pattern, kind and spans from the error itself.
    fn from(err: &'e ast::Error) -> Self {
        Formatter {
            pattern: err.pattern(),
            err: err.kind(),
            span: err.span(),
            aux_span: err.auxiliary_span(),
        }
    }
}
impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> {
    // Build an error formatter from a translation (HIR) error. HIR errors
    // never carry an auxiliary span.
    fn from(err: &'e hir::Error) -> Self {
        Formatter {
            pattern: err.pattern(),
            err: err.kind(),
            span: err.span(),
            aux_span: None,
        }
    }
}
impl<'e, E: core::fmt::Display> core::fmt::Display for Formatter<'e, E> {
    // Render the full human readable error report: a header, the notated
    // pattern (with `^` markers under the error spans) and the error
    // message itself. Multi-line patterns additionally get `~` dividers
    // and notes for spans that cross line boundaries.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let spans = Spans::from_formatter(self);
        if self.pattern.contains('\n') {
            let divider = repeat_char('~', 79);

            writeln!(f, "regex parse error:")?;
            writeln!(f, "{}", divider)?;
            let notated = spans.notate();
            write!(f, "{}", notated)?;
            writeln!(f, "{}", divider)?;
            // If we have error spans that cover multiple lines, then we just
            // note the line numbers.
            if !spans.multi_line.is_empty() {
                let mut notes = vec![];
                for span in &spans.multi_line {
                    notes.push(format!(
                        "on line {} (column {}) through line {} (column {})",
                        span.start.line,
                        span.start.column,
                        span.end.line,
                        span.end.column - 1
                    ));
                }
                writeln!(f, "{}", notes.join("\n"))?;
            }
            write!(f, "error: {}", self.err)?;
        } else {
            // Single-line patterns skip the dividers entirely.
            writeln!(f, "regex parse error:")?;
            let notated = Spans::from_formatter(self).notate();
            write!(f, "{}", notated)?;
            write!(f, "error: {}", self.err)?;
        }
        Ok(())
    }
}
/// This type represents an arbitrary number of error spans in a way that makes
/// it convenient to notate the regex pattern. ("Notate" means "point out
/// exactly where the error occurred in the regex pattern.")
///
/// Technically, we can only ever have two spans given our current error
/// structure. However, after toiling with a specific algorithm for handling
/// two spans, it became obvious that an algorithm to handle an arbitrary
/// number of spans was actually much simpler.
struct Spans<'p> {
    /// The original regex pattern string.
    pattern: &'p str,
    /// The total width that should be used for line numbers. The width is
    /// used for left padding the line numbers for alignment.
    ///
    /// A value of `0` means line numbers should not be displayed. That is,
    /// the pattern is itself only one line.
    line_number_width: usize,
    /// All error spans that occur on a single line. This sequence always has
    /// length equivalent to the number of lines in `pattern`, where the index
    /// of the sequence represents a line number, starting at `0`. The spans
    /// in each line are sorted in ascending order.
    by_line: Vec<Vec<ast::Span>>,
    /// All error spans that occur over one or more lines. That is, the start
    /// and end position of the span have different line numbers. The spans are
    /// sorted in ascending order.
    multi_line: Vec<ast::Span>,
}
impl<'p> Spans<'p> {
    /// Build a sequence of spans from a formatter.
    fn from_formatter<'e, E: core::fmt::Display>(
        fmter: &'p Formatter<'e, E>,
    ) -> Spans<'p> {
        let mut line_count = fmter.pattern.lines().count();
        // If the pattern ends with a `\n` literal, then our line count is
        // off by one, since a span can occur immediately after the last `\n`,
        // which is consider to be an additional line.
        if fmter.pattern.ends_with('\n') {
            line_count += 1;
        }
        let line_number_width =
            if line_count <= 1 { 0 } else { line_count.to_string().len() };
        let mut spans = Spans {
            pattern: &fmter.pattern,
            line_number_width,
            by_line: vec![vec![]; line_count],
            multi_line: vec![],
        };
        spans.add(fmter.span.clone());
        if let Some(span) = fmter.aux_span {
            spans.add(span.clone());
        }
        spans
    }

    /// Add the given span to this sequence, putting it in the right place.
    fn add(&mut self, span: ast::Span) {
        // This is grossly inefficient since we sort after each add, but right
        // now, we only ever add two spans at most.
        if span.is_one_line() {
            let i = span.start.line - 1; // because lines are 1-indexed
            self.by_line[i].push(span);
            self.by_line[i].sort();
        } else {
            self.multi_line.push(span);
            self.multi_line.sort();
        }
    }

    /// Notate the pattern string with carets (`^`) pointing at each span
    /// location. This only applies to spans that occur within a single line.
    fn notate(&self) -> String {
        let mut notated = String::new();
        for (i, line) in self.pattern.lines().enumerate() {
            if self.line_number_width > 0 {
                notated.push_str(&self.left_pad_line_number(i + 1));
                notated.push_str(": ");
            } else {
                // BUG FIX: this previously pushed a single space, but
                // `notate_line` pads its caret lines with
                // `line_number_padding()` spaces (4 when line numbers are
                // not displayed), which left the carets misaligned with the
                // pattern by three columns. Use the same padding here so
                // carets line up with the characters they point at.
                for _ in 0..self.line_number_padding() {
                    notated.push(' ');
                }
            }
            notated.push_str(line);
            notated.push('\n');
            if let Some(notes) = self.notate_line(i) {
                notated.push_str(&notes);
                notated.push('\n');
            }
        }
        notated
    }

    /// Return notes for the line indexed at `i` (zero-based). If there are no
    /// spans for the given line, then `None` is returned. Otherwise, an
    /// appropriately space padded string with correctly positioned `^` is
    /// returned, accounting for line numbers.
    fn notate_line(&self, i: usize) -> Option<String> {
        let spans = &self.by_line[i];
        if spans.is_empty() {
            return None;
        }
        let mut notes = String::new();
        for _ in 0..self.line_number_padding() {
            notes.push(' ');
        }
        let mut pos = 0;
        for span in spans {
            // Advance to the span's start column with spaces...
            for _ in pos..(span.start.column - 1) {
                notes.push(' ');
                pos += 1;
            }
            // ...then emit one `^` per spanned column (at least one, so
            // that empty spans still get a marker).
            let note_len = span.end.column.saturating_sub(span.start.column);
            for _ in 0..core::cmp::max(1, note_len) {
                notes.push('^');
                pos += 1;
            }
        }
        Some(notes)
    }

    /// Left pad the given line number with spaces such that it is aligned with
    /// other line numbers.
    fn left_pad_line_number(&self, n: usize) -> String {
        let n = n.to_string();
        let pad = self.line_number_width.checked_sub(n.len()).unwrap();
        let mut result = repeat_char(' ', pad);
        result.push_str(&n);
        result
    }

    /// Return the line number padding beginning at the start of each line of
    /// the pattern.
    ///
    /// If the pattern is only one line, then this returns a fixed padding
    /// for visual indentation.
    fn line_number_padding(&self) -> usize {
        if self.line_number_width == 0 {
            4
        } else {
            // Two extra columns account for the `: ` separator that follows
            // each line number.
            2 + self.line_number_width
        }
    }
}
/// Create a string consisting of `count` copies of `c`.
fn repeat_char(c: char, count: usize) -> String {
    (0..count).map(|_| c).collect()
}
#[cfg(test)]
mod tests {
    use alloc::string::ToString;

    use crate::ast::parse::Parser;

    /// Parse `pattern`, expect it to fail, and assert that the rendered
    /// error message equals `expected_msg` (after trimming).
    fn assert_panic_message(pattern: &str, expected_msg: &str) {
        let result = Parser::new().parse(pattern);
        match result {
            Ok(_) => {
                panic!("regex should not have parsed");
            }
            Err(err) => {
                assert_eq!(err.to_string(), expected_msg.trim());
            }
        }
    }

    // See: https://github.com/rust-lang/regex/issues/464
    #[test]
    fn regression_464() {
        let err = Parser::new().parse("a{\n").unwrap_err();
        // This test checks that the error formatter doesn't panic.
        assert!(!err.to_string().is_empty());
    }

    // See: https://github.com/rust-lang/regex/issues/545
    // NOTE(review): the leading whitespace inside the expected-message raw
    // string appears to have been lost in this rendering of the file; the
    // notated pattern and caret lines are normally indented by the
    // formatter's fixed padding. Confirm against the upstream source.
    #[test]
    fn repetition_quantifier_expects_a_valid_decimal() {
        assert_panic_message(
            r"\\u{[^}]*}",
            r#"
regex parse error:
\\u{[^}]*}
^
error: repetition quantifier expects a valid decimal
"#,
        );
    }
}

564
vendor/regex-syntax/src/hir/interval.rs vendored Normal file
View File

@@ -0,0 +1,564 @@
use core::{char, cmp, fmt::Debug, slice};
use alloc::vec::Vec;
use crate::unicode;
// This module contains an *internal* implementation of interval sets.
//
// The primary invariant that interval sets guards is canonical ordering. That
// is, every interval set contains an ordered sequence of intervals where
// no two intervals are overlapping or adjacent. While this invariant is
// occasionally broken within the implementation, it should be impossible for
// callers to observe it.
//
// Since case folding (as implemented below) breaks that invariant, we roll
// that into this API even though it is a little out of place in an otherwise
// generic interval set. (Hence the reason why the `unicode` module is imported
// here.)
//
// Some of the implementation complexity here is a result of me wanting to
// preserve the sequential representation without using additional memory.
// In many cases, we do use linear extra memory, but it is at most 2x and it
// is amortized. If we relaxed the memory requirements, this implementation
// could become much simpler. The extra memory is honestly probably OK, but
// character classes (especially of the Unicode variety) can become quite
// large, and it would be nice to keep regex compilation snappy even in debug
// builds. (In the past, I have been careless with this area of code and it has
// caused slow regex compilations in debug mode, so this isn't entirely
// unwarranted.)
//
// Tests on this are relegated to the public API of HIR in src/hir.rs.
#[derive(Clone, Debug)]
pub struct IntervalSet<I> {
    /// A sorted set of non-overlapping ranges.
    ranges: Vec<I>,
    /// While not required at all for correctness, we keep track of whether an
    /// interval set has been case folded or not. This helps us avoid doing
    /// redundant work if, for example, a set has already been case folded.
    /// And note that whether a set is folded or not is preserved through
    /// all of the pairwise set operations. That is, if both interval sets
    /// have been case folded, then any of difference, union, intersection or
    /// symmetric difference all produce a case folded set.
    ///
    /// Note that when this is true, it *must* be the case that the set is case
    /// folded. But when it's false, the set *may* be case folded. In other
    /// words, we only set this to true when we know it to be the case, but
    /// we're okay with it being false if it would otherwise be costly to
    /// determine whether it should be true. This means code cannot assume
    /// that a false value necessarily indicates that the set is not case
    /// folded.
    ///
    /// Bottom line: this is a performance optimization.
    folded: bool,
}
// `Eq` is sound here because the `PartialEq` impl below is a total
// equivalence relation over the `ranges` field alone.
impl<I: Interval> Eq for IntervalSet<I> {}
// We implement PartialEq manually so that we don't consider the set's internal
// 'folded' property to be part of its identity. The 'folded' property is
// strictly an optimization.
impl<I: Interval> PartialEq for IntervalSet<I> {
    fn eq(&self, other: &IntervalSet<I>) -> bool {
        // Compare only the ranges. The `folded` flag is a performance hint
        // and is deliberately excluded from a set's identity.
        self.ranges == other.ranges
    }
}
impl<I: Interval> IntervalSet<I> {
    /// Create a new set from a sequence of intervals. Each interval is
    /// specified as a pair of bounds, where both bounds are inclusive.
    ///
    /// The given ranges do not need to be in any specific order, and ranges
    /// may overlap.
    pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
        let ranges: Vec<I> = intervals.into_iter().collect();
        // An empty set is case folded.
        let folded = ranges.is_empty();
        let mut set = IntervalSet { ranges, folded };
        set.canonicalize();
        set
    }

    /// Add a new interval to this set.
    pub fn push(&mut self, interval: I) {
        // TODO: This could be faster. e.g., Push the interval such that
        // it preserves canonicalization.
        self.ranges.push(interval);
        self.canonicalize();
        // We don't know whether the new interval added here is considered
        // case folded, so we conservatively assume that the entire set is
        // no longer case folded if it was previously.
        self.folded = false;
    }

    /// Return an iterator over all intervals in this set.
    ///
    /// The iterator yields intervals in ascending order.
    pub fn iter(&self) -> IntervalSetIter<'_, I> {
        IntervalSetIter(self.ranges.iter())
    }

    /// Return an immutable slice of intervals in this set.
    ///
    /// The sequence returned is in canonical ordering.
    pub fn intervals(&self) -> &[I] {
        &self.ranges
    }

    /// Expand this interval set such that it contains all case folded
    /// characters. For example, if this class consists of the range `a-z`,
    /// then applying case folding will result in the class containing both the
    /// ranges `a-z` and `A-Z`.
    ///
    /// This returns an error if the necessary case mapping data is not
    /// available.
    pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
        if self.folded {
            return Ok(());
        }
        let len = self.ranges.len();
        // N.B. Each iteration may append folded ranges to `self.ranges`, so
        // only the original `len` prefix is walked. If folding fails part
        // way through, any ranges appended so far are kept, but the set is
        // re-canonicalized before the error is returned.
        for i in 0..len {
            let range = self.ranges[i];
            if let Err(err) = range.case_fold_simple(&mut self.ranges) {
                self.canonicalize();
                return Err(err);
            }
        }
        self.canonicalize();
        self.folded = true;
        Ok(())
    }

    /// Union this set with the given set, in place.
    pub fn union(&mut self, other: &IntervalSet<I>) {
        if other.ranges.is_empty() || self.ranges == other.ranges {
            return;
        }
        // This could almost certainly be done more efficiently.
        self.ranges.extend(&other.ranges);
        self.canonicalize();
        self.folded = self.folded && other.folded;
    }

    /// Intersect this set with the given set, in place.
    pub fn intersect(&mut self, other: &IntervalSet<I>) {
        if self.ranges.is_empty() {
            return;
        }
        if other.ranges.is_empty() {
            self.ranges.clear();
            // An empty set is case folded.
            self.folded = true;
            return;
        }
        // There should be a way to do this in-place with constant memory,
        // but I couldn't figure out a simple way to do it. So just append
        // the intersection to the end of this range, and then drain it before
        // we're done.
        let drain_end = self.ranges.len();
        let mut ita = 0..drain_end;
        let mut itb = 0..other.ranges.len();
        // Invariant: both sets are non-empty (checked above), so both
        // iterators are guaranteed to yield a first element.
        let mut a = ita.next().unwrap();
        let mut b = itb.next().unwrap();
        loop {
            if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
                self.ranges.push(ab);
            }
            // Advance whichever range ends first; only that one can still
            // intersect with later ranges from the other set.
            let (it, aorb) =
                if self.ranges[a].upper() < other.ranges[b].upper() {
                    (&mut ita, &mut a)
                } else {
                    (&mut itb, &mut b)
                };
            match it.next() {
                Some(v) => *aorb = v,
                None => break,
            }
        }
        self.ranges.drain(..drain_end);
        self.folded = self.folded && other.folded;
    }

    /// Subtract the given set from this set, in place.
    pub fn difference(&mut self, other: &IntervalSet<I>) {
        if self.ranges.is_empty() || other.ranges.is_empty() {
            return;
        }
        // This algorithm is (to me) surprisingly complex. A search of the
        // interwebs indicate that this is a potentially interesting problem.
        // Folks seem to suggest interval or segment trees, but I'd like to
        // avoid the overhead (both runtime and conceptual) of that.
        //
        // The following is basically my Shitty First Draft. Therefore, in
        // order to grok it, you probably need to read each line carefully.
        // Simplifications are most welcome!
        //
        // Remember, we can assume the canonical format invariant here, which
        // says that all ranges are sorted, not overlapping and not adjacent in
        // each class.
        let drain_end = self.ranges.len();
        let (mut a, mut b) = (0, 0);
        'LOOP: while a < drain_end && b < other.ranges.len() {
            // Basically, the easy cases are when neither range overlaps with
            // each other. If the `b` range is less than our current `a`
            // range, then we can skip it and move on.
            if other.ranges[b].upper() < self.ranges[a].lower() {
                b += 1;
                continue;
            }
            // ... similarly for the `a` range. If it's less than the smallest
            // `b` range, then we can add it as-is.
            if self.ranges[a].upper() < other.ranges[b].lower() {
                let range = self.ranges[a];
                self.ranges.push(range);
                a += 1;
                continue;
            }
            // Otherwise, we have overlapping ranges.
            assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
            // This part is tricky and was non-obvious to me without looking
            // at explicit examples (see the tests). The trickiness stems from
            // two things: 1) subtracting a range from another range could
            // yield two ranges and 2) after subtracting a range, it's possible
            // that future ranges can have an impact. The loop below advances
            // the `b` ranges until they can't possibly impact the current
            // range.
            //
            // For example, if our `a` range is `a-t` and our next three `b`
            // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
            // subtraction three times before moving on to the next `a` range.
            let mut range = self.ranges[a];
            while b < other.ranges.len()
                && !range.is_intersection_empty(&other.ranges[b])
            {
                let old_range = range;
                range = match range.difference(&other.ranges[b]) {
                    (None, None) => {
                        // We lost the entire range, so move on to the next
                        // without adding this one.
                        a += 1;
                        continue 'LOOP;
                    }
                    (Some(range1), None) | (None, Some(range1)) => range1,
                    (Some(range1), Some(range2)) => {
                        self.ranges.push(range1);
                        range2
                    }
                };
                // It's possible that the `b` range has more to contribute
                // here. In particular, if it is greater than the original
                // range, then it might impact the next `a` range *and* it
                // has impacted the current `a` range as much as possible,
                // so we can quit. We don't bump `b` so that the next `a`
                // range can apply it.
                if other.ranges[b].upper() > old_range.upper() {
                    break;
                }
                // Otherwise, the next `b` range might apply to the current
                // `a` range.
                b += 1;
            }
            self.ranges.push(range);
            a += 1;
        }
        // Any remaining `a` ranges sit past every `b` range, so they survive
        // the subtraction untouched.
        while a < drain_end {
            let range = self.ranges[a];
            self.ranges.push(range);
            a += 1;
        }
        self.ranges.drain(..drain_end);
        self.folded = self.folded && other.folded;
    }

    /// Compute the symmetric difference of the two sets, in place.
    ///
    /// This computes the symmetric difference of two interval sets. This
    /// removes all elements in this set that are also in the given set,
    /// but also adds all elements from the given set that aren't in this
    /// set. That is, the set will contain all elements in either set,
    /// but will not contain any elements that are in both sets.
    pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
        // TODO(burntsushi): Fix this so that it amortizes allocation.
        let mut intersection = self.clone();
        intersection.intersect(other);
        self.union(other);
        self.difference(&intersection);
    }

    /// Negate this interval set.
    ///
    /// For all `x` where `x` is any element, if `x` was in this set, then it
    /// will not be in this set after negation.
    pub fn negate(&mut self) {
        if self.ranges.is_empty() {
            let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
            self.ranges.push(I::create(min, max));
            // The set containing everything must be case folded.
            self.folded = true;
            return;
        }
        // There should be a way to do this in-place with constant memory,
        // but I couldn't figure out a simple way to do it. So just append
        // the negation to the end of this range, and then drain it before
        // we're done.
        let drain_end = self.ranges.len();
        // We do checked arithmetic below because of the canonical ordering
        // invariant.
        if self.ranges[0].lower() > I::Bound::min_value() {
            let upper = self.ranges[0].lower().decrement();
            self.ranges.push(I::create(I::Bound::min_value(), upper));
        }
        // Each gap between adjacent ranges becomes a range of the negation.
        for i in 1..drain_end {
            let lower = self.ranges[i - 1].upper().increment();
            let upper = self.ranges[i].lower().decrement();
            self.ranges.push(I::create(lower, upper));
        }
        if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
            let lower = self.ranges[drain_end - 1].upper().increment();
            self.ranges.push(I::create(lower, I::Bound::max_value()));
        }
        self.ranges.drain(..drain_end);
        // We don't need to update whether this set is folded or not, because
        // it is conservatively preserved through negation. Namely, if a set
        // is not folded, then it is possible that its negation is folded, for
        // example, [^☃]. But we're fine with assuming that the set is not
        // folded in that case. (`folded` permits false negatives but not false
        // positives.)
        //
        // But what about when a set is folded, is its negation also
        // necessarily folded? Yes. Because if a set is folded, then for every
        // character in the set, it necessarily included its equivalence class
        // of case folded characters. Negating it in turn means that all
        // equivalence classes in the set are negated, and any equivalence
        // class that was previously not in the set is now entirely in the set.
    }

    /// Converts this set into a canonical ordering.
    fn canonicalize(&mut self) {
        if self.is_canonical() {
            return;
        }
        self.ranges.sort();
        assert!(!self.ranges.is_empty());
        // Is there a way to do this in-place with constant memory? I couldn't
        // figure out a way to do it. So just append the canonicalization to
        // the end of this range, and then drain it before we're done.
        let drain_end = self.ranges.len();
        for oldi in 0..drain_end {
            // If we've added at least one new range, then check if we can
            // merge this range into the previously added range.
            if self.ranges.len() > drain_end {
                let (last, rest) = self.ranges.split_last_mut().unwrap();
                if let Some(union) = last.union(&rest[oldi]) {
                    *last = union;
                    continue;
                }
            }
            let range = self.ranges[oldi];
            self.ranges.push(range);
        }
        self.ranges.drain(..drain_end);
    }

    /// Returns true if and only if this class is in a canonical ordering.
    fn is_canonical(&self) -> bool {
        for pair in self.ranges.windows(2) {
            if pair[0] >= pair[1] {
                return false;
            }
            if pair[0].is_contiguous(&pair[1]) {
                return false;
            }
        }
        true
    }
}
/// An iterator over intervals.
///
/// Yields `&I` items in ascending (canonical) order, borrowed from the
/// underlying `IntervalSet`.
#[derive(Debug)]
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);
impl<'a, I> Iterator for IntervalSetIter<'a, I> {
type Item = &'a I;
fn next(&mut self) -> Option<&'a I> {
self.0.next()
}
}
pub trait Interval:
    Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
{
    /// The type of this interval's (inclusive) bounds.
    type Bound: Bound;

    /// Return the inclusive lower bound of this interval.
    fn lower(&self) -> Self::Bound;
    /// Return the inclusive upper bound of this interval.
    fn upper(&self) -> Self::Bound;
    /// Set the inclusive lower bound of this interval.
    fn set_lower(&mut self, bound: Self::Bound);
    /// Set the inclusive upper bound of this interval.
    fn set_upper(&mut self, bound: Self::Bound);
    /// Append the case folded equivalents of this interval onto `intervals`.
    ///
    /// This returns an error if the necessary case mapping data is not
    /// available.
    fn case_fold_simple(
        &self,
        intervals: &mut Vec<Self>,
    ) -> Result<(), unicode::CaseFoldError>;

    /// Create a new interval.
    fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
        let mut int = Self::default();
        // Swap the bounds if they were given out of order, so the interval
        // is always non-empty with lower <= upper.
        if lower <= upper {
            int.set_lower(lower);
            int.set_upper(upper);
        } else {
            int.set_lower(upper);
            int.set_upper(lower);
        }
        int
    }

    /// Union the given overlapping range into this range.
    ///
    /// If the two ranges aren't contiguous, then this returns `None`.
    fn union(&self, other: &Self) -> Option<Self> {
        if !self.is_contiguous(other) {
            return None;
        }
        let lower = cmp::min(self.lower(), other.lower());
        let upper = cmp::max(self.upper(), other.upper());
        Some(Self::create(lower, upper))
    }

    /// Intersect this range with the given range and return the result.
    ///
    /// If the intersection is empty, then this returns `None`.
    fn intersect(&self, other: &Self) -> Option<Self> {
        let lower = cmp::max(self.lower(), other.lower());
        let upper = cmp::min(self.upper(), other.upper());
        if lower <= upper {
            Some(Self::create(lower, upper))
        } else {
            None
        }
    }

    /// Subtract the given range from this range and return the resulting
    /// ranges.
    ///
    /// If subtraction would result in an empty range, then no ranges are
    /// returned.
    fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
        if self.is_subset(other) {
            return (None, None);
        }
        if self.is_intersection_empty(other) {
            // Intervals are `Copy`, so copy rather than `clone()`
            // (clippy::clone_on_copy).
            return (Some(*self), None);
        }
        let add_lower = other.lower() > self.lower();
        let add_upper = other.upper() < self.upper();
        // We know this because !self.is_subset(other) and the ranges have
        // a non-empty intersection.
        assert!(add_lower || add_upper);
        let mut ret = (None, None);
        if add_lower {
            let upper = other.lower().decrement();
            ret.0 = Some(Self::create(self.lower(), upper));
        }
        if add_upper {
            let lower = other.upper().increment();
            let range = Self::create(lower, self.upper());
            if ret.0.is_none() {
                ret.0 = Some(range);
            } else {
                ret.1 = Some(range);
            }
        }
        ret
    }

    /// Returns true if and only if the two ranges are contiguous. Two ranges
    /// are contiguous if and only if the ranges are either overlapping or
    /// adjacent.
    fn is_contiguous(&self, other: &Self) -> bool {
        let lower1 = self.lower().as_u32();
        let upper1 = self.upper().as_u32();
        let lower2 = other.lower().as_u32();
        let upper2 = other.upper().as_u32();
        // `saturating_add(1)` makes adjacent ranges count as contiguous.
        cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
    }

    /// Returns true if and only if the intersection of this range and the
    /// other range is empty.
    fn is_intersection_empty(&self, other: &Self) -> bool {
        let (lower1, upper1) = (self.lower(), self.upper());
        let (lower2, upper2) = (other.lower(), other.upper());
        cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
    }

    /// Returns true if and only if this range is a subset of the other range.
    fn is_subset(&self, other: &Self) -> bool {
        let (lower1, upper1) = (self.lower(), self.upper());
        let (lower2, upper2) = (other.lower(), other.upper());
        (lower2 <= lower1 && lower1 <= upper2)
            && (lower2 <= upper1 && upper1 <= upper2)
    }
}
/// The boundary value type of an interval (implemented below for `u8` and
/// `char`).
pub trait Bound:
    Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
{
    /// The minimum representable bound value.
    fn min_value() -> Self;
    /// The maximum representable bound value.
    fn max_value() -> Self;
    /// Convert this bound to a `u32`, used for adjacency arithmetic in
    /// `Interval::is_contiguous`.
    fn as_u32(self) -> u32;
    /// Return the next greater bound value. The provided impls panic when
    /// called on `max_value()`.
    fn increment(self) -> Self;
    /// Return the next smaller bound value. The provided impls panic when
    /// called on `min_value()`.
    fn decrement(self) -> Self;
}
impl Bound for u8 {
    fn min_value() -> Self {
        u8::MIN
    }

    fn max_value() -> Self {
        u8::MAX
    }

    fn as_u32(self) -> u32 {
        u32::from(self)
    }

    // Panics on overflow, i.e., when `self == u8::MAX`. The canonical
    // ordering invariant (see `IntervalSet::negate`) keeps callers from
    // doing that.
    fn increment(self) -> Self {
        self.checked_add(1).unwrap()
    }

    // Panics on underflow, i.e., when `self == u8::MIN`.
    fn decrement(self) -> Self {
        self.checked_sub(1).unwrap()
    }
}
impl Bound for char {
    fn min_value() -> Self {
        '\x00'
    }

    fn max_value() -> Self {
        '\u{10FFFF}'
    }

    fn as_u32(self) -> u32 {
        u32::from(self)
    }

    // Steps to the next valid Unicode scalar value, jumping over the
    // surrogate range U+D800..=U+DFFF (which are not valid `char`s).
    // Panics when called on U+10FFFF.
    fn increment(self) -> Self {
        match self {
            '\u{D7FF}' => '\u{E000}',
            c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(),
        }
    }

    // Inverse of `increment`: steps backwards, jumping over the surrogate
    // range. Panics when called on U+0000.
    fn decrement(self) -> Self {
        match self {
            '\u{E000}' => '\u{D7FF}',
            c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(),
        }
    }
}
// Tests for interval sets are written in src/hir.rs against the public API.

3214
vendor/regex-syntax/src/hir/literal.rs vendored Normal file

File diff suppressed because it is too large Load Diff

3873
vendor/regex-syntax/src/hir/mod.rs vendored Normal file

File diff suppressed because it is too large Load Diff

608
vendor/regex-syntax/src/hir/print.rs vendored Normal file
View File

@@ -0,0 +1,608 @@
/*!
This module provides a regular expression printer for `Hir`.
*/
use core::fmt;
use crate::{
hir::{
self,
visitor::{self, Visitor},
Hir, HirKind,
},
is_meta_character,
};
/// A builder for constructing a printer.
///
/// Note that since a printer doesn't have any configuration knobs, this type
/// remains unexported.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    // Zero-sized private field: there are no configuration knobs yet, but
    // this keeps the door open to add some without a breaking change.
    _priv: (),
}
impl Default for PrinterBuilder {
fn default() -> PrinterBuilder {
PrinterBuilder::new()
}
}
impl PrinterBuilder {
    /// Returns a builder with the default (empty) configuration.
    fn new() -> PrinterBuilder {
        PrinterBuilder { _priv: () }
    }

    /// Builds a `Printer` from this (configuration-free) builder.
    fn build(&self) -> Printer {
        Printer { _priv: () }
    }
}
/// A printer for a regular expression's high-level intermediate
/// representation.
///
/// A printer converts a high-level intermediate representation (HIR) to a
/// regular expression pattern string. This particular printer uses constant
/// stack space and heap space proportional to the size of the HIR.
///
/// Since this printer is only using the HIR, the pattern it prints will likely
/// not resemble the original pattern at all. For example, a pattern like
/// `\pL` will have its entire class written out.
///
/// The purpose of this printer is to provide a means to mutate an HIR and then
/// build a regular expression from the result of that mutation. (A regex
/// library could provide a constructor from this HIR explicitly, but that
/// creates an unnecessary public coupling between the regex library and this
/// specific HIR representation.)
#[derive(Debug)]
pub struct Printer {
    // No configuration state yet; see `PrinterBuilder`.
    _priv: (),
}
impl Printer {
    /// Create a new printer.
    pub fn new() -> Printer {
        PrinterBuilder::new().build()
    }

    /// Print the given `Hir` to the given writer. The writer must implement
    /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
    /// here are a `fmt::Formatter` (which is available in `fmt::Display`
    /// implementations) or a `&mut String`.
    pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
        visitor::visit(hir, Writer { wtr })
    }
}
/// The `Visitor` implementation that renders an `Hir` as concrete regex
/// syntax into the wrapped `fmt::Write` implementation.
#[derive(Debug)]
struct Writer<W> {
    wtr: W,
}
impl<W: fmt::Write> Visitor for Writer<W> {
    type Output = ();
    type Err = fmt::Error;

    fn finish(self) -> fmt::Result {
        Ok(())
    }

    fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
        match *hir.kind() {
            HirKind::Empty => {
                // Technically an empty sub-expression could be "printed" by
                // just ignoring it, but in practice, you could have a
                // repetition operator attached to an empty expression, and you
                // really need something in the concrete syntax to make that
                // work as you'd expect.
                self.wtr.write_str(r"(?:)")?;
            }
            // Repetition operators are strictly suffix oriented; they are
            // handled in `visit_post` below.
            HirKind::Repetition(_) => {}
            HirKind::Literal(hir::Literal(ref bytes)) => {
                // See the comment on the 'Concat' and 'Alternation' case below
                // for why we put parens here. Literals are, conceptually,
                // a special case of concatenation where each element is a
                // character. The HIR flattens this into a Box<[u8]>, but we
                // still need to treat it like a concatenation for correct
                // printing. As a special case, we don't write parens if there
                // is only one character. One character means there is no
                // concat so we don't need parens. Adding parens would still be
                // correct, but we drop them here because it tends to create
                // rather noisy regexes even in simple cases.
                let result = core::str::from_utf8(bytes);
                // Count in chars when the bytes are valid UTF-8, otherwise
                // in raw bytes.
                let len = result.map_or(bytes.len(), |s| s.chars().count());
                if len > 1 {
                    self.wtr.write_str(r"(?:")?;
                }
                match result {
                    Ok(string) => {
                        for c in string.chars() {
                            self.write_literal_char(c)?;
                        }
                    }
                    Err(_) => {
                        for &b in bytes.iter() {
                            self.write_literal_byte(b)?;
                        }
                    }
                }
                if len > 1 {
                    self.wtr.write_str(r")")?;
                }
            }
            HirKind::Class(hir::Class::Unicode(ref cls)) => {
                if cls.ranges().is_empty() {
                    // An empty class matches nothing; print it as the
                    // intersection of two disjoint singleton classes.
                    return self.wtr.write_str("[a&&b]");
                }
                self.wtr.write_str("[")?;
                for range in cls.iter() {
                    if range.start() == range.end() {
                        self.write_literal_char(range.start())?;
                    } else if u32::from(range.start()) + 1
                        == u32::from(range.end())
                    {
                        // A two-element range is written as two literals
                        // instead of with a `-`.
                        self.write_literal_char(range.start())?;
                        self.write_literal_char(range.end())?;
                    } else {
                        self.write_literal_char(range.start())?;
                        self.wtr.write_str("-")?;
                        self.write_literal_char(range.end())?;
                    }
                }
                self.wtr.write_str("]")?;
            }
            HirKind::Class(hir::Class::Bytes(ref cls)) => {
                if cls.ranges().is_empty() {
                    // As above: an empty class matches nothing.
                    return self.wtr.write_str("[a&&b]");
                }
                self.wtr.write_str("(?-u:[")?;
                for range in cls.iter() {
                    if range.start() == range.end() {
                        self.write_literal_class_byte(range.start())?;
                    } else if range.start() + 1 == range.end() {
                        self.write_literal_class_byte(range.start())?;
                        self.write_literal_class_byte(range.end())?;
                    } else {
                        self.write_literal_class_byte(range.start())?;
                        self.wtr.write_str("-")?;
                        self.write_literal_class_byte(range.end())?;
                    }
                }
                self.wtr.write_str("])")?;
            }
            HirKind::Look(ref look) => match *look {
                hir::Look::Start => {
                    self.wtr.write_str(r"\A")?;
                }
                hir::Look::End => {
                    self.wtr.write_str(r"\z")?;
                }
                hir::Look::StartLF => {
                    self.wtr.write_str("(?m:^)")?;
                }
                hir::Look::EndLF => {
                    self.wtr.write_str("(?m:$)")?;
                }
                hir::Look::StartCRLF => {
                    self.wtr.write_str("(?mR:^)")?;
                }
                hir::Look::EndCRLF => {
                    self.wtr.write_str("(?mR:$)")?;
                }
                hir::Look::WordAscii => {
                    self.wtr.write_str(r"(?-u:\b)")?;
                }
                hir::Look::WordAsciiNegate => {
                    self.wtr.write_str(r"(?-u:\B)")?;
                }
                hir::Look::WordUnicode => {
                    self.wtr.write_str(r"\b")?;
                }
                hir::Look::WordUnicodeNegate => {
                    self.wtr.write_str(r"\B")?;
                }
                hir::Look::WordStartAscii => {
                    self.wtr.write_str(r"(?-u:\b{start})")?;
                }
                hir::Look::WordEndAscii => {
                    self.wtr.write_str(r"(?-u:\b{end})")?;
                }
                hir::Look::WordStartUnicode => {
                    self.wtr.write_str(r"\b{start}")?;
                }
                hir::Look::WordEndUnicode => {
                    self.wtr.write_str(r"\b{end}")?;
                }
                hir::Look::WordStartHalfAscii => {
                    self.wtr.write_str(r"(?-u:\b{start-half})")?;
                }
                hir::Look::WordEndHalfAscii => {
                    self.wtr.write_str(r"(?-u:\b{end-half})")?;
                }
                hir::Look::WordStartHalfUnicode => {
                    self.wtr.write_str(r"\b{start-half}")?;
                }
                hir::Look::WordEndHalfUnicode => {
                    self.wtr.write_str(r"\b{end-half}")?;
                }
            },
            HirKind::Capture(hir::Capture { ref name, .. }) => {
                self.wtr.write_str("(")?;
                if let Some(ref name) = *name {
                    write!(self.wtr, "?P<{}>", name)?;
                }
            }
            // Why do this? Wrapping concats and alts in non-capturing groups
            // is not *always* necessary, but is sometimes necessary. For
            // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)'
            // and not 'ab|c'. The former is clearly the intended meaning, but
            // the latter is actually 'alt(concat(a, b), c)'.
            //
            // It would be possible to only group these things in cases where
            // it's strictly necessary, but it requires knowing the parent
            // expression. And since this technique is simpler and always
            // correct, we take this route. More to the point, it is a non-goal
            // of an HIR printer to show a nice easy-to-read regex. Indeed,
            // its construction forbids it from doing so. Therefore, inserting
            // extra groups where they aren't necessary is perfectly okay.
            HirKind::Concat(_) | HirKind::Alternation(_) => {
                self.wtr.write_str(r"(?:")?;
            }
        }
        Ok(())
    }

    fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
        match *hir.kind() {
            // Handled during visit_pre
            HirKind::Empty
            | HirKind::Literal(_)
            | HirKind::Class(_)
            | HirKind::Look(_) => {}
            HirKind::Repetition(ref x) => {
                match (x.min, x.max) {
                    (0, Some(1)) => {
                        self.wtr.write_str("?")?;
                    }
                    (0, None) => {
                        self.wtr.write_str("*")?;
                    }
                    (1, None) => {
                        self.wtr.write_str("+")?;
                    }
                    (1, Some(1)) => {
                        // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'.
                        return Ok(());
                    }
                    (m, None) => {
                        write!(self.wtr, "{{{},}}", m)?;
                    }
                    (m, Some(n)) if m == n => {
                        write!(self.wtr, "{{{}}}", m)?;
                        // a{m} and a{m}? are always exactly equivalent.
                        return Ok(());
                    }
                    (m, Some(n)) => {
                        write!(self.wtr, "{{{},{}}}", m, n)?;
                    }
                }
                if !x.greedy {
                    self.wtr.write_str("?")?;
                }
            }
            // Close the group opened in visit_pre.
            HirKind::Capture(_)
            | HirKind::Concat(_)
            | HirKind::Alternation(_) => {
                self.wtr.write_str(r")")?;
            }
        }
        Ok(())
    }

    fn visit_alternation_in(&mut self) -> fmt::Result {
        self.wtr.write_str("|")
    }
}
impl<W: fmt::Write> Writer<W> {
    /// Write `c`, preceded by a backslash when it is a regex meta character.
    fn write_literal_char(&mut self, c: char) -> fmt::Result {
        if is_meta_character(c) {
            self.wtr.write_str("\\")?;
        }
        self.wtr.write_char(c)
    }

    /// Write a literal byte, using a `(?-u:\xHH)` escape for anything that
    /// is not printable ASCII.
    fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
        let printable =
            b.is_ascii() && !b.is_ascii_control() && !b.is_ascii_whitespace();
        if printable {
            self.write_literal_char(char::from(b))
        } else {
            write!(self.wtr, "(?-u:\\x{:02X})", b)
        }
    }

    /// Write a literal byte inside a byte-oriented class, using a bare
    /// `\xHH` escape for anything that is not printable ASCII.
    fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
        let printable =
            b.is_ascii() && !b.is_ascii_control() && !b.is_ascii_whitespace();
        if printable {
            self.write_literal_char(char::from(b))
        } else {
            write!(self.wtr, "\\x{:02X}", b)
        }
    }
}
#[cfg(test)]
mod tests {
use alloc::{
boxed::Box,
string::{String, ToString},
};
use crate::ParserBuilder;
use super::*;
fn roundtrip(given: &str, expected: &str) {
roundtrip_with(|b| b, given, expected);
}
fn roundtrip_bytes(given: &str, expected: &str) {
roundtrip_with(|b| b.utf8(false), given, expected);
}
fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
where
F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
{
let mut builder = ParserBuilder::new();
f(&mut builder);
let hir = builder.build().parse(given).unwrap();
let mut printer = Printer::new();
let mut dst = String::new();
printer.print(&hir, &mut dst).unwrap();
// Check that the result is actually valid.
builder.build().parse(&dst).unwrap();
assert_eq!(expected, dst);
}
#[test]
fn print_literal() {
roundtrip("a", "a");
roundtrip(r"\xff", "\u{FF}");
roundtrip_bytes(r"\xff", "\u{FF}");
roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
roundtrip("", "");
}
#[test]
fn print_class() {
roundtrip(r"[a]", r"a");
roundtrip(r"[ab]", r"[ab]");
roundtrip(r"[a-z]", r"[a-z]");
roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}");
roundtrip(r"[-]", r"\-");
roundtrip(r"[☃-⛄]", r"[☃-⛄]");
roundtrip(r"(?-u)[a]", r"a");
roundtrip(r"(?-u)[ab]", r"(?-u:[ab])");
roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");
// The following test that the printer escapes meta characters
// in character classes.
roundtrip(r"[\[]", r"\[");
roundtrip(r"[Z-_]", r"[Z-_]");
roundtrip(r"[Z-_--Z]", r"[\[-_]");
// The following test that the printer escapes meta characters
// in byte oriented character classes.
roundtrip_bytes(r"(?-u)[\[]", r"\[");
roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");
// This tests that an empty character class is correctly roundtripped.
#[cfg(feature = "unicode-gencat")]
roundtrip(r"\P{any}", r"[a&&b]");
roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]");
}
#[test]
fn print_anchor() {
roundtrip(r"^", r"\A");
roundtrip(r"$", r"\z");
roundtrip(r"(?m)^", r"(?m:^)");
roundtrip(r"(?m)$", r"(?m:$)");
}
#[test]
fn print_word_boundary() {
roundtrip(r"\b", r"\b");
roundtrip(r"\B", r"\B");
roundtrip(r"(?-u)\b", r"(?-u:\b)");
roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
}
#[test]
fn print_repetition() {
roundtrip("a?", "a?");
roundtrip("a??", "a??");
roundtrip("(?U)a?", "a??");
roundtrip("a*", "a*");
roundtrip("a*?", "a*?");
roundtrip("(?U)a*", "a*?");
roundtrip("a+", "a+");
roundtrip("a+?", "a+?");
roundtrip("(?U)a+", "a+?");
roundtrip("a{1}", "a");
roundtrip("a{2}", "a{2}");
roundtrip("a{1,}", "a+");
roundtrip("a{1,5}", "a{1,5}");
roundtrip("a{1}?", "a");
roundtrip("a{2}?", "a{2}");
roundtrip("a{1,}?", "a+?");
roundtrip("a{1,5}?", "a{1,5}?");
roundtrip("(?U)a{1}", "a");
roundtrip("(?U)a{2}", "a{2}");
roundtrip("(?U)a{1,}", "a+?");
roundtrip("(?U)a{1,5}", "a{1,5}?");
// Test that various zero-length repetitions always translate to an
// empty regex. This is more a property of HIR's smart constructors
// than the printer though.
roundtrip("a{0}", "(?:)");
roundtrip("(?:ab){0}", "(?:)");
#[cfg(feature = "unicode-gencat")]
{
roundtrip(r"\p{any}{0}", "(?:)");
roundtrip(r"\P{any}{0}", "(?:)");
}
}
#[test]
fn print_group() {
roundtrip("()", "((?:))");
roundtrip("(?P<foo>)", "(?P<foo>(?:))");
roundtrip("(?:)", "(?:)");
roundtrip("(a)", "(a)");
roundtrip("(?P<foo>a)", "(?P<foo>a)");
roundtrip("(?:a)", "a");
roundtrip("((((a))))", "((((a))))");
}
#[test]
fn print_alternation() {
roundtrip("|", "(?:(?:)|(?:))");
roundtrip("||", "(?:(?:)|(?:)|(?:))");
roundtrip("a|b", "[ab]");
roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
roundtrip("a|b|c", "[a-c]");
roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))");
roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))");
}
// This is a regression test that stresses a peculiarity of how the HIR
// is both constructed and printed. Namely, it is legal for a repetition
// to directly contain a concatenation. This particular construct isn't
// really possible to build from the concrete syntax directly, since you'd
// be forced to put the concatenation into (at least) a non-capturing
// group. Concurrently, the printer doesn't consider this case and just
// kind of naively prints the child expression and tacks on the repetition
// operator.
//
// As a result, if you attached '+' to a 'concat(a, b)', the printer gives
// you 'ab+', but clearly it really should be '(?:ab)+'.
//
// This bug isn't easy to surface because most ways of building an HIR
// come directly from the concrete syntax, and as mentioned above, it just
// isn't possible to build this kind of HIR from the concrete syntax.
// Nevertheless, this is definitely a bug.
//
// See: https://github.com/rust-lang/regex/issues/731
#[test]
fn regression_repetition_concat() {
let expr = Hir::concat(alloc::vec![
Hir::literal("x".as_bytes()),
Hir::repetition(hir::Repetition {
min: 1,
max: None,
greedy: true,
sub: Box::new(Hir::literal("ab".as_bytes())),
}),
Hir::literal("y".as_bytes()),
]);
assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());
let expr = Hir::concat(alloc::vec![
Hir::look(hir::Look::Start),
Hir::repetition(hir::Repetition {
min: 1,
max: None,
greedy: true,
sub: Box::new(Hir::concat(alloc::vec![
Hir::look(hir::Look::Start),
Hir::look(hir::Look::End),
])),
}),
Hir::look(hir::Look::End),
]);
assert_eq!(r"(?:\A\A\z\z)", expr.to_string());
}
// Just like regression_repetition_concat, but with the repetition using
// an alternation as a child expression instead.
//
// See: https://github.com/rust-lang/regex/issues/731
#[test]
fn regression_repetition_alternation() {
let expr = Hir::concat(alloc::vec![
Hir::literal("ab".as_bytes()),
Hir::repetition(hir::Repetition {
min: 1,
max: None,
greedy: true,
sub: Box::new(Hir::alternation(alloc::vec![
Hir::literal("cd".as_bytes()),
Hir::literal("ef".as_bytes()),
])),
}),
Hir::literal("gh".as_bytes()),
]);
assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string());
let expr = Hir::concat(alloc::vec![
Hir::look(hir::Look::Start),
Hir::repetition(hir::Repetition {
min: 1,
max: None,
greedy: true,
sub: Box::new(Hir::alternation(alloc::vec![
Hir::look(hir::Look::Start),
Hir::look(hir::Look::End),
])),
}),
Hir::look(hir::Look::End),
]);
assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string());
}
// This regression test is very similar in flavor to
// regression_repetition_concat in that the root of the issue lies in a
// peculiarity of how the HIR is represented and how the printer writes it
// out. Like the other regression, this one is also rooted in the fact that
// you can't produce the peculiar HIR from the concrete syntax. Namely, you
// just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally
// be in (at least) a non-capturing group. Why? Because the '|' has very
    // low precedence (lower than concatenation), and so something like 'ab|c'
// is actually 'alt(ab, c)'.
//
// See: https://github.com/rust-lang/regex/issues/516
#[test]
fn regression_alternation_concat() {
let expr = Hir::concat(alloc::vec![
Hir::literal("ab".as_bytes()),
Hir::alternation(alloc::vec![
Hir::literal("mn".as_bytes()),
Hir::literal("xy".as_bytes()),
]),
]);
assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string());
let expr = Hir::concat(alloc::vec![
Hir::look(hir::Look::Start),
Hir::alternation(alloc::vec![
Hir::look(hir::Look::Start),
Hir::look(hir::Look::End),
]),
]);
assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string());
}
}

3744
vendor/regex-syntax/src/hir/translate.rs vendored Normal file

File diff suppressed because it is too large Load Diff

215
vendor/regex-syntax/src/hir/visitor.rs vendored Normal file
View File

@@ -0,0 +1,215 @@
use alloc::{vec, vec::Vec};
use crate::hir::{self, Hir, HirKind};
/// A trait for visiting the high-level IR (HIR) in depth first order.
///
/// The principle aim of this trait is to enable callers to perform case
/// analysis on a high-level intermediate representation of a regular
/// expression without necessarily using recursion. In particular, this permits
/// callers to do case analysis with constant stack usage, which can be
/// important since the size of an HIR may be proportional to end user input.
///
/// Typical usage of this trait involves providing an implementation and then
/// running it using the [`visit`] function.
pub trait Visitor {
    /// The result of visiting an HIR.
    type Output;
    /// An error that visiting an HIR might return.
    type Err;
    /// All implementors of `Visitor` must provide a `finish` method, which
    /// yields the result of visiting the HIR or an error.
    fn finish(self) -> Result<Self::Output, Self::Err>;
    /// This method is called before beginning traversal of the HIR.
    ///
    /// The default implementation does nothing.
    fn start(&mut self) {}
    /// This method is called on an `Hir` before descending into child `Hir`
    /// nodes.
    ///
    /// The default implementation does nothing and returns `Ok(())`.
    fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
        Ok(())
    }
    /// This method is called on an `Hir` after descending all of its child
    /// `Hir` nodes.
    ///
    /// The default implementation does nothing and returns `Ok(())`.
    fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
        Ok(())
    }
    /// This method is called between child nodes of an alternation.
    ///
    /// The default implementation does nothing and returns `Ok(())`.
    fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
        Ok(())
    }
    /// This method is called between child nodes of a concatenation.
    ///
    /// The default implementation does nothing and returns `Ok(())`.
    fn visit_concat_in(&mut self) -> Result<(), Self::Err> {
        Ok(())
    }
}
/// Executes an implementation of `Visitor` in constant stack space.
///
/// This function will visit every node in the given `Hir` while calling
/// appropriate methods provided by the [`Visitor`] trait.
///
/// The primary use case for this method is when one wants to perform case
/// analysis over an `Hir` without using a stack size proportional to the depth
/// of the `Hir`. Namely, this method will instead use constant stack space,
/// but will use heap space proportional to the size of the `Hir`. This may be
/// desirable in cases where the size of `Hir` is proportional to end user
/// input.
///
/// If the visitor returns an error at any point, then visiting is stopped and
/// the error is returned.
pub fn visit<V: Visitor>(hir: &Hir, visitor: V) -> Result<V::Output, V::Err> {
    // Delegate to the heap-allocated driver so that traversal depth is
    // bounded by heap space instead of the call stack.
    let mut driver = HeapVisitor::new();
    driver.visit(hir, visitor)
}
/// HeapVisitor visits every item in an `Hir` recursively using constant stack
/// size and a heap size proportional to the size of the `Hir`.
struct HeapVisitor<'a> {
    /// A stack of `Hir` nodes. This is roughly analogous to the call stack
    /// used in a typical recursive visitor. Each entry pairs a node with
    /// the `Frame` tracking which of its children is currently being
    /// visited.
    stack: Vec<(&'a Hir, Frame<'a>)>,
}
/// Represents a single stack frame while performing structural induction over
/// an `Hir`.
enum Frame<'a> {
    /// A stack frame allocated just before descending into a repetition
    /// operator's child node.
    Repetition(&'a hir::Repetition),
    /// A stack frame allocated just before descending into a capture's child
    /// node.
    Capture(&'a hir::Capture),
    /// The stack frame used while visiting every child node of a concatenation
    /// of expressions.
    Concat {
        /// The child node we are currently visiting.
        head: &'a Hir,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [Hir],
    },
    /// The stack frame used while visiting every child node of an alternation
    /// of expressions.
    Alternation {
        /// The child node we are currently visiting.
        head: &'a Hir,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [Hir],
    },
}
impl<'a> HeapVisitor<'a> {
    /// Create a fresh heap visitor with an empty explicit stack.
    fn new() -> HeapVisitor<'a> {
        HeapVisitor { stack: vec![] }
    }

    /// Drive `visitor` over `hir` iteratively, using `self.stack` in place
    /// of the call stack so that stack usage stays constant.
    fn visit<V: Visitor>(
        &mut self,
        mut hir: &'a Hir,
        mut visitor: V,
    ) -> Result<V::Output, V::Err> {
        self.stack.clear();
        visitor.start();
        loop {
            visitor.visit_pre(hir)?;
            if let Some(frame) = self.induct(hir) {
                let next = frame.child();
                self.stack.push((hir, frame));
                hir = next;
                continue;
            }
            // No frame means `hir` has no children: it is a base case and
            // can be post-visited immediately.
            visitor.visit_post(hir)?;
            // Unwind the explicit stack until it is exhausted or until
            // another inductive step turns up.
            loop {
                let (post_hir, frame) = match self.stack.pop() {
                    None => return visitor.finish(),
                    Some(top) => top,
                };
                // Concat/alternation frames may still have siblings left to
                // visit.
                if let Some(frame) = self.pop(frame) {
                    match frame {
                        Frame::Alternation { .. } => {
                            visitor.visit_alternation_in()?;
                        }
                        Frame::Concat { .. } => {
                            visitor.visit_concat_in()?;
                        }
                        _ => {}
                    }
                    hir = frame.child();
                    self.stack.push((post_hir, frame));
                    break;
                }
                // Every child of `post_hir` has been visited, so it can be
                // post-visited now.
                visitor.visit_post(post_hir)?;
            }
        }
    }

    /// Build a stack frame for the given HIR if one is needed (which occurs
    /// if and only if there are child nodes in the HIR). Otherwise, return
    /// `None`.
    fn induct(&mut self, hir: &'a Hir) -> Option<Frame<'a>> {
        match *hir.kind() {
            HirKind::Repetition(ref x) => Some(Frame::Repetition(x)),
            HirKind::Capture(ref x) => Some(Frame::Capture(x)),
            // `split_first` yields `None` for empty sequences, which is
            // exactly the "no children" case.
            HirKind::Concat(ref x) => x
                .split_first()
                .map(|(head, tail)| Frame::Concat { head, tail }),
            HirKind::Alternation(ref x) => x
                .split_first()
                .map(|(head, tail)| Frame::Alternation { head, tail }),
            _ => None,
        }
    }

    /// Pops the given frame. If the frame has an additional inductive step,
    /// then return it, otherwise return `None`.
    fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
        match induct {
            // Repetitions and captures have exactly one child, so once it
            // has been visited, the frame is exhausted.
            Frame::Repetition(_) | Frame::Capture(_) => None,
            Frame::Concat { tail, .. } => tail
                .split_first()
                .map(|(head, tail)| Frame::Concat { head, tail }),
            Frame::Alternation { tail, .. } => tail
                .split_first()
                .map(|(head, tail)| Frame::Alternation { head, tail }),
        }
    }
}
impl<'a> Frame<'a> {
    /// Perform the next inductive step on this frame and return the next
    /// child HIR node to visit.
    fn child(&self) -> &'a Hir {
        match *self {
            Frame::Concat { head, .. }
            | Frame::Alternation { head, .. } => head,
            Frame::Repetition(x) => &x.sub,
            Frame::Capture(x) => &x.sub,
        }
    }
}

431
vendor/regex-syntax/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,431 @@
/*!
This crate provides a robust regular expression parser.
This crate defines two primary types:
* [`Ast`](ast::Ast) is the abstract syntax of a regular expression.
An abstract syntax corresponds to a *structured representation* of the
concrete syntax of a regular expression, where the concrete syntax is the
pattern string itself (e.g., `foo(bar)+`). Given some abstract syntax, it
can be converted back to the original concrete syntax (modulo some details,
like whitespace). To a first approximation, the abstract syntax is complex
and difficult to analyze.
* [`Hir`](hir::Hir) is the high-level intermediate representation
("HIR" or "high-level IR" for short) of a regular expression. It corresponds to
an intermediate state of a regular expression that sits between the abstract
syntax and the low level compiled opcodes that are eventually responsible for
executing a regular expression search. Given some high-level IR, it is not
possible to produce the original concrete syntax (although it is possible to
produce an equivalent concrete syntax, but it will likely scarcely resemble
the original pattern). To a first approximation, the high-level IR is simple
and easy to analyze.
These two types come with conversion routines:
* An [`ast::parse::Parser`] converts concrete syntax (a `&str`) to an
[`Ast`](ast::Ast).
* A [`hir::translate::Translator`] converts an [`Ast`](ast::Ast) to a
[`Hir`](hir::Hir).
As a convenience, the above two conversion routines are combined into one via
the top-level [`Parser`] type. This `Parser` will first convert your pattern to
an `Ast` and then convert the `Ast` to an `Hir`. It's also exposed as top-level
[`parse`] free function.
# Example
This example shows how to parse a pattern string into its HIR:
```
use regex_syntax::{hir::Hir, parse};
let hir = parse("a|b")?;
assert_eq!(hir, Hir::alternation(vec![
Hir::literal("a".as_bytes()),
Hir::literal("b".as_bytes()),
]));
# Ok::<(), Box<dyn std::error::Error>>(())
```
# Concrete syntax supported
The concrete syntax is documented as part of the public API of the
[`regex` crate](https://docs.rs/regex/%2A/regex/#syntax).
# Input safety
A key feature of this library is that it is safe to use with end user facing
input. This plays a significant role in the internal implementation. In
particular:
1. Parsers provide a `nest_limit` option that permits callers to control how
deeply nested a regular expression is allowed to be. This makes it possible
to do case analysis over an `Ast` or an `Hir` using recursion without
worrying about stack overflow.
2. Since relying on a particular stack size is brittle, this crate goes to
great lengths to ensure that all interactions with both the `Ast` and the
`Hir` do not use recursion. Namely, they use constant stack space and heap
space proportional to the size of the original pattern string (in bytes).
This includes the type's corresponding destructors. (One exception to this
is literal extraction, but this will eventually get fixed.)
# Error reporting
The `Display` implementations on all `Error` types exposed in this library
provide nice human readable errors that are suitable for showing to end users
in a monospace font.
# Literal extraction
This crate provides limited support for [literal extraction from `Hir`
values](hir::literal). Be warned that literal extraction uses recursion, and
therefore, stack size proportional to the size of the `Hir`.
The purpose of literal extraction is to speed up searches. That is, if you
know a regular expression must match a prefix or suffix literal, then it is
often quicker to search for instances of that literal, and then confirm or deny
the match using the full regular expression engine. These optimizations are
done automatically in the `regex` crate.
# Crate features
An important feature provided by this crate is its Unicode support. This
includes things like case folding, boolean properties, general categories,
scripts and Unicode-aware support for the Perl classes `\w`, `\s` and `\d`.
However, a downside of this support is that it requires bundling several
Unicode data tables that are substantial in size.
A fair number of use cases do not require full Unicode support. For this
reason, this crate exposes a number of features to control which Unicode
data is available.
If a regular expression attempts to use a Unicode feature that is not available
because the corresponding crate feature was disabled, then translating that
regular expression to an `Hir` will return an error. (It is still possible to
construct an `Ast` for such a regular expression, since Unicode data is not
used until translation to an `Hir`.) Stated differently, enabling or disabling
any of the features below can only add or subtract from the total set of valid
regular expressions. Enabling or disabling a feature will never modify the
match semantics of a regular expression.
The following features are available:
* **std** -
Enables support for the standard library. This feature is enabled by default.
When disabled, only `core` and `alloc` are used. Otherwise, enabling `std`
generally just enables `std::error::Error` trait impls for the various error
types.
* **unicode** -
Enables all Unicode features. This feature is enabled by default, and will
always cover all Unicode features, even if more are added in the future.
* **unicode-age** -
Provide the data for the
[Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age).
This makes it possible to use classes like `\p{Age:6.0}` to refer to all
codepoints first introduced in Unicode 6.0.
* **unicode-bool** -
Provide the data for numerous Unicode boolean properties. The full list
is not included here, but contains properties like `Alphabetic`, `Emoji`,
`Lowercase`, `Math`, `Uppercase` and `White_Space`.
* **unicode-case** -
Provide the data for case insensitive matching using
[Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
* **unicode-gencat** -
Provide the data for
[Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
This includes, but is not limited to, `Decimal_Number`, `Letter`,
`Math_Symbol`, `Number` and `Punctuation`.
* **unicode-perl** -
Provide the data for supporting the Unicode-aware Perl character classes,
corresponding to `\w`, `\s` and `\d`. This is also necessary for using
Unicode-aware word boundary assertions. Note that if this feature is
disabled, the `\s` and `\d` character classes are still available if the
`unicode-bool` and `unicode-gencat` features are enabled, respectively.
* **unicode-script** -
Provide the data for
[Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/).
This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`,
`Latin` and `Thai`.
* **unicode-segment** -
Provide the data necessary to provide the properties used to implement the
[Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/).
This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and
`\p{sb=ATerm}`.
* **arbitrary** -
Enabling this feature introduces a public dependency on the
[`arbitrary`](https://crates.io/crates/arbitrary)
crate. Namely, it implements the `Arbitrary` trait from that crate for the
[`Ast`](crate::ast::Ast) type. This feature is disabled by default.
*/
#![no_std]
#![forbid(unsafe_code)]
#![deny(missing_docs, rustdoc::broken_intra_doc_links)]
#![warn(missing_debug_implementations)]
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
#[cfg(any(test, feature = "std"))]
extern crate std;
extern crate alloc;
pub use crate::{
error::Error,
parser::{parse, Parser, ParserBuilder},
unicode::UnicodeWordError,
};
use alloc::string::String;
pub mod ast;
mod debug;
mod either;
mod error;
pub mod hir;
mod parser;
mod rank;
mod unicode;
mod unicode_tables;
pub mod utf8;
/// Escapes all regular expression meta characters in `text`.
///
/// The string returned may be safely used as a literal in a regular
/// expression.
pub fn escape(text: &str) -> String {
    // Accumulate into a fresh buffer via the in-place variant.
    let mut escaped = String::new();
    escape_into(text, &mut escaped);
    escaped
}
/// Escapes all meta characters in `text` and writes the result into `buf`.
///
/// This will append escape characters into the given buffer. The characters
/// that are appended are safe to use as a literal in a regular expression.
pub fn escape_into(text: &str, buf: &mut String) {
    // Reserve at least one byte per input char; escapes may need more, but
    // this avoids most reallocations up front.
    buf.reserve(text.len());
    text.chars().for_each(|ch| {
        if is_meta_character(ch) {
            buf.push('\\');
        }
        buf.push(ch);
    });
}
/// Returns true if the given character has significance in a regex.
///
/// Generally speaking, these are the only characters which _must_ be escaped
/// in order to match their literal meaning. For example, to match a literal
/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For
/// example, `-` is treated as a meta character because of its significance
/// for writing ranges inside of character classes, but the regex `-` will
/// match a literal `-` because `-` has no special meaning outside of character
/// classes.
///
/// In order to determine whether a character may be escaped at all, the
/// [`is_escapeable_character`] routine should be used. The difference between
/// `is_meta_character` and `is_escapeable_character` is that the latter will
/// return true for some characters that are _not_ meta characters. For
/// example, `%` and `\%` both match a literal `%` in all contexts. In other
/// words, `is_escapeable_character` includes "superfluous" escapes.
///
/// Note that the set of characters for which this function returns `true` or
/// `false` is fixed and won't change in a semver compatible release. (In this
/// case, "semver compatible release" actually refers to the `regex` crate
/// itself, since reducing or expanding the set of meta characters would be a
/// breaking change for not just `regex-syntax` but also `regex` itself.)
///
/// # Example
///
/// ```
/// use regex_syntax::is_meta_character;
///
/// assert!(is_meta_character('?'));
/// assert!(is_meta_character('-'));
/// assert!(is_meta_character('&'));
/// assert!(is_meta_character('#'));
///
/// assert!(!is_meta_character('%'));
/// assert!(!is_meta_character('/'));
/// assert!(!is_meta_character('!'));
/// assert!(!is_meta_character('"'));
/// assert!(!is_meta_character('e'));
/// ```
pub fn is_meta_character(c: char) -> bool {
    // The fixed set of characters with regex significance; see the doc
    // comment above for the semver stability guarantee.
    matches!(
        c,
        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{'
            | '}' | '^' | '$' | '#' | '&' | '-' | '~'
    )
}
/// Returns true if the given character can be escaped in a regex.
///
/// This returns true in all cases that `is_meta_character` returns true, but
/// also returns true in some cases where `is_meta_character` returns false.
/// For example, `%` is not a meta character, but it is escapable. That is,
/// `%` and `\%` both match a literal `%` in all contexts.
///
/// The purpose of this routine is to provide knowledge about what characters
/// may be escaped. Namely, most regex engines permit "superfluous" escapes
/// where characters without any special significance may be escaped even
/// though there is no actual _need_ to do so.
///
/// This will return false for some characters. For example, `e` is not
/// escapable. Therefore, `\e` will either result in a parse error (which is
/// true today), or it could backwards compatibly evolve into a new construct
/// with its own meaning. Indeed, that is the purpose of banning _some_
/// superfluous escapes: it provides a way to evolve the syntax in a compatible
/// manner.
///
/// # Example
///
/// ```
/// use regex_syntax::is_escapeable_character;
///
/// assert!(is_escapeable_character('?'));
/// assert!(is_escapeable_character('-'));
/// assert!(is_escapeable_character('&'));
/// assert!(is_escapeable_character('#'));
/// assert!(is_escapeable_character('%'));
/// assert!(is_escapeable_character('/'));
/// assert!(is_escapeable_character('!'));
/// assert!(is_escapeable_character('"'));
///
/// assert!(!is_escapeable_character('e'));
/// ```
pub fn is_escapeable_character(c: char) -> bool {
    // Certainly escapable if it's a meta character.
    if is_meta_character(c) {
        return true;
    }
    // Any character that isn't ASCII is definitely not escapable. There's
    // no real need to allow things like \☃ right?
    if !c.is_ascii() {
        return false;
    }
    // Of the remaining ASCII characters, everything is escapable except:
    //
    // * Letters and digits. Things like \3 are either octal (when enabled)
    //   or an error, and letters are reserved for adding new syntax in a
    //   backwards compatible way.
    // * '<' and '>'. The escape sequences \< and \> are significant (word
    //   boundary assertions), so these must remain *not* escapable here.
    !matches!(c, '0'..='9' | 'A'..='Z' | 'a'..='z' | '<' | '>')
}
/// Returns true if and only if the given character is a Unicode word
/// character.
///
/// A Unicode word character is defined by
/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
/// In particular, a character
/// is considered a word character if it is in either of the `Alphabetic` or
/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark`
/// or `Connector_Punctuation` general categories.
///
/// # Panics
///
/// If the `unicode-perl` feature is not enabled, then this function
/// panics. For this reason, it is recommended that callers use
/// [`try_is_word_character`] instead.
pub fn is_word_character(c: char) -> bool {
    // Delegates to the fallible variant; the only error case is the
    // `unicode-perl` feature being disabled, which panics here as
    // documented above.
    try_is_word_character(c).expect("unicode-perl feature must be enabled")
}
/// Returns true if and only if the given character is a Unicode word
/// character.
///
/// A Unicode word character is defined by
/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
/// In particular, a character
/// is considered a word character if it is in either of the `Alphabetic` or
/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark`
/// or `Connector_Punctuation` general categories.
///
/// # Errors
///
/// If the `unicode-perl` feature is not enabled, then this function always
/// returns an error.
pub fn try_is_word_character(
    c: char,
) -> core::result::Result<bool, UnicodeWordError> {
    // Thin forwarding wrapper around the crate-internal Unicode data query.
    unicode::is_word_character(c)
}
/// Returns true if and only if the given character is an ASCII word character.
///
/// An ASCII word character is defined by the following character class:
/// `[_0-9a-zA-Z]`.
pub fn is_word_byte(c: u8) -> bool {
    // Equivalent to membership in the class `[_0-9a-zA-Z]`.
    matches!(c, b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z')
}
#[cfg(test)]
mod tests {
    use alloc::string::ToString;
    use super::*;
    // Every meta character must come back prefixed with a backslash.
    #[test]
    fn escape_meta() {
        assert_eq!(
            escape(r"\.+*?()|[]{}^$#&-~"),
            r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~".to_string()
        );
    }
    #[test]
    fn word_byte() {
        assert!(is_word_byte(b'a'));
        assert!(!is_word_byte(b'-'));
    }
    // Spot-checks word characters across several Unicode versions; only
    // meaningful when the word-character data tables are compiled in.
    #[test]
    #[cfg(feature = "unicode-perl")]
    fn word_char() {
        assert!(is_word_character('a'), "ASCII");
        assert!(is_word_character('à'), "Latin-1");
        assert!(is_word_character('β'), "Greek");
        assert!(is_word_character('\u{11011}'), "Brahmi (Unicode 6.0)");
        assert!(is_word_character('\u{11611}'), "Modi (Unicode 7.0)");
        assert!(is_word_character('\u{11711}'), "Ahom (Unicode 8.0)");
        assert!(is_word_character('\u{17828}'), "Tangut (Unicode 9.0)");
        assert!(is_word_character('\u{1B1B1}'), "Nushu (Unicode 10.0)");
        assert!(is_word_character('\u{16E40}'), "Medefaidrin (Unicode 11.0)");
        assert!(!is_word_character('-'));
        assert!(!is_word_character('☃'));
    }
    // Without `unicode-perl`, the panicking variant must panic...
    #[test]
    #[should_panic]
    #[cfg(not(feature = "unicode-perl"))]
    fn word_char_disabled_panic() {
        assert!(is_word_character('a'));
    }
    // ...and the fallible variant must report an error instead.
    #[test]
    #[cfg(not(feature = "unicode-perl"))]
    fn word_char_disabled_error() {
        assert!(try_is_word_character('a').is_err());
    }
}

254
vendor/regex-syntax/src/parser.rs vendored Normal file
View File

@@ -0,0 +1,254 @@
use crate::{ast, hir, Error};
/// A convenience routine for parsing a regex using default options.
///
/// This is equivalent to `Parser::new().parse(pattern)`.
///
/// If you need to set non-default options, then use a [`ParserBuilder`].
///
/// This routine returns an [`Hir`](hir::Hir) value. Namely, it automatically
/// parses the pattern as an [`Ast`](ast::Ast) and then invokes the translator
/// to convert the `Ast` into an `Hir`. If you need access to the `Ast`, then
/// you should use a [`ast::parse::Parser`].
pub fn parse(pattern: &str) -> Result<hir::Hir, Error> {
    // Builds a default-configured parser (AST parse + HIR translation) on
    // each call.
    Parser::new().parse(pattern)
}
/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
///
/// This type combines the builder options for both the [AST
/// `ParserBuilder`](ast::parse::ParserBuilder) and the [HIR
/// `TranslatorBuilder`](hir::translate::TranslatorBuilder).
#[derive(Clone, Debug, Default)]
pub struct ParserBuilder {
    // Options forwarded to the concrete-syntax (AST) parser, e.g.
    // `nest_limit` and `octal`.
    ast: ast::parse::ParserBuilder,
    // Options forwarded to the AST->HIR translator, e.g. `utf8` and the
    // default flag settings.
    hir: hir::translate::TranslatorBuilder,
}
impl ParserBuilder {
/// Create a new parser builder with a default configuration.
pub fn new() -> ParserBuilder {
ParserBuilder::default()
}
/// Build a parser from this configuration with the given pattern.
pub fn build(&self) -> Parser {
Parser { ast: self.ast.build(), hir: self.hir.build() }
}
/// Set the nesting limit for this parser.
///
/// The nesting limit controls how deep the abstract syntax tree is allowed
/// to be. If the AST exceeds the given limit (e.g., with too many nested
/// groups), then an error is returned by the parser.
///
/// The purpose of this limit is to act as a heuristic to prevent stack
/// overflow for consumers that do structural induction on an `Ast` using
/// explicit recursion. While this crate never does this (instead using
/// constant stack space and moving the call stack to the heap), other
/// crates may.
///
/// This limit is not checked until the entire Ast is parsed. Therefore,
/// if callers want to put a limit on the amount of heap space used, then
/// they should impose a limit on the length, in bytes, of the concrete
/// pattern string. In particular, this is viable since this parser
/// implementation will limit itself to heap space proportional to the
/// length of the pattern string.
///
/// Note that a nest limit of `0` will return a nest limit error for most
/// patterns but not all. For example, a nest limit of `0` permits `a` but
/// not `ab`, since `ab` requires a concatenation, which results in a nest
/// depth of `1`. In general, a nest limit is not something that manifests
/// in an obvious way in the concrete syntax, therefore, it should not be
/// used in a granular way.
pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
self.ast.nest_limit(limit);
self
}
/// Whether to support octal syntax or not.
///
/// Octal syntax is a little-known way of uttering Unicode codepoints in
/// a regular expression. For example, `a`, `\x61`, `\u0061` and
/// `\141` are all equivalent regular expressions, where the last example
/// shows octal syntax.
///
/// While supporting octal syntax isn't in and of itself a problem, it does
/// make good error messages harder. That is, in PCRE based regex engines,
/// syntax like `\0` invokes a backreference, which is explicitly
/// unsupported in Rust's regex engine. However, many users expect it to
/// be supported. Therefore, when octal support is disabled, the error
/// message will explicitly mention that backreferences aren't supported.
///
/// Octal syntax is disabled by default.
pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
self.ast.octal(yes);
self
}
/// When disabled, translation will permit the construction of a regular
/// expression that may match invalid UTF-8.
///
/// When enabled (the default), the translator is guaranteed to produce an
/// expression that, for non-empty matches, will only ever produce spans
/// that are entirely valid UTF-8 (otherwise, the translator will return an
/// error).
///
/// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
/// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
/// syntax) will be allowed even though they can produce matches that split
/// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
/// matches, and it is expected that the regex engine itself must handle
/// these cases if necessary (perhaps by suppressing any zero-width matches
/// that split a codepoint).
pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder {
self.hir.utf8(yes);
self
}
/// Enable verbose mode in the regular expression.
///
/// When enabled, verbose mode permits insignificant whitespace in many
/// places in the regular expression, as well as comments. Comments are
/// started using `#` and continue until the end of the line.
///
/// By default, this is disabled. It may be selectively enabled in the
/// regular expression by using the `x` flag regardless of this setting.
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
    // Verbose mode changes how the pattern *text* is read, so it is an
    // AST parser option — unlike the flag defaults, which are handled by
    // the HIR translator.
    self.ast.ignore_whitespace(yes);
    self
}

/// Enable or disable the case insensitive flag by default.
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `i` flag.
pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder {
    self.hir.case_insensitive(yes);
    self
}

/// Enable or disable the multi-line matching flag by default.
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `m` flag.
pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder {
    self.hir.multi_line(yes);
    self
}

/// Enable or disable the "dot matches any character" flag by default.
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `s` flag.
pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
    self.hir.dot_matches_new_line(yes);
    self
}

/// Enable or disable the CRLF mode flag by default.
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `R` flag.
///
/// When CRLF mode is enabled, the following happens:
///
/// * Unless `dot_matches_new_line` is enabled, `.` will match any character
/// except for `\r` and `\n`.
/// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,
/// `\r` and `\n` as line terminators. And in particular, neither will
/// match between a `\r` and a `\n`.
pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder {
    self.hir.crlf(yes);
    self
}

/// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
///
/// Namely, instead of `.` (by default) matching everything except for `\n`,
/// this will cause `.` to match everything except for the byte given.
///
/// If `.` is used in a context where Unicode mode is enabled and this byte
/// isn't ASCII, then an error will be returned. When Unicode mode is
/// disabled, then any byte is permitted, but will return an error if UTF-8
/// mode is enabled and it is a non-ASCII byte.
///
/// In short, any ASCII value for a line terminator is always okay. But a
/// non-ASCII byte might result in an error depending on whether Unicode
/// mode or UTF-8 mode are enabled.
///
/// Note that if `R` mode is enabled then it always takes precedence and
/// the line terminator will be treated as `\r` and `\n` simultaneously.
///
/// Note also that this *doesn't* impact the look-around assertions
/// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
/// configuration in the regex engine itself.
pub fn line_terminator(&mut self, byte: u8) -> &mut ParserBuilder {
    self.hir.line_terminator(byte);
    self
}

/// Enable or disable the "swap greed" flag by default.
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `U` flag.
pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder {
    self.hir.swap_greed(yes);
    self
}

/// Enable or disable the Unicode flag (`u`) by default.
///
/// By default this is **enabled**. It may alternatively be selectively
/// disabled in the regular expression itself via the `u` flag.
///
/// Note that unless `utf8` is disabled (it's enabled by default), a
/// regular expression will fail to parse if Unicode mode is disabled and a
/// sub-expression could possibly match invalid UTF-8.
pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
    self.hir.unicode(yes);
    self
}
}
/// A convenience parser for regular expressions.
///
/// This parser takes as input a regular expression pattern string (the
/// "concrete syntax") and returns a high-level intermediate representation
/// (the HIR) suitable for most types of analysis. In particular, this parser
/// hides the intermediate state of producing an AST (the "abstract syntax").
/// The AST is itself far more complex than the HIR, so this parser serves as a
/// convenience for never having to deal with it at all.
///
/// If callers have more fine grained use cases that need an AST, then please
/// see the [`ast::parse`] module.
///
/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
#[derive(Clone, Debug)]
pub struct Parser {
    /// Stage 1: concrete syntax (pattern string) -> AST.
    ast: ast::parse::Parser,
    /// Stage 2: AST -> HIR.
    hir: hir::translate::Translator,
}
impl Parser {
    /// Build a parser using the default configuration.
    ///
    /// Run the returned parser with the `parse` method, which yields a high
    /// level intermediate representation (HIR) of the given regular
    /// expression.
    ///
    /// Configuration options for the parser are available via
    /// [`ParserBuilder`].
    pub fn new() -> Parser {
        ParserBuilder::new().build()
    }

    /// Parse the regular expression into a high level intermediate
    /// representation (HIR).
    pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir, Error> {
        // Two-stage pipeline: concrete syntax -> AST, then AST -> HIR.
        // An error from either stage is propagated to the caller unchanged.
        let syntax_tree = self.ast.parse(pattern)?;
        self.hir.translate(pattern, &syntax_tree)
    }
}

258
vendor/regex-syntax/src/rank.rs vendored Normal file
View File

@@ -0,0 +1,258 @@
// A relative frequency rank for every possible byte value: higher means the
// byte occurs more often in typical haystacks (note e.g. ' ' = 255 and
// 'e' = 253, while control characters rank near the bottom). Presumably
// consumed by the literal-extraction heuristics to prefer searching for
// rarer bytes — TODO(review): confirm against src/hir/literal.rs.
pub(crate) const BYTE_FREQUENCIES: [u8; 256] = [
    // ASCII control characters: rare, except for '\t', '\n' and '\r'.
    55, // '\x00'
    52, // '\x01'
    51, // '\x02'
    50, // '\x03'
    49, // '\x04'
    48, // '\x05'
    47, // '\x06'
    46, // '\x07'
    45, // '\x08'
    103, // '\t'
    242, // '\n'
    66, // '\x0b'
    67, // '\x0c'
    229, // '\r'
    44, // '\x0e'
    43, // '\x0f'
    42, // '\x10'
    41, // '\x11'
    40, // '\x12'
    39, // '\x13'
    38, // '\x14'
    37, // '\x15'
    36, // '\x16'
    35, // '\x17'
    34, // '\x18'
    33, // '\x19'
    56, // '\x1a'
    32, // '\x1b'
    31, // '\x1c'
    30, // '\x1d'
    29, // '\x1e'
    28, // '\x1f'
    // Printable ASCII: common overall, with letters/digits ranking high.
    255, // ' '
    148, // '!'
    164, // '"'
    149, // '#'
    136, // '$'
    160, // '%'
    155, // '&'
    173, // "'"
    221, // '('
    222, // ')'
    134, // '*'
    122, // '+'
    232, // ','
    202, // '-'
    215, // '.'
    224, // '/'
    208, // '0'
    220, // '1'
    204, // '2'
    187, // '3'
    183, // '4'
    179, // '5'
    177, // '6'
    168, // '7'
    178, // '8'
    200, // '9'
    226, // ':'
    195, // ';'
    154, // '<'
    184, // '='
    174, // '>'
    126, // '?'
    120, // '@'
    191, // 'A'
    157, // 'B'
    194, // 'C'
    170, // 'D'
    189, // 'E'
    162, // 'F'
    161, // 'G'
    150, // 'H'
    193, // 'I'
    142, // 'J'
    137, // 'K'
    171, // 'L'
    176, // 'M'
    185, // 'N'
    167, // 'O'
    186, // 'P'
    112, // 'Q'
    175, // 'R'
    192, // 'S'
    188, // 'T'
    156, // 'U'
    140, // 'V'
    143, // 'W'
    123, // 'X'
    133, // 'Y'
    128, // 'Z'
    147, // '['
    138, // '\\'
    146, // ']'
    114, // '^'
    223, // '_'
    151, // '`'
    249, // 'a'
    216, // 'b'
    238, // 'c'
    236, // 'd'
    253, // 'e'
    227, // 'f'
    218, // 'g'
    230, // 'h'
    247, // 'i'
    135, // 'j'
    180, // 'k'
    241, // 'l'
    233, // 'm'
    246, // 'n'
    244, // 'o'
    231, // 'p'
    139, // 'q'
    245, // 'r'
    243, // 's'
    251, // 't'
    235, // 'u'
    201, // 'v'
    196, // 'w'
    240, // 'x'
    214, // 'y'
    152, // 'z'
    182, // '{'
    205, // '|'
    181, // '}'
    127, // '~'
    27, // '\x7f'
    // Non-ASCII bytes (UTF-8 continuation/lead bytes in Unicode text).
    212, // '\x80'
    211, // '\x81'
    210, // '\x82'
    213, // '\x83'
    228, // '\x84'
    197, // '\x85'
    169, // '\x86'
    159, // '\x87'
    131, // '\x88'
    172, // '\x89'
    105, // '\x8a'
    80, // '\x8b'
    98, // '\x8c'
    96, // '\x8d'
    97, // '\x8e'
    81, // '\x8f'
    207, // '\x90'
    145, // '\x91'
    116, // '\x92'
    115, // '\x93'
    144, // '\x94'
    130, // '\x95'
    153, // '\x96'
    121, // '\x97'
    107, // '\x98'
    132, // '\x99'
    109, // '\x9a'
    110, // '\x9b'
    124, // '\x9c'
    111, // '\x9d'
    82, // '\x9e'
    108, // '\x9f'
    118, // '\xa0'
    141, // '¡'
    113, // '¢'
    129, // '£'
    119, // '¤'
    125, // '¥'
    165, // '¦'
    117, // '§'
    92, // '¨'
    106, // '©'
    83, // 'ª'
    72, // '«'
    99, // '¬'
    93, // '\xad'
    65, // '®'
    79, // '¯'
    166, // '°'
    237, // '±'
    163, // '²'
    199, // '³'
    190, // '´'
    225, // 'µ'
    209, // '¶'
    203, // '·'
    198, // '¸'
    217, // '¹'
    219, // 'º'
    206, // '»'
    234, // '¼'
    248, // '½'
    158, // '¾'
    239, // '¿'
    // 0xC0-0xFF all share the maximum rank.
    255, // 'À'
    255, // 'Á'
    255, // 'Â'
    255, // 'Ã'
    255, // 'Ä'
    255, // 'Å'
    255, // 'Æ'
    255, // 'Ç'
    255, // 'È'
    255, // 'É'
    255, // 'Ê'
    255, // 'Ë'
    255, // 'Ì'
    255, // 'Í'
    255, // 'Î'
    255, // 'Ï'
    255, // 'Ð'
    255, // 'Ñ'
    255, // 'Ò'
    255, // 'Ó'
    255, // 'Ô'
    255, // 'Õ'
    255, // 'Ö'
    255, // '×'
    255, // 'Ø'
    255, // 'Ù'
    255, // 'Ú'
    255, // 'Û'
    255, // 'Ü'
    255, // 'Ý'
    255, // 'Þ'
    255, // 'ß'
    255, // 'à'
    255, // 'á'
    255, // 'â'
    255, // 'ã'
    255, // 'ä'
    255, // 'å'
    255, // 'æ'
    255, // 'ç'
    255, // 'è'
    255, // 'é'
    255, // 'ê'
    255, // 'ë'
    255, // 'ì'
    255, // 'í'
    255, // 'î'
    255, // 'ï'
    255, // 'ð'
    255, // 'ñ'
    255, // 'ò'
    255, // 'ó'
    255, // 'ô'
    255, // 'õ'
    255, // 'ö'
    255, // '÷'
    255, // 'ø'
    255, // 'ù'
    255, // 'ú'
    255, // 'û'
    255, // 'ü'
    255, // 'ý'
    255, // 'þ'
    255, // 'ÿ'
];

1041
vendor/regex-syntax/src/unicode.rs vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,57 @@
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
Unicode Data Files include all data files under the directories
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
http://www.unicode.org/utility/trac/browser/.
Unicode Data Files do not include PDF online code charts under the
directory http://www.unicode.org/Public/.
Software includes any source code published in the Unicode Standard
or under the directories
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
http://www.unicode.org/utility/trac/browser/.
NOTICE TO USER: Carefully read the following legal agreement.
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT.
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
THE DATA FILES OR SOFTWARE.
COPYRIGHT AND PERMISSION NOTICE
Copyright © 1991-2018 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Unicode data files and any associated documentation
(the "Data Files") or Unicode software and any associated documentation
(the "Software") to deal in the Data Files or Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of
the Data Files or Software, and to permit persons to whom the Data Files
or Software are furnished to do so, provided that either
(a) this copyright and permission notice appear with all copies
of the Data Files or Software, or
(b) this copyright and permission notice appear in associated
Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale,
use or other dealings in these Data Files or Software without prior
written authorization of the copyright holder.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,57 @@
// Feature-gated declarations of the generated Unicode data tables. Each
// table is only compiled when the corresponding `unicode-*` Cargo feature
// is enabled, so users who don't need a class of Unicode support don't pay
// for it in compile time or binary size.
#[cfg(feature = "unicode-age")]
pub mod age;
#[cfg(feature = "unicode-case")]
pub mod case_folding_simple;
#[cfg(feature = "unicode-gencat")]
pub mod general_category;
#[cfg(feature = "unicode-segment")]
pub mod grapheme_cluster_break;
// Fallback table: when `unicode-gencat` is enabled, the decimal-number data
// comes from the full General_Category tables instead, so this smaller
// table is only compiled in its absence.
#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
#[allow(dead_code)]
pub mod perl_decimal;
// Fallback table: when `unicode-bool` is enabled, White_Space comes from the
// full boolean-property tables instead.
#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
#[allow(dead_code)]
pub mod perl_space;
#[cfg(feature = "unicode-perl")]
pub mod perl_word;
#[cfg(feature = "unicode-bool")]
pub mod property_bool;
// The property name/value alias tables are needed by every flavor of
// Unicode property lookup, hence the broad `any(...)` gate.
#[cfg(any(
    feature = "unicode-age",
    feature = "unicode-bool",
    feature = "unicode-gencat",
    feature = "unicode-perl",
    feature = "unicode-script",
    feature = "unicode-segment",
))]
pub mod property_names;
#[cfg(any(
    feature = "unicode-age",
    feature = "unicode-bool",
    feature = "unicode-gencat",
    feature = "unicode-perl",
    feature = "unicode-script",
    feature = "unicode-segment",
))]
pub mod property_values;
#[cfg(feature = "unicode-script")]
pub mod script;
#[cfg(feature = "unicode-script")]
pub mod script_extension;
#[cfg(feature = "unicode-segment")]
pub mod sentence_break;
#[cfg(feature = "unicode-segment")]
pub mod word_break;

View File

@@ -0,0 +1,84 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate general-category ucd-16.0.0 --chars --include decimalnumber
//
// Unicode version: 16.0.0.
//
// ucd-generate 0.3.1 is available on crates.io.
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] =
&[("Decimal_Number", DECIMAL_NUMBER)];
pub const DECIMAL_NUMBER: &'static [(char, char)] = &[
('0', '9'),
('٠', '٩'),
('۰', '۹'),
('߀', '߉'),
('', '९'),
('', '৯'),
('', '੯'),
('', '૯'),
('', '୯'),
('', '௯'),
('', '౯'),
('', '೯'),
('', '൯'),
('෦', '෯'),
('', '๙'),
('', '໙'),
('༠', '༩'),
('', '၉'),
('႐', '႙'),
('០', '៩'),
('᠐', '᠙'),
('᥆', '᥏'),
('᧐', '᧙'),
('᪀', '᪉'),
('᪐', '᪙'),
('᭐', '᭙'),
('᮰', '᮹'),
('᱀', '᱉'),
('᱐', '᱙'),
('꘠', '꘩'),
('꣐', '꣙'),
('꤀', '꤉'),
('꧐', '꧙'),
('꧰', '꧹'),
('꩐', '꩙'),
('꯰', '꯹'),
('', ''),
('𐒠', '𐒩'),
('𐴰', '𐴹'),
('𐵀', '𐵉'),
('𑁦', '𑁯'),
('𑃰', '𑃹'),
('𑄶', '𑄿'),
('𑇐', '𑇙'),
('𑋰', '𑋹'),
('𑑐', '𑑙'),
('𑓐', '𑓙'),
('𑙐', '𑙙'),
('𑛀', '𑛉'),
('𑛐', '𑛣'),
('𑜰', '𑜹'),
('𑣠', '𑣩'),
('𑥐', '𑥙'),
('𑯰', '𑯹'),
('𑱐', '𑱙'),
('𑵐', '𑵙'),
('𑶠', '𑶩'),
('𑽐', '𑽙'),
('𖄰', '𖄹'),
('𖩠', '𖩩'),
('𖫀', '𖫉'),
('𖭐', '𖭙'),
('𖵰', '𖵹'),
('𜳰', '𜳹'),
('𝟎', '𝟿'),
('𞅀', '𞅉'),
('𞋰', '𞋹'),
('𞓰', '𞓹'),
('𞗱', '𞗺'),
('𞥐', '𞥙'),
('🯰', '🯹'),
];

View File

@@ -0,0 +1,23 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-bool ucd-16.0.0 --chars --include whitespace
//
// Unicode version: 16.0.0.
//
// ucd-generate 0.3.1 is available on crates.io.

// NOTE(review): generated data — change it by re-running ucd-generate as
// shown above, never by hand-editing the ranges.
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] =
    &[("White_Space", WHITE_SPACE)];

// Closed, sorted, non-overlapping codepoint ranges carrying the Unicode
// `White_Space` property.
pub const WHITE_SPACE: &'static [(char, char)] = &[
    ('\t', '\r'),
    (' ', ' '),
    ('\u{85}', '\u{85}'),
    ('\u{a0}', '\u{a0}'),
    ('\u{1680}', '\u{1680}'),
    ('\u{2000}', '\u{200a}'),
    ('\u{2028}', '\u{2029}'),
    ('\u{202f}', '\u{202f}'),
    ('\u{205f}', '\u{205f}'),
    ('\u{3000}', '\u{3000}'),
];

View File

@@ -0,0 +1,806 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate perl-word ucd-16.0.0 --chars
//
// Unicode version: 16.0.0.
//
// ucd-generate 0.3.1 is available on crates.io.
pub const PERL_WORD: &'static [(char, char)] = &[
('0', '9'),
('A', 'Z'),
('_', '_'),
('a', 'z'),
('ª', 'ª'),
('µ', 'µ'),
('º', 'º'),
('À', 'Ö'),
('Ø', 'ö'),
('ø', 'ˁ'),
('ˆ', 'ˑ'),
('ˠ', 'ˤ'),
('ˬ', 'ˬ'),
('ˮ', 'ˮ'),
('\u{300}', 'ʹ'),
('Ͷ', 'ͷ'),
('ͺ', 'ͽ'),
('Ϳ', 'Ϳ'),
('Ά', 'Ά'),
('Έ', 'Ί'),
('Ό', 'Ό'),
('Ύ', 'Ρ'),
('Σ', 'ϵ'),
('Ϸ', 'ҁ'),
('\u{483}', 'ԯ'),
('Ա', 'Ֆ'),
('ՙ', 'ՙ'),
('ՠ', 'ֈ'),
('\u{591}', '\u{5bd}'),
('\u{5bf}', '\u{5bf}'),
('\u{5c1}', '\u{5c2}'),
('\u{5c4}', '\u{5c5}'),
('\u{5c7}', '\u{5c7}'),
('א', 'ת'),
('ׯ', 'ײ'),
('\u{610}', '\u{61a}'),
('ؠ', '٩'),
('ٮ', 'ۓ'),
('ە', '\u{6dc}'),
('\u{6df}', '\u{6e8}'),
('\u{6ea}', 'ۼ'),
('ۿ', 'ۿ'),
('ܐ', '\u{74a}'),
('ݍ', 'ޱ'),
('߀', 'ߵ'),
('ߺ', 'ߺ'),
('\u{7fd}', '\u{7fd}'),
('ࠀ', '\u{82d}'),
('ࡀ', '\u{85b}'),
('ࡠ', 'ࡪ'),
('ࡰ', 'ࢇ'),
('ࢉ', 'ࢎ'),
('\u{897}', '\u{8e1}'),
('\u{8e3}', '\u{963}'),
('', '९'),
('ॱ', 'ঃ'),
('অ', 'ঌ'),
('এ', 'ঐ'),
('ও', 'ন'),
('প', 'র'),
('ল', 'ল'),
('শ', 'হ'),
('\u{9bc}', '\u{9c4}'),
('ে', 'ৈ'),
('ো', 'ৎ'),
('\u{9d7}', '\u{9d7}'),
('ড়', 'ঢ়'),
('য়', '\u{9e3}'),
('', 'ৱ'),
('ৼ', 'ৼ'),
('\u{9fe}', '\u{9fe}'),
('\u{a01}', 'ਃ'),
('ਅ', 'ਊ'),
('ਏ', 'ਐ'),
('ਓ', 'ਨ'),
('ਪ', 'ਰ'),
('ਲ', 'ਲ਼'),
('ਵ', 'ਸ਼'),
('ਸ', 'ਹ'),
('\u{a3c}', '\u{a3c}'),
('ਾ', '\u{a42}'),
('\u{a47}', '\u{a48}'),
('\u{a4b}', '\u{a4d}'),
('\u{a51}', '\u{a51}'),
('ਖ਼', 'ੜ'),
('ਫ਼', 'ਫ਼'),
('', '\u{a75}'),
('\u{a81}', ''),
('અ', 'ઍ'),
('એ', 'ઑ'),
('ઓ', 'ન'),
('પ', 'ર'),
('લ', 'ળ'),
('વ', 'હ'),
('\u{abc}', '\u{ac5}'),
('\u{ac7}', 'ૉ'),
('ો', '\u{acd}'),
('ૐ', 'ૐ'),
('ૠ', '\u{ae3}'),
('', '૯'),
('ૹ', '\u{aff}'),
('\u{b01}', ''),
('ଅ', 'ଌ'),
('ଏ', 'ଐ'),
('ଓ', 'ନ'),
('ପ', 'ର'),
('ଲ', 'ଳ'),
('ଵ', 'ହ'),
('\u{b3c}', '\u{b44}'),
('େ', 'ୈ'),
('ୋ', '\u{b4d}'),
('\u{b55}', '\u{b57}'),
('ଡ଼', 'ଢ଼'),
('ୟ', '\u{b63}'),
('', '୯'),
('ୱ', 'ୱ'),
('\u{b82}', 'ஃ'),
('அ', 'ஊ'),
('எ', 'ஐ'),
('ஒ', 'க'),
('ங', 'ச'),
('ஜ', 'ஜ'),
('ஞ', 'ட'),
('ண', 'த'),
('ந', 'ப'),
('ம', 'ஹ'),
('\u{bbe}', 'ூ'),
('ெ', 'ை'),
('ொ', '\u{bcd}'),
('ௐ', 'ௐ'),
('\u{bd7}', '\u{bd7}'),
('', '௯'),
('\u{c00}', 'ఌ'),
('ఎ', 'ఐ'),
('ఒ', 'న'),
('ప', 'హ'),
('\u{c3c}', 'ౄ'),
('\u{c46}', '\u{c48}'),
('\u{c4a}', '\u{c4d}'),
('\u{c55}', '\u{c56}'),
('ౘ', 'ౚ'),
('ౝ', 'ౝ'),
('ౠ', '\u{c63}'),
('', '౯'),
('ಀ', 'ಃ'),
('ಅ', 'ಌ'),
('ಎ', 'ಐ'),
('ಒ', 'ನ'),
('ಪ', 'ಳ'),
('ವ', 'ಹ'),
('\u{cbc}', 'ೄ'),
('\u{cc6}', '\u{cc8}'),
('\u{cca}', '\u{ccd}'),
('\u{cd5}', '\u{cd6}'),
('ೝ', 'ೞ'),
('ೠ', '\u{ce3}'),
('', '೯'),
('ೱ', 'ೳ'),
('\u{d00}', 'ഌ'),
('എ', 'ഐ'),
('ഒ', '\u{d44}'),
('െ', 'ൈ'),
('ൊ', 'ൎ'),
('ൔ', '\u{d57}'),
('ൟ', '\u{d63}'),
('', '൯'),
('ൺ', 'ൿ'),
('\u{d81}', 'ඃ'),
('අ', 'ඖ'),
('ක', 'න'),
('ඳ', 'ර'),
('ල', 'ල'),
('ව', 'ෆ'),
('\u{dca}', '\u{dca}'),
('\u{dcf}', '\u{dd4}'),
('\u{dd6}', '\u{dd6}'),
('ෘ', '\u{ddf}'),
('෦', '෯'),
('ෲ', 'ෳ'),
('ก', '\u{e3a}'),
('เ', '\u{e4e}'),
('', '๙'),
('ກ', 'ຂ'),
('ຄ', 'ຄ'),
('ຆ', 'ຊ'),
('ຌ', 'ຣ'),
('ລ', 'ລ'),
('ວ', 'ຽ'),
('ເ', 'ໄ'),
('ໆ', 'ໆ'),
('\u{ec8}', '\u{ece}'),
('', '໙'),
('ໜ', 'ໟ'),
('ༀ', 'ༀ'),
('\u{f18}', '\u{f19}'),
('༠', '༩'),
('\u{f35}', '\u{f35}'),
('\u{f37}', '\u{f37}'),
('\u{f39}', '\u{f39}'),
('༾', 'ཇ'),
('ཉ', 'ཬ'),
('\u{f71}', '\u{f84}'),
('\u{f86}', '\u{f97}'),
('\u{f99}', '\u{fbc}'),
('\u{fc6}', '\u{fc6}'),
('က', '၉'),
('ၐ', '\u{109d}'),
('Ⴀ', 'Ⴥ'),
('Ⴧ', 'Ⴧ'),
('Ⴭ', 'Ⴭ'),
('ა', 'ჺ'),
('ჼ', 'ቈ'),
('ቊ', 'ቍ'),
('ቐ', 'ቖ'),
('ቘ', 'ቘ'),
('ቚ', 'ቝ'),
('በ', 'ኈ'),
('ኊ', 'ኍ'),
('ነ', 'ኰ'),
('ኲ', 'ኵ'),
('ኸ', 'ኾ'),
('ዀ', 'ዀ'),
('ዂ', 'ዅ'),
('ወ', 'ዖ'),
('ዘ', 'ጐ'),
('ጒ', 'ጕ'),
('ጘ', 'ፚ'),
('\u{135d}', '\u{135f}'),
('ᎀ', 'ᎏ'),
('', 'Ᏽ'),
('ᏸ', 'ᏽ'),
('ᐁ', 'ᙬ'),
('ᙯ', 'ᙿ'),
('ᚁ', 'ᚚ'),
('ᚠ', 'ᛪ'),
('ᛮ', 'ᛸ'),
('ᜀ', '\u{1715}'),
('ᜟ', '\u{1734}'),
('ᝀ', '\u{1753}'),
('ᝠ', 'ᝬ'),
('ᝮ', 'ᝰ'),
('\u{1772}', '\u{1773}'),
('ក', '\u{17d3}'),
('ៗ', 'ៗ'),
('ៜ', '\u{17dd}'),
('០', '៩'),
('\u{180b}', '\u{180d}'),
('\u{180f}', '᠙'),
('ᠠ', 'ᡸ'),
('ᢀ', 'ᢪ'),
('ᢰ', 'ᣵ'),
('ᤀ', 'ᤞ'),
('\u{1920}', 'ᤫ'),
('ᤰ', '\u{193b}'),
('᥆', 'ᥭ'),
('ᥰ', 'ᥴ'),
('ᦀ', 'ᦫ'),
('ᦰ', 'ᧉ'),
('᧐', '᧙'),
('ᨀ', '\u{1a1b}'),
('ᨠ', '\u{1a5e}'),
('\u{1a60}', '\u{1a7c}'),
('\u{1a7f}', '᪉'),
('᪐', '᪙'),
('ᪧ', 'ᪧ'),
('\u{1ab0}', '\u{1ace}'),
('\u{1b00}', 'ᭌ'),
('᭐', '᭙'),
('\u{1b6b}', '\u{1b73}'),
('\u{1b80}', '\u{1bf3}'),
('ᰀ', '\u{1c37}'),
('᱀', '᱉'),
('ᱍ', 'ᱽ'),
('ᲀ', 'ᲊ'),
('Ა', 'Ჺ'),
('Ჽ', 'Ჿ'),
('\u{1cd0}', '\u{1cd2}'),
('\u{1cd4}', 'ᳺ'),
('ᴀ', 'ἕ'),
('Ἐ', 'Ἕ'),
('ἠ', 'ὅ'),
('Ὀ', 'Ὅ'),
('ὐ', 'ὗ'),
('Ὑ', 'Ὑ'),
('Ὓ', 'Ὓ'),
('Ὕ', 'Ὕ'),
('Ὗ', 'ώ'),
('ᾀ', 'ᾴ'),
('ᾶ', 'ᾼ'),
('', ''),
('ῂ', 'ῄ'),
('ῆ', 'ῌ'),
('ῐ', 'ΐ'),
('ῖ', 'Ί'),
('ῠ', 'Ῥ'),
('ῲ', 'ῴ'),
('ῶ', 'ῼ'),
('\u{200c}', '\u{200d}'),
('‿', '⁀'),
('⁔', '⁔'),
('ⁱ', 'ⁱ'),
('ⁿ', 'ⁿ'),
('ₐ', 'ₜ'),
('\u{20d0}', '\u{20f0}'),
('', ''),
('ℇ', 'ℇ'),
('', ''),
('', ''),
('', ''),
('', ''),
('Ω', 'Ω'),
('', ''),
('', ''),
('', ''),
('ℼ', 'ℿ'),
('', ''),
('ⅎ', 'ⅎ'),
('', 'ↈ'),
('Ⓐ', 'ⓩ'),
('Ⰰ', 'ⳤ'),
('Ⳬ', 'ⳳ'),
('ⴀ', 'ⴥ'),
('ⴧ', 'ⴧ'),
('ⴭ', 'ⴭ'),
('ⴰ', 'ⵧ'),
('ⵯ', 'ⵯ'),
('\u{2d7f}', 'ⶖ'),
('ⶠ', 'ⶦ'),
('ⶨ', 'ⶮ'),
('ⶰ', 'ⶶ'),
('ⶸ', 'ⶾ'),
('ⷀ', 'ⷆ'),
('ⷈ', 'ⷎ'),
('ⷐ', 'ⷖ'),
('ⷘ', 'ⷞ'),
('\u{2de0}', '\u{2dff}'),
('ⸯ', 'ⸯ'),
('々', ''),
('〡', '\u{302f}'),
('〱', '〵'),
('〸', '〼'),
('ぁ', 'ゖ'),
('\u{3099}', '\u{309a}'),
('ゝ', 'ゟ'),
('ァ', 'ヺ'),
('ー', 'ヿ'),
('ㄅ', 'ㄯ'),
('ㄱ', 'ㆎ'),
('ㆠ', 'ㆿ'),
('ㇰ', 'ㇿ'),
('㐀', '䶿'),
('一', 'ꒌ'),
('', ''),
('ꔀ', 'ꘌ'),
('ꘐ', 'ꘫ'),
('Ꙁ', '\u{a672}'),
('\u{a674}', '\u{a67d}'),
('ꙿ', '\u{a6f1}'),
('ꜗ', 'ꜟ'),
('Ꜣ', 'ꞈ'),
('Ꞌ', 'ꟍ'),
('Ꟑ', 'ꟑ'),
('ꟓ', 'ꟓ'),
('ꟕ', 'Ƛ'),
('ꟲ', 'ꠧ'),
('\u{a82c}', '\u{a82c}'),
('ꡀ', 'ꡳ'),
('ꢀ', '\u{a8c5}'),
('꣐', '꣙'),
('\u{a8e0}', 'ꣷ'),
('ꣻ', 'ꣻ'),
('ꣽ', '\u{a92d}'),
('ꤰ', '\u{a953}'),
('ꥠ', 'ꥼ'),
('\u{a980}', '\u{a9c0}'),
('ꧏ', '꧙'),
('ꧠ', 'ꧾ'),
('ꨀ', '\u{aa36}'),
('ꩀ', 'ꩍ'),
('꩐', '꩙'),
('ꩠ', 'ꩶ'),
('ꩺ', 'ꫂ'),
('ꫛ', 'ꫝ'),
('ꫠ', 'ꫯ'),
('ꫲ', '\u{aaf6}'),
('ꬁ', 'ꬆ'),
('ꬉ', 'ꬎ'),
('ꬑ', 'ꬖ'),
('ꬠ', 'ꬦ'),
('ꬨ', 'ꬮ'),
('ꬰ', ''),
('ꭜ', 'ꭩ'),
('ꭰ', 'ꯪ'),
('꯬', '\u{abed}'),
('꯰', '꯹'),
('가', '힣'),
('ힰ', 'ퟆ'),
('ퟋ', 'ퟻ'),
('豈', '舘'),
('並', '龎'),
('ff', 'st'),
('ﬓ', 'ﬗ'),
('יִ', 'ﬨ'),
('שׁ', 'זּ'),
('טּ', 'לּ'),
('מּ', 'מּ'),
('נּ', 'סּ'),
('ףּ', 'פּ'),
('צּ', 'ﮱ'),
('ﯓ', 'ﴽ'),
('ﵐ', 'ﶏ'),
('ﶒ', 'ﷇ'),
('ﷰ', 'ﷻ'),
('\u{fe00}', '\u{fe0f}'),
('\u{fe20}', '\u{fe2f}'),
('︳', '︴'),
('', ''),
('ﹰ', 'ﹴ'),
('ﹶ', 'ﻼ'),
('', ''),
('', ''),
('_', '_'),
('', ''),
('ヲ', 'ᄒ'),
('ᅡ', 'ᅦ'),
('ᅧ', 'ᅬ'),
('ᅭ', 'ᅲ'),
('ᅳ', 'ᅵ'),
('𐀀', '𐀋'),
('𐀍', '𐀦'),
('𐀨', '𐀺'),
('𐀼', '𐀽'),
('𐀿', '𐁍'),
('𐁐', '𐁝'),
('𐂀', '𐃺'),
('𐅀', '𐅴'),
('\u{101fd}', '\u{101fd}'),
('𐊀', '𐊜'),
('𐊠', '𐋐'),
('\u{102e0}', '\u{102e0}'),
('𐌀', '𐌟'),
('𐌭', '𐍊'),
('𐍐', '\u{1037a}'),
('𐎀', '𐎝'),
('𐎠', '𐏃'),
('𐏈', '𐏏'),
('𐏑', '𐏕'),
('𐐀', '𐒝'),
('𐒠', '𐒩'),
('𐒰', '𐓓'),
('𐓘', '𐓻'),
('𐔀', '𐔧'),
('𐔰', '𐕣'),
('𐕰', '𐕺'),
('𐕼', '𐖊'),
('𐖌', '𐖒'),
('𐖔', '𐖕'),
('𐖗', '𐖡'),
('𐖣', '𐖱'),
('𐖳', '𐖹'),
('𐖻', '𐖼'),
('𐗀', '𐗳'),
('𐘀', '𐜶'),
('𐝀', '𐝕'),
('𐝠', '𐝧'),
('𐞀', '𐞅'),
('𐞇', '𐞰'),
('𐞲', '𐞺'),
('𐠀', '𐠅'),
('𐠈', '𐠈'),
('𐠊', '𐠵'),
('𐠷', '𐠸'),
('𐠼', '𐠼'),
('𐠿', '𐡕'),
('𐡠', '𐡶'),
('𐢀', '𐢞'),
('𐣠', '𐣲'),
('𐣴', '𐣵'),
('𐤀', '𐤕'),
('𐤠', '𐤹'),
('𐦀', '𐦷'),
('𐦾', '𐦿'),
('𐨀', '\u{10a03}'),
('\u{10a05}', '\u{10a06}'),
('\u{10a0c}', '𐨓'),
('𐨕', '𐨗'),
('𐨙', '𐨵'),
('\u{10a38}', '\u{10a3a}'),
('\u{10a3f}', '\u{10a3f}'),
('𐩠', '𐩼'),
('𐪀', '𐪜'),
('𐫀', '𐫇'),
('𐫉', '\u{10ae6}'),
('𐬀', '𐬵'),
('𐭀', '𐭕'),
('𐭠', '𐭲'),
('𐮀', '𐮑'),
('𐰀', '𐱈'),
('𐲀', '𐲲'),
('𐳀', '𐳲'),
('𐴀', '\u{10d27}'),
('𐴰', '𐴹'),
('𐵀', '𐵥'),
('\u{10d69}', '\u{10d6d}'),
('𐵯', '𐶅'),
('𐺀', '𐺩'),
('\u{10eab}', '\u{10eac}'),
('𐺰', '𐺱'),
('𐻂', '𐻄'),
('\u{10efc}', '𐼜'),
('𐼧', '𐼧'),
('𐼰', '\u{10f50}'),
('𐽰', '\u{10f85}'),
('𐾰', '𐿄'),
('𐿠', '𐿶'),
('𑀀', '\u{11046}'),
('𑁦', '𑁵'),
('\u{1107f}', '\u{110ba}'),
('\u{110c2}', '\u{110c2}'),
('𑃐', '𑃨'),
('𑃰', '𑃹'),
('\u{11100}', '\u{11134}'),
('𑄶', '𑄿'),
('𑅄', '𑅇'),
('𑅐', '\u{11173}'),
('𑅶', '𑅶'),
('\u{11180}', '𑇄'),
('\u{111c9}', '\u{111cc}'),
('𑇎', '𑇚'),
('𑇜', '𑇜'),
('𑈀', '𑈑'),
('𑈓', '\u{11237}'),
('\u{1123e}', '\u{11241}'),
('𑊀', '𑊆'),
('𑊈', '𑊈'),
('𑊊', '𑊍'),
('𑊏', '𑊝'),
('𑊟', '𑊨'),
('𑊰', '\u{112ea}'),
('𑋰', '𑋹'),
('\u{11300}', '𑌃'),
('𑌅', '𑌌'),
('𑌏', '𑌐'),
('𑌓', '𑌨'),
('𑌪', '𑌰'),
('𑌲', '𑌳'),
('𑌵', '𑌹'),
('\u{1133b}', '𑍄'),
('𑍇', '𑍈'),
('𑍋', '\u{1134d}'),
('𑍐', '𑍐'),
('\u{11357}', '\u{11357}'),
('𑍝', '𑍣'),
('\u{11366}', '\u{1136c}'),
('\u{11370}', '\u{11374}'),
('𑎀', '𑎉'),
('𑎋', '𑎋'),
('𑎎', '𑎎'),
('𑎐', '𑎵'),
('𑎷', '\u{113c0}'),
('\u{113c2}', '\u{113c2}'),
('\u{113c5}', '\u{113c5}'),
('\u{113c7}', '𑏊'),
('𑏌', '𑏓'),
('\u{113e1}', '\u{113e2}'),
('𑐀', '𑑊'),
('𑑐', '𑑙'),
('\u{1145e}', '𑑡'),
('𑒀', '𑓅'),
('𑓇', '𑓇'),
('𑓐', '𑓙'),
('𑖀', '\u{115b5}'),
('𑖸', '\u{115c0}'),
('𑗘', '\u{115dd}'),
('𑘀', '\u{11640}'),
('𑙄', '𑙄'),
('𑙐', '𑙙'),
('𑚀', '𑚸'),
('𑛀', '𑛉'),
('𑛐', '𑛣'),
('𑜀', '𑜚'),
('\u{1171d}', '\u{1172b}'),
('𑜰', '𑜹'),
('𑝀', '𑝆'),
('𑠀', '\u{1183a}'),
('𑢠', '𑣩'),
('𑣿', '𑤆'),
('𑤉', '𑤉'),
('𑤌', '𑤓'),
('𑤕', '𑤖'),
('𑤘', '𑤵'),
('𑤷', '𑤸'),
('\u{1193b}', '\u{11943}'),
('𑥐', '𑥙'),
('𑦠', '𑦧'),
('𑦪', '\u{119d7}'),
('\u{119da}', '𑧡'),
('𑧣', '𑧤'),
('𑨀', '\u{11a3e}'),
('\u{11a47}', '\u{11a47}'),
('𑩐', '\u{11a99}'),
('𑪝', '𑪝'),
('𑪰', '𑫸'),
('𑯀', '𑯠'),
('𑯰', '𑯹'),
('𑰀', '𑰈'),
('𑰊', '\u{11c36}'),
('\u{11c38}', '𑱀'),
('𑱐', '𑱙'),
('𑱲', '𑲏'),
('\u{11c92}', '\u{11ca7}'),
('𑲩', '\u{11cb6}'),
('𑴀', '𑴆'),
('𑴈', '𑴉'),
('𑴋', '\u{11d36}'),
('\u{11d3a}', '\u{11d3a}'),
('\u{11d3c}', '\u{11d3d}'),
('\u{11d3f}', '\u{11d47}'),
('𑵐', '𑵙'),
('𑵠', '𑵥'),
('𑵧', '𑵨'),
('𑵪', '𑶎'),
('\u{11d90}', '\u{11d91}'),
('𑶓', '𑶘'),
('𑶠', '𑶩'),
('𑻠', '𑻶'),
('\u{11f00}', '𑼐'),
('𑼒', '\u{11f3a}'),
('𑼾', '\u{11f42}'),
('𑽐', '\u{11f5a}'),
('𑾰', '𑾰'),
('𒀀', '𒎙'),
('𒐀', '𒑮'),
('𒒀', '𒕃'),
('𒾐', '𒿰'),
('𓀀', '𓐯'),
('\u{13440}', '\u{13455}'),
('𓑠', '𔏺'),
('𔐀', '𔙆'),
('𖄀', '𖄹'),
('𖠀', '𖨸'),
('𖩀', '𖩞'),
('𖩠', '𖩩'),
('𖩰', '𖪾'),
('𖫀', '𖫉'),
('𖫐', '𖫭'),
('\u{16af0}', '\u{16af4}'),
('𖬀', '\u{16b36}'),
('𖭀', '𖭃'),
('𖭐', '𖭙'),
('𖭣', '𖭷'),
('𖭽', '𖮏'),
('𖵀', '𖵬'),
('𖵰', '𖵹'),
('𖹀', '𖹿'),
('𖼀', '𖽊'),
('\u{16f4f}', '𖾇'),
('\u{16f8f}', '𖾟'),
('𖿠', '𖿡'),
('𖿣', '\u{16fe4}'),
('\u{16ff0}', '\u{16ff1}'),
('𗀀', '𘟷'),
('𘠀', '𘳕'),
('𘳿', '𘴈'),
('𚿰', '𚿳'),
('𚿵', '𚿻'),
('𚿽', '𚿾'),
('𛀀', '𛄢'),
('𛄲', '𛄲'),
('𛅐', '𛅒'),
('𛅕', '𛅕'),
('𛅤', '𛅧'),
('𛅰', '𛋻'),
('𛰀', '𛱪'),
('𛱰', '𛱼'),
('𛲀', '𛲈'),
('𛲐', '𛲙'),
('\u{1bc9d}', '\u{1bc9e}'),
('𜳰', '𜳹'),
('\u{1cf00}', '\u{1cf2d}'),
('\u{1cf30}', '\u{1cf46}'),
('\u{1d165}', '\u{1d169}'),
('\u{1d16d}', '\u{1d172}'),
('\u{1d17b}', '\u{1d182}'),
('\u{1d185}', '\u{1d18b}'),
('\u{1d1aa}', '\u{1d1ad}'),
('\u{1d242}', '\u{1d244}'),
('𝐀', '𝑔'),
('𝑖', '𝒜'),
('𝒞', '𝒟'),
('𝒢', '𝒢'),
('𝒥', '𝒦'),
('𝒩', '𝒬'),
('𝒮', '𝒹'),
('𝒻', '𝒻'),
('𝒽', '𝓃'),
('𝓅', '𝔅'),
('𝔇', '𝔊'),
('𝔍', '𝔔'),
('𝔖', '𝔜'),
('𝔞', '𝔹'),
('𝔻', '𝔾'),
('𝕀', '𝕄'),
('𝕆', '𝕆'),
('𝕊', '𝕐'),
('𝕒', '𝚥'),
('𝚨', '𝛀'),
('𝛂', '𝛚'),
('𝛜', '𝛺'),
('𝛼', '𝜔'),
('𝜖', '𝜴'),
('𝜶', '𝝎'),
('𝝐', '𝝮'),
('𝝰', '𝞈'),
('𝞊', '𝞨'),
('𝞪', '𝟂'),
('𝟄', '𝟋'),
('𝟎', '𝟿'),
('\u{1da00}', '\u{1da36}'),
('\u{1da3b}', '\u{1da6c}'),
('\u{1da75}', '\u{1da75}'),
('\u{1da84}', '\u{1da84}'),
('\u{1da9b}', '\u{1da9f}'),
('\u{1daa1}', '\u{1daaf}'),
('𝼀', '𝼞'),
('𝼥', '𝼪'),
('\u{1e000}', '\u{1e006}'),
('\u{1e008}', '\u{1e018}'),
('\u{1e01b}', '\u{1e021}'),
('\u{1e023}', '\u{1e024}'),
('\u{1e026}', '\u{1e02a}'),
('𞀰', '𞁭'),
('\u{1e08f}', '\u{1e08f}'),
('𞄀', '𞄬'),
('\u{1e130}', '𞄽'),
('𞅀', '𞅉'),
('𞅎', '𞅎'),
('𞊐', '\u{1e2ae}'),
('𞋀', '𞋹'),
('𞓐', '𞓹'),
('𞗐', '𞗺'),
('𞟠', '𞟦'),
('𞟨', '𞟫'),
('𞟭', '𞟮'),
('𞟰', '𞟾'),
('𞠀', '𞣄'),
('\u{1e8d0}', '\u{1e8d6}'),
('𞤀', '𞥋'),
('𞥐', '𞥙'),
('𞸀', '𞸃'),
('𞸅', '𞸟'),
('𞸡', '𞸢'),
('𞸤', '𞸤'),
('𞸧', '𞸧'),
('𞸩', '𞸲'),
('𞸴', '𞸷'),
('𞸹', '𞸹'),
('𞸻', '𞸻'),
('𞹂', '𞹂'),
('𞹇', '𞹇'),
('𞹉', '𞹉'),
('𞹋', '𞹋'),
('𞹍', '𞹏'),
('𞹑', '𞹒'),
('𞹔', '𞹔'),
('𞹗', '𞹗'),
('𞹙', '𞹙'),
('𞹛', '𞹛'),
('𞹝', '𞹝'),
('𞹟', '𞹟'),
('𞹡', '𞹢'),
('𞹤', '𞹤'),
('𞹧', '𞹪'),
('𞹬', '𞹲'),
('𞹴', '𞹷'),
('𞹹', '𞹼'),
('𞹾', '𞹾'),
('𞺀', '𞺉'),
('𞺋', '𞺛'),
('𞺡', '𞺣'),
('𞺥', '𞺩'),
('𞺫', '𞺻'),
('🄰', '🅉'),
('🅐', '🅩'),
('🅰', '🆉'),
('🯰', '🯹'),
('𠀀', '𪛟'),
('𪜀', '𫜹'),
('𫝀', '𫠝'),
('𫠠', '𬺡'),
('𬺰', '𮯠'),
('𮯰', '𮹝'),
('丽', '𪘀'),
('𰀀', '𱍊'),
('𱍐', '𲎯'),
('\u{e0100}', '\u{e01ef}'),
];

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,281 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-names ucd-16.0.0
//
// Unicode version: 16.0.0.
//
// ucd-generate 0.3.1 is available on crates.io.
pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
("age", "Age"),
("ahex", "ASCII_Hex_Digit"),
("alpha", "Alphabetic"),
("alphabetic", "Alphabetic"),
("asciihexdigit", "ASCII_Hex_Digit"),
("bc", "Bidi_Class"),
("bidic", "Bidi_Control"),
("bidiclass", "Bidi_Class"),
("bidicontrol", "Bidi_Control"),
("bidim", "Bidi_Mirrored"),
("bidimirrored", "Bidi_Mirrored"),
("bidimirroringglyph", "Bidi_Mirroring_Glyph"),
("bidipairedbracket", "Bidi_Paired_Bracket"),
("bidipairedbrackettype", "Bidi_Paired_Bracket_Type"),
("blk", "Block"),
("block", "Block"),
("bmg", "Bidi_Mirroring_Glyph"),
("bpb", "Bidi_Paired_Bracket"),
("bpt", "Bidi_Paired_Bracket_Type"),
("canonicalcombiningclass", "Canonical_Combining_Class"),
("cased", "Cased"),
("casefolding", "Case_Folding"),
("caseignorable", "Case_Ignorable"),
("ccc", "Canonical_Combining_Class"),
("ce", "Composition_Exclusion"),
("cf", "Case_Folding"),
("changeswhencasefolded", "Changes_When_Casefolded"),
("changeswhencasemapped", "Changes_When_Casemapped"),
("changeswhenlowercased", "Changes_When_Lowercased"),
("changeswhennfkccasefolded", "Changes_When_NFKC_Casefolded"),
("changeswhentitlecased", "Changes_When_Titlecased"),
("changeswhenuppercased", "Changes_When_Uppercased"),
("ci", "Case_Ignorable"),
("cjkaccountingnumeric", "kAccountingNumeric"),
("cjkcompatibilityvariant", "kCompatibilityVariant"),
("cjkiicore", "kIICore"),
("cjkirggsource", "kIRG_GSource"),
("cjkirghsource", "kIRG_HSource"),
("cjkirgjsource", "kIRG_JSource"),
("cjkirgkpsource", "kIRG_KPSource"),
("cjkirgksource", "kIRG_KSource"),
("cjkirgmsource", "kIRG_MSource"),
("cjkirgssource", "kIRG_SSource"),
("cjkirgtsource", "kIRG_TSource"),
("cjkirguksource", "kIRG_UKSource"),
("cjkirgusource", "kIRG_USource"),
("cjkirgvsource", "kIRG_VSource"),
("cjkothernumeric", "kOtherNumeric"),
("cjkprimarynumeric", "kPrimaryNumeric"),
("cjkrsunicode", "kRSUnicode"),
("compex", "Full_Composition_Exclusion"),
("compositionexclusion", "Composition_Exclusion"),
("cwcf", "Changes_When_Casefolded"),
("cwcm", "Changes_When_Casemapped"),
("cwkcf", "Changes_When_NFKC_Casefolded"),
("cwl", "Changes_When_Lowercased"),
("cwt", "Changes_When_Titlecased"),
("cwu", "Changes_When_Uppercased"),
("dash", "Dash"),
("decompositionmapping", "Decomposition_Mapping"),
("decompositiontype", "Decomposition_Type"),
("defaultignorablecodepoint", "Default_Ignorable_Code_Point"),
("dep", "Deprecated"),
("deprecated", "Deprecated"),
("di", "Default_Ignorable_Code_Point"),
("dia", "Diacritic"),
("diacritic", "Diacritic"),
("dm", "Decomposition_Mapping"),
("dt", "Decomposition_Type"),
("ea", "East_Asian_Width"),
("eastasianwidth", "East_Asian_Width"),
("ebase", "Emoji_Modifier_Base"),
("ecomp", "Emoji_Component"),
("emod", "Emoji_Modifier"),
("emoji", "Emoji"),
("emojicomponent", "Emoji_Component"),
("emojimodifier", "Emoji_Modifier"),
("emojimodifierbase", "Emoji_Modifier_Base"),
("emojipresentation", "Emoji_Presentation"),
("epres", "Emoji_Presentation"),
("equideo", "Equivalent_Unified_Ideograph"),
("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"),
("expandsonnfc", "Expands_On_NFC"),
("expandsonnfd", "Expands_On_NFD"),
("expandsonnfkc", "Expands_On_NFKC"),
("expandsonnfkd", "Expands_On_NFKD"),
("ext", "Extender"),
("extendedpictographic", "Extended_Pictographic"),
("extender", "Extender"),
("extpict", "Extended_Pictographic"),
("fcnfkc", "FC_NFKC_Closure"),
("fcnfkcclosure", "FC_NFKC_Closure"),
("fullcompositionexclusion", "Full_Composition_Exclusion"),
("gc", "General_Category"),
("gcb", "Grapheme_Cluster_Break"),
("generalcategory", "General_Category"),
("graphemebase", "Grapheme_Base"),
("graphemeclusterbreak", "Grapheme_Cluster_Break"),
("graphemeextend", "Grapheme_Extend"),
("graphemelink", "Grapheme_Link"),
("grbase", "Grapheme_Base"),
("grext", "Grapheme_Extend"),
("grlink", "Grapheme_Link"),
("hangulsyllabletype", "Hangul_Syllable_Type"),
("hex", "Hex_Digit"),
("hexdigit", "Hex_Digit"),
("hst", "Hangul_Syllable_Type"),
("hyphen", "Hyphen"),
("idc", "ID_Continue"),
("idcompatmathcontinue", "ID_Compat_Math_Continue"),
("idcompatmathstart", "ID_Compat_Math_Start"),
("idcontinue", "ID_Continue"),
("ideo", "Ideographic"),
("ideographic", "Ideographic"),
("ids", "ID_Start"),
("idsb", "IDS_Binary_Operator"),
("idsbinaryoperator", "IDS_Binary_Operator"),
("idst", "IDS_Trinary_Operator"),
("idstart", "ID_Start"),
("idstrinaryoperator", "IDS_Trinary_Operator"),
("idsu", "IDS_Unary_Operator"),
("idsunaryoperator", "IDS_Unary_Operator"),
("incb", "Indic_Conjunct_Break"),
("indicconjunctbreak", "Indic_Conjunct_Break"),
("indicpositionalcategory", "Indic_Positional_Category"),
("indicsyllabiccategory", "Indic_Syllabic_Category"),
("inpc", "Indic_Positional_Category"),
("insc", "Indic_Syllabic_Category"),
("isc", "ISO_Comment"),
("jamoshortname", "Jamo_Short_Name"),
("jg", "Joining_Group"),
("joinc", "Join_Control"),
("joincontrol", "Join_Control"),
("joininggroup", "Joining_Group"),
("joiningtype", "Joining_Type"),
("jsn", "Jamo_Short_Name"),
("jt", "Joining_Type"),
("kaccountingnumeric", "kAccountingNumeric"),
("kcompatibilityvariant", "kCompatibilityVariant"),
("kehcat", "kEH_Cat"),
("kehdesc", "kEH_Desc"),
("kehhg", "kEH_HG"),
("kehifao", "kEH_IFAO"),
("kehjsesh", "kEH_JSesh"),
("kehnomirror", "kEH_NoMirror"),
("kehnorotate", "kEH_NoRotate"),
("kiicore", "kIICore"),
("kirggsource", "kIRG_GSource"),
("kirghsource", "kIRG_HSource"),
("kirgjsource", "kIRG_JSource"),
("kirgkpsource", "kIRG_KPSource"),
("kirgksource", "kIRG_KSource"),
("kirgmsource", "kIRG_MSource"),
("kirgssource", "kIRG_SSource"),
("kirgtsource", "kIRG_TSource"),
("kirguksource", "kIRG_UKSource"),
("kirgusource", "kIRG_USource"),
("kirgvsource", "kIRG_VSource"),
("kothernumeric", "kOtherNumeric"),
("kprimarynumeric", "kPrimaryNumeric"),
("krsunicode", "kRSUnicode"),
("lb", "Line_Break"),
("lc", "Lowercase_Mapping"),
("linebreak", "Line_Break"),
("loe", "Logical_Order_Exception"),
("logicalorderexception", "Logical_Order_Exception"),
("lower", "Lowercase"),
("lowercase", "Lowercase"),
("lowercasemapping", "Lowercase_Mapping"),
("math", "Math"),
("mcm", "Modifier_Combining_Mark"),
("modifiercombiningmark", "Modifier_Combining_Mark"),
("na", "Name"),
("na1", "Unicode_1_Name"),
("name", "Name"),
("namealias", "Name_Alias"),
("nchar", "Noncharacter_Code_Point"),
("nfcqc", "NFC_Quick_Check"),
("nfcquickcheck", "NFC_Quick_Check"),
("nfdqc", "NFD_Quick_Check"),
("nfdquickcheck", "NFD_Quick_Check"),
("nfkccasefold", "NFKC_Casefold"),
("nfkccf", "NFKC_Casefold"),
("nfkcqc", "NFKC_Quick_Check"),
("nfkcquickcheck", "NFKC_Quick_Check"),
("nfkcscf", "NFKC_Simple_Casefold"),
("nfkcsimplecasefold", "NFKC_Simple_Casefold"),
("nfkdqc", "NFKD_Quick_Check"),
("nfkdquickcheck", "NFKD_Quick_Check"),
("noncharactercodepoint", "Noncharacter_Code_Point"),
("nt", "Numeric_Type"),
("numerictype", "Numeric_Type"),
("numericvalue", "Numeric_Value"),
("nv", "Numeric_Value"),
("oalpha", "Other_Alphabetic"),
("ocomment", "ISO_Comment"),
("odi", "Other_Default_Ignorable_Code_Point"),
("ogrext", "Other_Grapheme_Extend"),
("oidc", "Other_ID_Continue"),
("oids", "Other_ID_Start"),
("olower", "Other_Lowercase"),
("omath", "Other_Math"),
("otheralphabetic", "Other_Alphabetic"),
("otherdefaultignorablecodepoint", "Other_Default_Ignorable_Code_Point"),
("othergraphemeextend", "Other_Grapheme_Extend"),
("otheridcontinue", "Other_ID_Continue"),
("otheridstart", "Other_ID_Start"),
("otherlowercase", "Other_Lowercase"),
("othermath", "Other_Math"),
("otheruppercase", "Other_Uppercase"),
("oupper", "Other_Uppercase"),
("patsyn", "Pattern_Syntax"),
("patternsyntax", "Pattern_Syntax"),
("patternwhitespace", "Pattern_White_Space"),
("patws", "Pattern_White_Space"),
("pcm", "Prepended_Concatenation_Mark"),
("prependedconcatenationmark", "Prepended_Concatenation_Mark"),
("qmark", "Quotation_Mark"),
("quotationmark", "Quotation_Mark"),
("radical", "Radical"),
("regionalindicator", "Regional_Indicator"),
("ri", "Regional_Indicator"),
("sb", "Sentence_Break"),
("sc", "Script"),
("scf", "Simple_Case_Folding"),
("script", "Script"),
("scriptextensions", "Script_Extensions"),
("scx", "Script_Extensions"),
("sd", "Soft_Dotted"),
("sentencebreak", "Sentence_Break"),
("sentenceterminal", "Sentence_Terminal"),
("sfc", "Simple_Case_Folding"),
("simplecasefolding", "Simple_Case_Folding"),
("simplelowercasemapping", "Simple_Lowercase_Mapping"),
("simpletitlecasemapping", "Simple_Titlecase_Mapping"),
("simpleuppercasemapping", "Simple_Uppercase_Mapping"),
("slc", "Simple_Lowercase_Mapping"),
("softdotted", "Soft_Dotted"),
("space", "White_Space"),
("stc", "Simple_Titlecase_Mapping"),
("sterm", "Sentence_Terminal"),
("suc", "Simple_Uppercase_Mapping"),
("tc", "Titlecase_Mapping"),
("term", "Terminal_Punctuation"),
("terminalpunctuation", "Terminal_Punctuation"),
("titlecasemapping", "Titlecase_Mapping"),
("uc", "Uppercase_Mapping"),
("uideo", "Unified_Ideograph"),
("unicode1name", "Unicode_1_Name"),
("unicoderadicalstroke", "kRSUnicode"),
("unifiedideograph", "Unified_Ideograph"),
("upper", "Uppercase"),
("uppercase", "Uppercase"),
("uppercasemapping", "Uppercase_Mapping"),
("urs", "kRSUnicode"),
("variationselector", "Variation_Selector"),
("verticalorientation", "Vertical_Orientation"),
("vo", "Vertical_Orientation"),
("vs", "Variation_Selector"),
("wb", "Word_Break"),
("whitespace", "White_Space"),
("wordbreak", "Word_Break"),
("wspace", "White_Space"),
("xidc", "XID_Continue"),
("xidcontinue", "XID_Continue"),
("xids", "XID_Start"),
("xidstart", "XID_Start"),
("xonfc", "Expands_On_NFC"),
("xonfd", "Expands_On_NFD"),
("xonfkc", "Expands_On_NFKC"),
("xonfkd", "Expands_On_NFKD"),
];

View File

@@ -0,0 +1,956 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-values ucd-16.0.0 --include gc,script,scx,age,gcb,wb,sb
//
// Unicode version: 16.0.0.
//
// ucd-generate 0.3.1 is available on crates.io.
pub const PROPERTY_VALUES: &'static [(
&'static str,
&'static [(&'static str, &'static str)],
)] = &[
(
"Age",
&[
("1.1", "V1_1"),
("10.0", "V10_0"),
("11.0", "V11_0"),
("12.0", "V12_0"),
("12.1", "V12_1"),
("13.0", "V13_0"),
("14.0", "V14_0"),
("15.0", "V15_0"),
("15.1", "V15_1"),
("16.0", "V16_0"),
("2.0", "V2_0"),
("2.1", "V2_1"),
("3.0", "V3_0"),
("3.1", "V3_1"),
("3.2", "V3_2"),
("4.0", "V4_0"),
("4.1", "V4_1"),
("5.0", "V5_0"),
("5.1", "V5_1"),
("5.2", "V5_2"),
("6.0", "V6_0"),
("6.1", "V6_1"),
("6.2", "V6_2"),
("6.3", "V6_3"),
("7.0", "V7_0"),
("8.0", "V8_0"),
("9.0", "V9_0"),
("na", "Unassigned"),
("unassigned", "Unassigned"),
("v100", "V10_0"),
("v11", "V1_1"),
("v110", "V11_0"),
("v120", "V12_0"),
("v121", "V12_1"),
("v130", "V13_0"),
("v140", "V14_0"),
("v150", "V15_0"),
("v151", "V15_1"),
("v160", "V16_0"),
("v20", "V2_0"),
("v21", "V2_1"),
("v30", "V3_0"),
("v31", "V3_1"),
("v32", "V3_2"),
("v40", "V4_0"),
("v41", "V4_1"),
("v50", "V5_0"),
("v51", "V5_1"),
("v52", "V5_2"),
("v60", "V6_0"),
("v61", "V6_1"),
("v62", "V6_2"),
("v63", "V6_3"),
("v70", "V7_0"),
("v80", "V8_0"),
("v90", "V9_0"),
],
),
(
"General_Category",
&[
("c", "Other"),
("casedletter", "Cased_Letter"),
("cc", "Control"),
("cf", "Format"),
("closepunctuation", "Close_Punctuation"),
("cn", "Unassigned"),
("cntrl", "Control"),
("co", "Private_Use"),
("combiningmark", "Mark"),
("connectorpunctuation", "Connector_Punctuation"),
("control", "Control"),
("cs", "Surrogate"),
("currencysymbol", "Currency_Symbol"),
("dashpunctuation", "Dash_Punctuation"),
("decimalnumber", "Decimal_Number"),
("digit", "Decimal_Number"),
("enclosingmark", "Enclosing_Mark"),
("finalpunctuation", "Final_Punctuation"),
("format", "Format"),
("initialpunctuation", "Initial_Punctuation"),
("l", "Letter"),
("lc", "Cased_Letter"),
("letter", "Letter"),
("letternumber", "Letter_Number"),
("lineseparator", "Line_Separator"),
("ll", "Lowercase_Letter"),
("lm", "Modifier_Letter"),
("lo", "Other_Letter"),
("lowercaseletter", "Lowercase_Letter"),
("lt", "Titlecase_Letter"),
("lu", "Uppercase_Letter"),
("m", "Mark"),
("mark", "Mark"),
("mathsymbol", "Math_Symbol"),
("mc", "Spacing_Mark"),
("me", "Enclosing_Mark"),
("mn", "Nonspacing_Mark"),
("modifierletter", "Modifier_Letter"),
("modifiersymbol", "Modifier_Symbol"),
("n", "Number"),
("nd", "Decimal_Number"),
("nl", "Letter_Number"),
("no", "Other_Number"),
("nonspacingmark", "Nonspacing_Mark"),
("number", "Number"),
("openpunctuation", "Open_Punctuation"),
("other", "Other"),
("otherletter", "Other_Letter"),
("othernumber", "Other_Number"),
("otherpunctuation", "Other_Punctuation"),
("othersymbol", "Other_Symbol"),
("p", "Punctuation"),
("paragraphseparator", "Paragraph_Separator"),
("pc", "Connector_Punctuation"),
("pd", "Dash_Punctuation"),
("pe", "Close_Punctuation"),
("pf", "Final_Punctuation"),
("pi", "Initial_Punctuation"),
("po", "Other_Punctuation"),
("privateuse", "Private_Use"),
("ps", "Open_Punctuation"),
("punct", "Punctuation"),
("punctuation", "Punctuation"),
("s", "Symbol"),
("sc", "Currency_Symbol"),
("separator", "Separator"),
("sk", "Modifier_Symbol"),
("sm", "Math_Symbol"),
("so", "Other_Symbol"),
("spaceseparator", "Space_Separator"),
("spacingmark", "Spacing_Mark"),
("surrogate", "Surrogate"),
("symbol", "Symbol"),
("titlecaseletter", "Titlecase_Letter"),
("unassigned", "Unassigned"),
("uppercaseletter", "Uppercase_Letter"),
("z", "Separator"),
("zl", "Line_Separator"),
("zp", "Paragraph_Separator"),
("zs", "Space_Separator"),
],
),
(
"Grapheme_Cluster_Break",
&[
("cn", "Control"),
("control", "Control"),
("cr", "CR"),
("eb", "E_Base"),
("ebase", "E_Base"),
("ebasegaz", "E_Base_GAZ"),
("ebg", "E_Base_GAZ"),
("em", "E_Modifier"),
("emodifier", "E_Modifier"),
("ex", "Extend"),
("extend", "Extend"),
("gaz", "Glue_After_Zwj"),
("glueafterzwj", "Glue_After_Zwj"),
("l", "L"),
("lf", "LF"),
("lv", "LV"),
("lvt", "LVT"),
("other", "Other"),
("pp", "Prepend"),
("prepend", "Prepend"),
("regionalindicator", "Regional_Indicator"),
("ri", "Regional_Indicator"),
("sm", "SpacingMark"),
("spacingmark", "SpacingMark"),
("t", "T"),
("v", "V"),
("xx", "Other"),
("zwj", "ZWJ"),
],
),
(
"Script",
&[
("adlam", "Adlam"),
("adlm", "Adlam"),
("aghb", "Caucasian_Albanian"),
("ahom", "Ahom"),
("anatolianhieroglyphs", "Anatolian_Hieroglyphs"),
("arab", "Arabic"),
("arabic", "Arabic"),
("armenian", "Armenian"),
("armi", "Imperial_Aramaic"),
("armn", "Armenian"),
("avestan", "Avestan"),
("avst", "Avestan"),
("bali", "Balinese"),
("balinese", "Balinese"),
("bamu", "Bamum"),
("bamum", "Bamum"),
("bass", "Bassa_Vah"),
("bassavah", "Bassa_Vah"),
("batak", "Batak"),
("batk", "Batak"),
("beng", "Bengali"),
("bengali", "Bengali"),
("bhaiksuki", "Bhaiksuki"),
("bhks", "Bhaiksuki"),
("bopo", "Bopomofo"),
("bopomofo", "Bopomofo"),
("brah", "Brahmi"),
("brahmi", "Brahmi"),
("brai", "Braille"),
("braille", "Braille"),
("bugi", "Buginese"),
("buginese", "Buginese"),
("buhd", "Buhid"),
("buhid", "Buhid"),
("cakm", "Chakma"),
("canadianaboriginal", "Canadian_Aboriginal"),
("cans", "Canadian_Aboriginal"),
("cari", "Carian"),
("carian", "Carian"),
("caucasianalbanian", "Caucasian_Albanian"),
("chakma", "Chakma"),
("cham", "Cham"),
("cher", "Cherokee"),
("cherokee", "Cherokee"),
("chorasmian", "Chorasmian"),
("chrs", "Chorasmian"),
("common", "Common"),
("copt", "Coptic"),
("coptic", "Coptic"),
("cpmn", "Cypro_Minoan"),
("cprt", "Cypriot"),
("cuneiform", "Cuneiform"),
("cypriot", "Cypriot"),
("cyprominoan", "Cypro_Minoan"),
("cyrillic", "Cyrillic"),
("cyrl", "Cyrillic"),
("deseret", "Deseret"),
("deva", "Devanagari"),
("devanagari", "Devanagari"),
("diak", "Dives_Akuru"),
("divesakuru", "Dives_Akuru"),
("dogr", "Dogra"),
("dogra", "Dogra"),
("dsrt", "Deseret"),
("dupl", "Duployan"),
("duployan", "Duployan"),
("egyp", "Egyptian_Hieroglyphs"),
("egyptianhieroglyphs", "Egyptian_Hieroglyphs"),
("elba", "Elbasan"),
("elbasan", "Elbasan"),
("elym", "Elymaic"),
("elymaic", "Elymaic"),
("ethi", "Ethiopic"),
("ethiopic", "Ethiopic"),
("gara", "Garay"),
("garay", "Garay"),
("geor", "Georgian"),
("georgian", "Georgian"),
("glag", "Glagolitic"),
("glagolitic", "Glagolitic"),
("gong", "Gunjala_Gondi"),
("gonm", "Masaram_Gondi"),
("goth", "Gothic"),
("gothic", "Gothic"),
("gran", "Grantha"),
("grantha", "Grantha"),
("greek", "Greek"),
("grek", "Greek"),
("gujarati", "Gujarati"),
("gujr", "Gujarati"),
("gukh", "Gurung_Khema"),
("gunjalagondi", "Gunjala_Gondi"),
("gurmukhi", "Gurmukhi"),
("guru", "Gurmukhi"),
("gurungkhema", "Gurung_Khema"),
("han", "Han"),
("hang", "Hangul"),
("hangul", "Hangul"),
("hani", "Han"),
("hanifirohingya", "Hanifi_Rohingya"),
("hano", "Hanunoo"),
("hanunoo", "Hanunoo"),
("hatr", "Hatran"),
("hatran", "Hatran"),
("hebr", "Hebrew"),
("hebrew", "Hebrew"),
("hira", "Hiragana"),
("hiragana", "Hiragana"),
("hluw", "Anatolian_Hieroglyphs"),
("hmng", "Pahawh_Hmong"),
("hmnp", "Nyiakeng_Puachue_Hmong"),
("hrkt", "Katakana_Or_Hiragana"),
("hung", "Old_Hungarian"),
("imperialaramaic", "Imperial_Aramaic"),
("inherited", "Inherited"),
("inscriptionalpahlavi", "Inscriptional_Pahlavi"),
("inscriptionalparthian", "Inscriptional_Parthian"),
("ital", "Old_Italic"),
("java", "Javanese"),
("javanese", "Javanese"),
("kaithi", "Kaithi"),
("kali", "Kayah_Li"),
("kana", "Katakana"),
("kannada", "Kannada"),
("katakana", "Katakana"),
("katakanaorhiragana", "Katakana_Or_Hiragana"),
("kawi", "Kawi"),
("kayahli", "Kayah_Li"),
("khar", "Kharoshthi"),
("kharoshthi", "Kharoshthi"),
("khitansmallscript", "Khitan_Small_Script"),
("khmer", "Khmer"),
("khmr", "Khmer"),
("khoj", "Khojki"),
("khojki", "Khojki"),
("khudawadi", "Khudawadi"),
("kiratrai", "Kirat_Rai"),
("kits", "Khitan_Small_Script"),
("knda", "Kannada"),
("krai", "Kirat_Rai"),
("kthi", "Kaithi"),
("lana", "Tai_Tham"),
("lao", "Lao"),
("laoo", "Lao"),
("latin", "Latin"),
("latn", "Latin"),
("lepc", "Lepcha"),
("lepcha", "Lepcha"),
("limb", "Limbu"),
("limbu", "Limbu"),
("lina", "Linear_A"),
("linb", "Linear_B"),
("lineara", "Linear_A"),
("linearb", "Linear_B"),
("lisu", "Lisu"),
("lyci", "Lycian"),
("lycian", "Lycian"),
("lydi", "Lydian"),
("lydian", "Lydian"),
("mahajani", "Mahajani"),
("mahj", "Mahajani"),
("maka", "Makasar"),
("makasar", "Makasar"),
("malayalam", "Malayalam"),
("mand", "Mandaic"),
("mandaic", "Mandaic"),
("mani", "Manichaean"),
("manichaean", "Manichaean"),
("marc", "Marchen"),
("marchen", "Marchen"),
("masaramgondi", "Masaram_Gondi"),
("medefaidrin", "Medefaidrin"),
("medf", "Medefaidrin"),
("meeteimayek", "Meetei_Mayek"),
("mend", "Mende_Kikakui"),
("mendekikakui", "Mende_Kikakui"),
("merc", "Meroitic_Cursive"),
("mero", "Meroitic_Hieroglyphs"),
("meroiticcursive", "Meroitic_Cursive"),
("meroitichieroglyphs", "Meroitic_Hieroglyphs"),
("miao", "Miao"),
("mlym", "Malayalam"),
("modi", "Modi"),
("mong", "Mongolian"),
("mongolian", "Mongolian"),
("mro", "Mro"),
("mroo", "Mro"),
("mtei", "Meetei_Mayek"),
("mult", "Multani"),
("multani", "Multani"),
("myanmar", "Myanmar"),
("mymr", "Myanmar"),
("nabataean", "Nabataean"),
("nagm", "Nag_Mundari"),
("nagmundari", "Nag_Mundari"),
("nand", "Nandinagari"),
("nandinagari", "Nandinagari"),
("narb", "Old_North_Arabian"),
("nbat", "Nabataean"),
("newa", "Newa"),
("newtailue", "New_Tai_Lue"),
("nko", "Nko"),
("nkoo", "Nko"),
("nshu", "Nushu"),
("nushu", "Nushu"),
("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"),
("ogam", "Ogham"),
("ogham", "Ogham"),
("olchiki", "Ol_Chiki"),
("olck", "Ol_Chiki"),
("oldhungarian", "Old_Hungarian"),
("olditalic", "Old_Italic"),
("oldnortharabian", "Old_North_Arabian"),
("oldpermic", "Old_Permic"),
("oldpersian", "Old_Persian"),
("oldsogdian", "Old_Sogdian"),
("oldsoutharabian", "Old_South_Arabian"),
("oldturkic", "Old_Turkic"),
("olduyghur", "Old_Uyghur"),
("olonal", "Ol_Onal"),
("onao", "Ol_Onal"),
("oriya", "Oriya"),
("orkh", "Old_Turkic"),
("orya", "Oriya"),
("osage", "Osage"),
("osge", "Osage"),
("osma", "Osmanya"),
("osmanya", "Osmanya"),
("ougr", "Old_Uyghur"),
("pahawhhmong", "Pahawh_Hmong"),
("palm", "Palmyrene"),
("palmyrene", "Palmyrene"),
("pauc", "Pau_Cin_Hau"),
("paucinhau", "Pau_Cin_Hau"),
("perm", "Old_Permic"),
("phag", "Phags_Pa"),
("phagspa", "Phags_Pa"),
("phli", "Inscriptional_Pahlavi"),
("phlp", "Psalter_Pahlavi"),
("phnx", "Phoenician"),
("phoenician", "Phoenician"),
("plrd", "Miao"),
("prti", "Inscriptional_Parthian"),
("psalterpahlavi", "Psalter_Pahlavi"),
("qaac", "Coptic"),
("qaai", "Inherited"),
("rejang", "Rejang"),
("rjng", "Rejang"),
("rohg", "Hanifi_Rohingya"),
("runic", "Runic"),
("runr", "Runic"),
("samaritan", "Samaritan"),
("samr", "Samaritan"),
("sarb", "Old_South_Arabian"),
("saur", "Saurashtra"),
("saurashtra", "Saurashtra"),
("sgnw", "SignWriting"),
("sharada", "Sharada"),
("shavian", "Shavian"),
("shaw", "Shavian"),
("shrd", "Sharada"),
("sidd", "Siddham"),
("siddham", "Siddham"),
("signwriting", "SignWriting"),
("sind", "Khudawadi"),
("sinh", "Sinhala"),
("sinhala", "Sinhala"),
("sogd", "Sogdian"),
("sogdian", "Sogdian"),
("sogo", "Old_Sogdian"),
("sora", "Sora_Sompeng"),
("sorasompeng", "Sora_Sompeng"),
("soyo", "Soyombo"),
("soyombo", "Soyombo"),
("sund", "Sundanese"),
("sundanese", "Sundanese"),
("sunu", "Sunuwar"),
("sunuwar", "Sunuwar"),
("sylo", "Syloti_Nagri"),
("sylotinagri", "Syloti_Nagri"),
("syrc", "Syriac"),
("syriac", "Syriac"),
("tagalog", "Tagalog"),
("tagb", "Tagbanwa"),
("tagbanwa", "Tagbanwa"),
("taile", "Tai_Le"),
("taitham", "Tai_Tham"),
("taiviet", "Tai_Viet"),
("takr", "Takri"),
("takri", "Takri"),
("tale", "Tai_Le"),
("talu", "New_Tai_Lue"),
("tamil", "Tamil"),
("taml", "Tamil"),
("tang", "Tangut"),
("tangsa", "Tangsa"),
("tangut", "Tangut"),
("tavt", "Tai_Viet"),
("telu", "Telugu"),
("telugu", "Telugu"),
("tfng", "Tifinagh"),
("tglg", "Tagalog"),
("thaa", "Thaana"),
("thaana", "Thaana"),
("thai", "Thai"),
("tibetan", "Tibetan"),
("tibt", "Tibetan"),
("tifinagh", "Tifinagh"),
("tirh", "Tirhuta"),
("tirhuta", "Tirhuta"),
("tnsa", "Tangsa"),
("todhri", "Todhri"),
("todr", "Todhri"),
("toto", "Toto"),
("tulutigalari", "Tulu_Tigalari"),
("tutg", "Tulu_Tigalari"),
("ugar", "Ugaritic"),
("ugaritic", "Ugaritic"),
("unknown", "Unknown"),
("vai", "Vai"),
("vaii", "Vai"),
("vith", "Vithkuqi"),
("vithkuqi", "Vithkuqi"),
("wancho", "Wancho"),
("wara", "Warang_Citi"),
("warangciti", "Warang_Citi"),
("wcho", "Wancho"),
("xpeo", "Old_Persian"),
("xsux", "Cuneiform"),
("yezi", "Yezidi"),
("yezidi", "Yezidi"),
("yi", "Yi"),
("yiii", "Yi"),
("zanabazarsquare", "Zanabazar_Square"),
("zanb", "Zanabazar_Square"),
("zinh", "Inherited"),
("zyyy", "Common"),
("zzzz", "Unknown"),
],
),
(
"Script_Extensions",
&[
("adlam", "Adlam"),
("adlm", "Adlam"),
("aghb", "Caucasian_Albanian"),
("ahom", "Ahom"),
("anatolianhieroglyphs", "Anatolian_Hieroglyphs"),
("arab", "Arabic"),
("arabic", "Arabic"),
("armenian", "Armenian"),
("armi", "Imperial_Aramaic"),
("armn", "Armenian"),
("avestan", "Avestan"),
("avst", "Avestan"),
("bali", "Balinese"),
("balinese", "Balinese"),
("bamu", "Bamum"),
("bamum", "Bamum"),
("bass", "Bassa_Vah"),
("bassavah", "Bassa_Vah"),
("batak", "Batak"),
("batk", "Batak"),
("beng", "Bengali"),
("bengali", "Bengali"),
("bhaiksuki", "Bhaiksuki"),
("bhks", "Bhaiksuki"),
("bopo", "Bopomofo"),
("bopomofo", "Bopomofo"),
("brah", "Brahmi"),
("brahmi", "Brahmi"),
("brai", "Braille"),
("braille", "Braille"),
("bugi", "Buginese"),
("buginese", "Buginese"),
("buhd", "Buhid"),
("buhid", "Buhid"),
("cakm", "Chakma"),
("canadianaboriginal", "Canadian_Aboriginal"),
("cans", "Canadian_Aboriginal"),
("cari", "Carian"),
("carian", "Carian"),
("caucasianalbanian", "Caucasian_Albanian"),
("chakma", "Chakma"),
("cham", "Cham"),
("cher", "Cherokee"),
("cherokee", "Cherokee"),
("chorasmian", "Chorasmian"),
("chrs", "Chorasmian"),
("common", "Common"),
("copt", "Coptic"),
("coptic", "Coptic"),
("cpmn", "Cypro_Minoan"),
("cprt", "Cypriot"),
("cuneiform", "Cuneiform"),
("cypriot", "Cypriot"),
("cyprominoan", "Cypro_Minoan"),
("cyrillic", "Cyrillic"),
("cyrl", "Cyrillic"),
("deseret", "Deseret"),
("deva", "Devanagari"),
("devanagari", "Devanagari"),
("diak", "Dives_Akuru"),
("divesakuru", "Dives_Akuru"),
("dogr", "Dogra"),
("dogra", "Dogra"),
("dsrt", "Deseret"),
("dupl", "Duployan"),
("duployan", "Duployan"),
("egyp", "Egyptian_Hieroglyphs"),
("egyptianhieroglyphs", "Egyptian_Hieroglyphs"),
("elba", "Elbasan"),
("elbasan", "Elbasan"),
("elym", "Elymaic"),
("elymaic", "Elymaic"),
("ethi", "Ethiopic"),
("ethiopic", "Ethiopic"),
("gara", "Garay"),
("garay", "Garay"),
("geor", "Georgian"),
("georgian", "Georgian"),
("glag", "Glagolitic"),
("glagolitic", "Glagolitic"),
("gong", "Gunjala_Gondi"),
("gonm", "Masaram_Gondi"),
("goth", "Gothic"),
("gothic", "Gothic"),
("gran", "Grantha"),
("grantha", "Grantha"),
("greek", "Greek"),
("grek", "Greek"),
("gujarati", "Gujarati"),
("gujr", "Gujarati"),
("gukh", "Gurung_Khema"),
("gunjalagondi", "Gunjala_Gondi"),
("gurmukhi", "Gurmukhi"),
("guru", "Gurmukhi"),
("gurungkhema", "Gurung_Khema"),
("han", "Han"),
("hang", "Hangul"),
("hangul", "Hangul"),
("hani", "Han"),
("hanifirohingya", "Hanifi_Rohingya"),
("hano", "Hanunoo"),
("hanunoo", "Hanunoo"),
("hatr", "Hatran"),
("hatran", "Hatran"),
("hebr", "Hebrew"),
("hebrew", "Hebrew"),
("hira", "Hiragana"),
("hiragana", "Hiragana"),
("hluw", "Anatolian_Hieroglyphs"),
("hmng", "Pahawh_Hmong"),
("hmnp", "Nyiakeng_Puachue_Hmong"),
("hrkt", "Katakana_Or_Hiragana"),
("hung", "Old_Hungarian"),
("imperialaramaic", "Imperial_Aramaic"),
("inherited", "Inherited"),
("inscriptionalpahlavi", "Inscriptional_Pahlavi"),
("inscriptionalparthian", "Inscriptional_Parthian"),
("ital", "Old_Italic"),
("java", "Javanese"),
("javanese", "Javanese"),
("kaithi", "Kaithi"),
("kali", "Kayah_Li"),
("kana", "Katakana"),
("kannada", "Kannada"),
("katakana", "Katakana"),
("katakanaorhiragana", "Katakana_Or_Hiragana"),
("kawi", "Kawi"),
("kayahli", "Kayah_Li"),
("khar", "Kharoshthi"),
("kharoshthi", "Kharoshthi"),
("khitansmallscript", "Khitan_Small_Script"),
("khmer", "Khmer"),
("khmr", "Khmer"),
("khoj", "Khojki"),
("khojki", "Khojki"),
("khudawadi", "Khudawadi"),
("kiratrai", "Kirat_Rai"),
("kits", "Khitan_Small_Script"),
("knda", "Kannada"),
("krai", "Kirat_Rai"),
("kthi", "Kaithi"),
("lana", "Tai_Tham"),
("lao", "Lao"),
("laoo", "Lao"),
("latin", "Latin"),
("latn", "Latin"),
("lepc", "Lepcha"),
("lepcha", "Lepcha"),
("limb", "Limbu"),
("limbu", "Limbu"),
("lina", "Linear_A"),
("linb", "Linear_B"),
("lineara", "Linear_A"),
("linearb", "Linear_B"),
("lisu", "Lisu"),
("lyci", "Lycian"),
("lycian", "Lycian"),
("lydi", "Lydian"),
("lydian", "Lydian"),
("mahajani", "Mahajani"),
("mahj", "Mahajani"),
("maka", "Makasar"),
("makasar", "Makasar"),
("malayalam", "Malayalam"),
("mand", "Mandaic"),
("mandaic", "Mandaic"),
("mani", "Manichaean"),
("manichaean", "Manichaean"),
("marc", "Marchen"),
("marchen", "Marchen"),
("masaramgondi", "Masaram_Gondi"),
("medefaidrin", "Medefaidrin"),
("medf", "Medefaidrin"),
("meeteimayek", "Meetei_Mayek"),
("mend", "Mende_Kikakui"),
("mendekikakui", "Mende_Kikakui"),
("merc", "Meroitic_Cursive"),
("mero", "Meroitic_Hieroglyphs"),
("meroiticcursive", "Meroitic_Cursive"),
("meroitichieroglyphs", "Meroitic_Hieroglyphs"),
("miao", "Miao"),
("mlym", "Malayalam"),
("modi", "Modi"),
("mong", "Mongolian"),
("mongolian", "Mongolian"),
("mro", "Mro"),
("mroo", "Mro"),
("mtei", "Meetei_Mayek"),
("mult", "Multani"),
("multani", "Multani"),
("myanmar", "Myanmar"),
("mymr", "Myanmar"),
("nabataean", "Nabataean"),
("nagm", "Nag_Mundari"),
("nagmundari", "Nag_Mundari"),
("nand", "Nandinagari"),
("nandinagari", "Nandinagari"),
("narb", "Old_North_Arabian"),
("nbat", "Nabataean"),
("newa", "Newa"),
("newtailue", "New_Tai_Lue"),
("nko", "Nko"),
("nkoo", "Nko"),
("nshu", "Nushu"),
("nushu", "Nushu"),
("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"),
("ogam", "Ogham"),
("ogham", "Ogham"),
("olchiki", "Ol_Chiki"),
("olck", "Ol_Chiki"),
("oldhungarian", "Old_Hungarian"),
("olditalic", "Old_Italic"),
("oldnortharabian", "Old_North_Arabian"),
("oldpermic", "Old_Permic"),
("oldpersian", "Old_Persian"),
("oldsogdian", "Old_Sogdian"),
("oldsoutharabian", "Old_South_Arabian"),
("oldturkic", "Old_Turkic"),
("olduyghur", "Old_Uyghur"),
("olonal", "Ol_Onal"),
("onao", "Ol_Onal"),
("oriya", "Oriya"),
("orkh", "Old_Turkic"),
("orya", "Oriya"),
("osage", "Osage"),
("osge", "Osage"),
("osma", "Osmanya"),
("osmanya", "Osmanya"),
("ougr", "Old_Uyghur"),
("pahawhhmong", "Pahawh_Hmong"),
("palm", "Palmyrene"),
("palmyrene", "Palmyrene"),
("pauc", "Pau_Cin_Hau"),
("paucinhau", "Pau_Cin_Hau"),
("perm", "Old_Permic"),
("phag", "Phags_Pa"),
("phagspa", "Phags_Pa"),
("phli", "Inscriptional_Pahlavi"),
("phlp", "Psalter_Pahlavi"),
("phnx", "Phoenician"),
("phoenician", "Phoenician"),
("plrd", "Miao"),
("prti", "Inscriptional_Parthian"),
("psalterpahlavi", "Psalter_Pahlavi"),
("qaac", "Coptic"),
("qaai", "Inherited"),
("rejang", "Rejang"),
("rjng", "Rejang"),
("rohg", "Hanifi_Rohingya"),
("runic", "Runic"),
("runr", "Runic"),
("samaritan", "Samaritan"),
("samr", "Samaritan"),
("sarb", "Old_South_Arabian"),
("saur", "Saurashtra"),
("saurashtra", "Saurashtra"),
("sgnw", "SignWriting"),
("sharada", "Sharada"),
("shavian", "Shavian"),
("shaw", "Shavian"),
("shrd", "Sharada"),
("sidd", "Siddham"),
("siddham", "Siddham"),
("signwriting", "SignWriting"),
("sind", "Khudawadi"),
("sinh", "Sinhala"),
("sinhala", "Sinhala"),
("sogd", "Sogdian"),
("sogdian", "Sogdian"),
("sogo", "Old_Sogdian"),
("sora", "Sora_Sompeng"),
("sorasompeng", "Sora_Sompeng"),
("soyo", "Soyombo"),
("soyombo", "Soyombo"),
("sund", "Sundanese"),
("sundanese", "Sundanese"),
("sunu", "Sunuwar"),
("sunuwar", "Sunuwar"),
("sylo", "Syloti_Nagri"),
("sylotinagri", "Syloti_Nagri"),
("syrc", "Syriac"),
("syriac", "Syriac"),
("tagalog", "Tagalog"),
("tagb", "Tagbanwa"),
("tagbanwa", "Tagbanwa"),
("taile", "Tai_Le"),
("taitham", "Tai_Tham"),
("taiviet", "Tai_Viet"),
("takr", "Takri"),
("takri", "Takri"),
("tale", "Tai_Le"),
("talu", "New_Tai_Lue"),
("tamil", "Tamil"),
("taml", "Tamil"),
("tang", "Tangut"),
("tangsa", "Tangsa"),
("tangut", "Tangut"),
("tavt", "Tai_Viet"),
("telu", "Telugu"),
("telugu", "Telugu"),
("tfng", "Tifinagh"),
("tglg", "Tagalog"),
("thaa", "Thaana"),
("thaana", "Thaana"),
("thai", "Thai"),
("tibetan", "Tibetan"),
("tibt", "Tibetan"),
("tifinagh", "Tifinagh"),
("tirh", "Tirhuta"),
("tirhuta", "Tirhuta"),
("tnsa", "Tangsa"),
("todhri", "Todhri"),
("todr", "Todhri"),
("toto", "Toto"),
("tulutigalari", "Tulu_Tigalari"),
("tutg", "Tulu_Tigalari"),
("ugar", "Ugaritic"),
("ugaritic", "Ugaritic"),
("unknown", "Unknown"),
("vai", "Vai"),
("vaii", "Vai"),
("vith", "Vithkuqi"),
("vithkuqi", "Vithkuqi"),
("wancho", "Wancho"),
("wara", "Warang_Citi"),
("warangciti", "Warang_Citi"),
("wcho", "Wancho"),
("xpeo", "Old_Persian"),
("xsux", "Cuneiform"),
("yezi", "Yezidi"),
("yezidi", "Yezidi"),
("yi", "Yi"),
("yiii", "Yi"),
("zanabazarsquare", "Zanabazar_Square"),
("zanb", "Zanabazar_Square"),
("zinh", "Inherited"),
("zyyy", "Common"),
("zzzz", "Unknown"),
],
),
(
"Sentence_Break",
&[
("at", "ATerm"),
("aterm", "ATerm"),
("cl", "Close"),
("close", "Close"),
("cr", "CR"),
("ex", "Extend"),
("extend", "Extend"),
("fo", "Format"),
("format", "Format"),
("le", "OLetter"),
("lf", "LF"),
("lo", "Lower"),
("lower", "Lower"),
("nu", "Numeric"),
("numeric", "Numeric"),
("oletter", "OLetter"),
("other", "Other"),
("sc", "SContinue"),
("scontinue", "SContinue"),
("se", "Sep"),
("sep", "Sep"),
("sp", "Sp"),
("st", "STerm"),
("sterm", "STerm"),
("up", "Upper"),
("upper", "Upper"),
("xx", "Other"),
],
),
(
"Word_Break",
&[
("aletter", "ALetter"),
("cr", "CR"),
("doublequote", "Double_Quote"),
("dq", "Double_Quote"),
("eb", "E_Base"),
("ebase", "E_Base"),
("ebasegaz", "E_Base_GAZ"),
("ebg", "E_Base_GAZ"),
("em", "E_Modifier"),
("emodifier", "E_Modifier"),
("ex", "ExtendNumLet"),
("extend", "Extend"),
("extendnumlet", "ExtendNumLet"),
("fo", "Format"),
("format", "Format"),
("gaz", "Glue_After_Zwj"),
("glueafterzwj", "Glue_After_Zwj"),
("hebrewletter", "Hebrew_Letter"),
("hl", "Hebrew_Letter"),
("ka", "Katakana"),
("katakana", "Katakana"),
("le", "ALetter"),
("lf", "LF"),
("mb", "MidNumLet"),
("midletter", "MidLetter"),
("midnum", "MidNum"),
("midnumlet", "MidNumLet"),
("ml", "MidLetter"),
("mn", "MidNum"),
("newline", "Newline"),
("nl", "Newline"),
("nu", "Numeric"),
("numeric", "Numeric"),
("other", "Other"),
("regionalindicator", "Regional_Indicator"),
("ri", "Regional_Indicator"),
("singlequote", "Single_Quote"),
("sq", "Single_Quote"),
("wsegspace", "WSegSpace"),
("xx", "Other"),
("zwj", "ZWJ"),
],
),
];

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

592
vendor/regex-syntax/src/utf8.rs vendored Normal file
View File

@@ -0,0 +1,592 @@
/*!
Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
This sub-module is useful for constructing byte-based automatons that need
to embed UTF-8 decoding. The most common use of this module is in conjunction
with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type.
See the documentation on the `Utf8Sequences` iterator for more details and
an example.
# Wait, what is this?
This is simplest to explain with an example. Let's say you wanted to test
whether a particular byte sequence was a Cyrillic character. One possible
scalar value range is `[0400-04FF]`. The set of allowed bytes for this
range can be expressed as a sequence of byte ranges:
```text
[D0-D3][80-BF]
```
This is simple enough: simply encode the boundaries, `0400` encodes to
`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
However, what if you wanted to add the Cyrillic Supplementary characters to
your range? Your range might then become `[0400-052F]`. The same procedure
as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
this isn't quite correct because this range doesn't capture many characters,
for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
Instead, you need multiple sequences of byte ranges:
```text
[D0-D3][80-BF] # matches codepoints 0400-04FF
[D4][80-AF] # matches codepoints 0500-052F
```
This gets even more complicated if you want bigger ranges, particularly if
they naively contain surrogate codepoints. For example, the sequence of byte
ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
```text
[0-7F]
[C2-DF][80-BF]
[E0][A0-BF][80-BF]
[E1-EC][80-BF][80-BF]
[ED][80-9F][80-BF]
[EE-EF][80-BF][80-BF]
```
Note that the byte ranges above will *not* match any erroneous encoding of
UTF-8, including encodings of surrogate codepoints.
And, of course, for all of Unicode (`[000000-10FFFF]`):
```text
[0-7F]
[C2-DF][80-BF]
[E0][A0-BF][80-BF]
[E1-EC][80-BF][80-BF]
[ED][80-9F][80-BF]
[EE-EF][80-BF][80-BF]
[F0][90-BF][80-BF][80-BF]
[F1-F3][80-BF][80-BF][80-BF]
[F4][80-8F][80-BF][80-BF]
```
This module automates the process of creating these byte ranges from ranges of
Unicode scalar values.
# Lineage
I got the idea and general implementation strategy from Russ Cox in his
[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
I also got the idea from
[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
which uses it for executing automata on their term index.
*/
use core::{char, fmt, iter::FusedIterator, slice};
use alloc::{vec, vec::Vec};
const MAX_UTF8_BYTES: usize = 4;
/// Utf8Sequence represents a sequence of byte ranges.
///
/// To match a Utf8Sequence, a candidate byte sequence must match each
/// successive range.
///
/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
// One variant per possible UTF-8 encoded length (1 through 4 bytes).
pub enum Utf8Sequence {
    /// One byte range.
    One(Utf8Range),
    /// Two successive byte ranges.
    Two([Utf8Range; 2]),
    /// Three successive byte ranges.
    Three([Utf8Range; 3]),
    /// Four successive byte ranges.
    Four([Utf8Range; 4]),
}
impl Utf8Sequence {
/// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
/// range.
///
/// This assumes that `start` and `end` have the same length.
fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
assert_eq!(start.len(), end.len());
match start.len() {
2 => Utf8Sequence::Two([
Utf8Range::new(start[0], end[0]),
Utf8Range::new(start[1], end[1]),
]),
3 => Utf8Sequence::Three([
Utf8Range::new(start[0], end[0]),
Utf8Range::new(start[1], end[1]),
Utf8Range::new(start[2], end[2]),
]),
4 => Utf8Sequence::Four([
Utf8Range::new(start[0], end[0]),
Utf8Range::new(start[1], end[1]),
Utf8Range::new(start[2], end[2]),
Utf8Range::new(start[3], end[3]),
]),
n => unreachable!("invalid encoded length: {}", n),
}
}
/// Returns the underlying sequence of byte ranges as a slice.
pub fn as_slice(&self) -> &[Utf8Range] {
use self::Utf8Sequence::*;
match *self {
One(ref r) => slice::from_ref(r),
Two(ref r) => &r[..],
Three(ref r) => &r[..],
Four(ref r) => &r[..],
}
}
/// Returns the number of byte ranges in this sequence.
///
/// The length is guaranteed to be in the closed interval `[1, 4]`.
pub fn len(&self) -> usize {
self.as_slice().len()
}
/// Reverses the ranges in this sequence.
///
/// For example, if this corresponds to the following sequence:
///
/// ```text
/// [D0-D3][80-BF]
/// ```
///
/// Then after reversal, it will be
///
/// ```text
/// [80-BF][D0-D3]
/// ```
///
/// This is useful when one is constructing a UTF-8 automaton to match
/// character classes in reverse.
pub fn reverse(&mut self) {
match *self {
Utf8Sequence::One(_) => {}
Utf8Sequence::Two(ref mut x) => x.reverse(),
Utf8Sequence::Three(ref mut x) => x.reverse(),
Utf8Sequence::Four(ref mut x) => x.reverse(),
}
}
/// Returns true if and only if a prefix of `bytes` matches this sequence
/// of byte ranges.
pub fn matches(&self, bytes: &[u8]) -> bool {
if bytes.len() < self.len() {
return false;
}
for (&b, r) in bytes.iter().zip(self) {
if !r.matches(b) {
return false;
}
}
true
}
}
// Lets a `&Utf8Sequence` be iterated directly over its byte ranges
// (used by `Utf8Sequence::matches` via `zip`).
impl<'a> IntoIterator for &'a Utf8Sequence {
    type IntoIter = slice::Iter<'a, Utf8Range>;
    type Item = &'a Utf8Range;
    fn into_iter(self) -> Self::IntoIter {
        self.as_slice().iter()
    }
}
impl fmt::Debug for Utf8Sequence {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use self::Utf8Sequence::*;
match *self {
One(ref r) => write!(f, "{:?}", r),
Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
Four(ref r) => {
write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
}
}
}
}
/// A single inclusive range of UTF-8 bytes.
///
/// These are produced as the components of a `Utf8Sequence` by the
/// `Utf8Sequences` iterator.
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
pub struct Utf8Range {
    /// Start of byte range (inclusive).
    pub start: u8,
    /// End of byte range (inclusive).
    pub end: u8,
}
impl Utf8Range {
    /// Constructs a new inclusive byte range.
    fn new(start: u8, end: u8) -> Self {
        Utf8Range { start, end }
    }

    /// Returns true if and only if the given byte is in this range
    /// (both endpoints included).
    pub fn matches(&self, b: u8) -> bool {
        (self.start..=self.end).contains(&b)
    }
}
impl fmt::Debug for Utf8Range {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // A degenerate range prints as a single byte, e.g. `[61]`;
        // otherwise both endpoints are shown, e.g. `[61-7A]`.
        write!(f, "[{:X}", self.start)?;
        if self.start != self.end {
            write!(f, "-{:X}", self.end)?;
        }
        write!(f, "]")
    }
}
/// An iterator over ranges of matching UTF-8 byte sequences.
///
/// The iteration represents an alternation of comprehensive byte sequences
/// that match precisely the set of UTF-8 encoded scalar values.
///
/// A byte sequence corresponds to one of the scalar values in the range given
/// if and only if it completely matches exactly one of the sequences of byte
/// ranges produced by this iterator.
///
/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
/// sequences will match the same bytes.
///
/// # Example
///
/// This shows how to match an arbitrary byte sequence against a range of
/// scalar values.
///
/// ```rust
/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
///
/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
/// for range in seqs {
/// if range.matches(bytes) {
/// return true;
/// }
/// }
/// false
/// }
///
/// // Test the basic multilingual plane.
/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
///
/// // UTF-8 encoding of 'a'.
/// assert!(matches(&seqs, &[0x61]));
/// // UTF-8 encoding of '☃' (`\u{2603}`).
/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
/// // which is invalid UTF-8, and therefore fails, despite the fact that
/// // the corresponding codepoint (0xD800) falls in the range given.
/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
/// // And fails against plain old invalid UTF-8.
/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
/// ```
///
/// If this example seems circuitous, that's because it is! It's meant to be
/// illustrative. In practice, you could just try to decode your byte sequence
/// and compare it with the scalar value range directly. However, this is not
/// always possible (for example, in a byte based automaton).
#[derive(Debug)]
pub struct Utf8Sequences {
    // Unprocessed scalar value ranges; `next` pops one and subdivides it
    // until a piece can be emitted as a single `Utf8Sequence`, pushing the
    // remainders back here.
    range_stack: Vec<ScalarRange>,
}
impl Utf8Sequences {
    /// Create a new iterator over UTF-8 byte ranges for the scalar value
    /// range given.
    pub fn new(start: char, end: char) -> Self {
        let mut seqs = Utf8Sequences { range_stack: Vec::new() };
        seqs.push(u32::from(start), u32::from(end));
        seqs
    }

    /// reset resets the scalar value range.
    /// Any existing state is cleared, but resources may be reused.
    ///
    /// N.B. Benchmarks say that this method is dubious.
    #[doc(hidden)]
    pub fn reset(&mut self, start: char, end: char) {
        self.range_stack.clear();
        self.push(u32::from(start), u32::from(end));
    }

    /// Pushes an unprocessed scalar value range onto the stack.
    fn push(&mut self, start: u32, end: u32) {
        self.range_stack.push(ScalarRange { start, end });
    }
}
/// An inclusive range of `u32` codepoint values. Intermediate ranges produced
/// while splitting may be empty (`start > end`, see `is_valid`) or may
/// straddle the surrogate block until `split` separates them.
struct ScalarRange {
    /// Inclusive lower bound.
    start: u32,
    /// Inclusive upper bound.
    end: u32,
}
impl fmt::Debug for ScalarRange {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Codepoints are conventionally written in hex (e.g. D800), so
        // format the bounds with `{:X}` instead of the derived decimal form.
        let (lo, hi) = (self.start, self.end);
        write!(f, "ScalarRange({:X}, {:X})", lo, hi)
    }
}
impl Iterator for Utf8Sequences {
    type Item = Utf8Sequence;
    // Pops the next pending scalar value range and repeatedly narrows it,
    // pushing each carved-off remainder back onto the stack, until the kept
    // piece can be expressed as one sequence of independent per-byte ranges.
    fn next(&mut self) -> Option<Self::Item> {
        'TOP: while let Some(mut r) = self.range_stack.pop() {
            'INNER: loop {
                // If the range straddles the surrogate block, keep the low
                // half and queue the high half for a later iteration.
                if let Some((r1, r2)) = r.split() {
                    self.push(r2.start, r2.end);
                    r.start = r1.start;
                    r.end = r1.end;
                    continue 'INNER;
                }
                // Splitting can produce an empty (start > end) half; skip it.
                if !r.is_valid() {
                    continue 'TOP;
                }
                // Cut the range at the 1-, 2- and 3-byte encoding maxima so
                // that both endpoints encode to the same number of UTF-8
                // bytes.
                for i in 1..MAX_UTF8_BYTES {
                    let max = max_scalar_value(i);
                    if r.start <= max && max < r.end {
                        self.push(max + 1, r.end);
                        r.end = max;
                        continue 'INNER;
                    }
                }
                // Single-byte (ASCII) ranges need no continuation bytes.
                if let Some(ascii_range) = r.as_ascii() {
                    return Some(Utf8Sequence::One(ascii_range));
                }
                // Each trailing byte of a multi-byte encoding carries 6 bits.
                // If the endpoints disagree above the low `6*i` bits, carve
                // the range so that the piece kept ends with all-ones low
                // bits (first branch) or starts its final block with all-zero
                // low bits (second branch). Only such aligned ranges can be
                // represented as independent per-byte ranges.
                for i in 1..MAX_UTF8_BYTES {
                    let m = (1 << (6 * i)) - 1;
                    if (r.start & !m) != (r.end & !m) {
                        if (r.start & m) != 0 {
                            self.push((r.start | m) + 1, r.end);
                            r.end = r.start | m;
                            continue 'INNER;
                        }
                        if (r.end & m) != m {
                            self.push(r.end & !m, r.end);
                            r.end = (r.end & !m) - 1;
                            continue 'INNER;
                        }
                    }
                }
                // The range is now aligned: encode both endpoints (`encode`
                // asserts they have equal length) and pair them byte-by-byte.
                let mut start = [0; MAX_UTF8_BYTES];
                let mut end = [0; MAX_UTF8_BYTES];
                let n = r.encode(&mut start, &mut end);
                return Some(Utf8Sequence::from_encoded_range(
                    &start[0..n],
                    &end[0..n],
                ));
            }
        }
        None
    }
}
// `next` can never yield `Some` after returning `None`: its only source of
// items is `range_stack`, and popping an empty stack leaves it empty.
impl FusedIterator for Utf8Sequences {}
impl ScalarRange {
    /// Splits this range in two if it overlaps the surrogate codepoint block
    /// `[0xD800, 0xDFFF]`: the part below it and the part above it.
    ///
    /// Either or both of the returned ranges may be invalid (empty).
    fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
        if self.start >= 0xE000 || self.end <= 0xD7FF {
            // No overlap with the surrogate block.
            return None;
        }
        let below = ScalarRange { start: self.start, end: 0xD7FF };
        let above = ScalarRange { start: 0xE000, end: self.end };
        Some((below, above))
    }
    /// Returns true when this range is non-empty, i.e., `start <= end`.
    fn is_valid(&self) -> bool {
        self.start <= self.end
    }
    /// Converts this range to a single-byte `Utf8Range` when every scalar
    /// value in it encodes to one byte; returns `None` otherwise.
    fn as_ascii(&self) -> Option<Utf8Range> {
        if !self.is_ascii() {
            return None;
        }
        // Both bounds are <= 0x7F here, so the narrowing cannot fail.
        let lo = u8::try_from(self.start).unwrap();
        let hi = u8::try_from(self.end).unwrap();
        Some(Utf8Range::new(lo, hi))
    }
    /// Returns true when the range is valid and ASCII-only (every scalar
    /// value encodes to a single byte).
    fn is_ascii(&self) -> bool {
        self.is_valid() && self.end <= 0x7f
    }
    /// Writes the UTF-8 encodings of `self.start` and `self.end` into the
    /// given destination slices and returns the (shared) encoded length.
    ///
    /// The slices should have room for at least `MAX_UTF8_BYTES`.
    fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
        let encoded_start =
            char::from_u32(self.start).unwrap().encode_utf8(start);
        let encoded_end = char::from_u32(self.end).unwrap().encode_utf8(end);
        // Callers only invoke this once both endpoints need the same number
        // of bytes; enforce that invariant.
        assert_eq!(encoded_start.len(), encoded_end.len());
        encoded_start.len()
    }
}
/// Returns the largest scalar value whose UTF-8 encoding fits in exactly
/// `nbytes` bytes.
///
/// # Panics
///
/// Panics when `nbytes` is outside `1..=4`, which is a bug in the caller.
fn max_scalar_value(nbytes: usize) -> u32 {
    // Upper bounds for 1-, 2-, 3- and 4-byte UTF-8 encodings, in order.
    const LIMITS: [u32; 4] = [0x007F, 0x07FF, 0xFFFF, 0x0010_FFFF];
    match nbytes {
        1..=4 => LIMITS[nbytes - 1],
        _ => unreachable!("invalid UTF-8 byte sequence size"),
    }
}
#[cfg(test)]
mod tests {
    use core::char;
    use alloc::{vec, vec::Vec};
    use crate::utf8::{Utf8Range, Utf8Sequences};
    /// Shorthand for constructing a `Utf8Range`.
    fn rutf8(s: u8, e: u8) -> Utf8Range {
        Utf8Range::new(s, e)
    }
    /// Asserts that no byte-range sequence generated for `[start, end]`
    /// matches the (invalid UTF-8) encoding of any surrogate codepoint.
    fn never_accepts_surrogate_codepoints(start: char, end: char) {
        // Walk the entire surrogate block.
        for cp in 0xD800..0xE000 {
            let buf = encode_surrogate(cp);
            for r in Utf8Sequences::new(start, end) {
                if r.matches(&buf) {
                    panic!(
                        "Sequence ({:X}, {:X}) contains range {:?}, \
                         which matches surrogate code point {:X} \
                         with encoded bytes {:?}",
                        u32::from(start),
                        u32::from(end),
                        r,
                        cp,
                        buf,
                    );
                }
            }
        }
    }
    #[test]
    fn codepoints_no_surrogates() {
        // Ranges chosen to surround, contain, and abut the surrogate block.
        never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
        never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
        never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
    }
    #[test]
    fn single_codepoint_one_sequence() {
        // Tests that every range of scalar values that contains a single
        // scalar value is recognized by one sequence of byte ranges.
        for i in 0x0..=0x0010_FFFF {
            let c = match char::from_u32(i) {
                // Skips the surrogate block, which is not a scalar value.
                None => continue,
                Some(c) => c,
            };
            let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
            assert_eq!(seqs.len(), 1);
        }
    }
    #[test]
    fn bmp() {
        use crate::utf8::Utf8Sequence::*;
        // The Basic Multilingual Plane splits into exactly these six
        // sequences; note the restricted second byte after 0xE0 and after
        // 0xED (the latter excludes the surrogate block).
        let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
        assert_eq!(
            seqs,
            vec![
                One(rutf8(0x0, 0x7F)),
                Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
                Three([
                    rutf8(0xE0, 0xE0),
                    rutf8(0xA0, 0xBF),
                    rutf8(0x80, 0xBF)
                ]),
                Three([
                    rutf8(0xE1, 0xEC),
                    rutf8(0x80, 0xBF),
                    rutf8(0x80, 0xBF)
                ]),
                Three([
                    rutf8(0xED, 0xED),
                    rutf8(0x80, 0x9F),
                    rutf8(0x80, 0xBF)
                ]),
                Three([
                    rutf8(0xEE, 0xEF),
                    rutf8(0x80, 0xBF),
                    rutf8(0x80, 0xBF)
                ]),
            ]
        );
    }
    #[test]
    fn reverse() {
        use crate::utf8::Utf8Sequence::*;
        // `reverse` reverses the order of a sequence's byte ranges, for
        // every arity of `Utf8Sequence`.
        let mut s = One(rutf8(0xA, 0xB));
        s.reverse();
        assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
        let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
        s.reverse();
        assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
        let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
        s.reverse();
        assert_eq!(
            s.as_slice(),
            &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
        );
        let mut s = Four([
            rutf8(0xA, 0xB),
            rutf8(0xB, 0xC),
            rutf8(0xC, 0xD),
            rutf8(0xD, 0xE),
        ]);
        s.reverse();
        assert_eq!(
            s.as_slice(),
            &[
                rutf8(0xD, 0xE),
                rutf8(0xC, 0xD),
                rutf8(0xB, 0xC),
                rutf8(0xA, 0xB)
            ]
        );
    }
    /// Produces the three bytes that a UTF-8-style encoding of the surrogate
    /// codepoint `cp` would have. This is deliberately invalid UTF-8.
    fn encode_surrogate(cp: u32) -> [u8; 3] {
        const TAG_CONT: u8 = 0b1000_0000;
        const TAG_THREE_B: u8 = 0b1110_0000;
        assert!(0xD800 <= cp && cp < 0xE000);
        let mut dst = [0; 3];
        dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B;
        dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT;
        dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT;
        dst
    }
}

30
vendor/regex-syntax/test vendored Executable file
View File

@@ -0,0 +1,30 @@
#!/bin/bash
# Convenience script that runs the crate's tests under its default features
# and then under each individual feature in 'no_std'-style isolation.

# Abort on the first command that fails.
set -e

# cd to the directory containing this crate's Cargo.toml so that we don't need
# to pass --manifest-path to every `cargo` command.
cd "$(dirname "$0")"

# This is a convenience script for running a broad swath of the syntax tests.
echo "===== DEFAULT FEATURES ==="
cargo test

# Features to exercise one at a time with --no-default-features below.
features=(
    std
    unicode
    unicode-age
    unicode-bool
    unicode-case
    unicode-gencat
    unicode-perl
    unicode-script
    unicode-segment
)
for f in "${features[@]}"; do
    echo "=== FEATURE: $f ==="
    # We only run library tests because I couldn't figure out how to easily
    # make doc tests run in 'no_std' mode. In particular, without the Error
    # trait, using '?' in doc tests seems tricky.
    cargo test --no-default-features --lib --features "$f"
done