Vendor dependencies for 0.3.0 release

This commit is contained in:
2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

1
vendor/cesu8/.cargo-checksum.json vendored Normal file
View File

@@ -0,0 +1 @@
{"files":{"COPYRIGHT-RUST.txt":"5ca77347e58205d3b543c04a9c5bdd11d20a9f3108a7b246640edffa999b5f35","Cargo.toml":"78e66dd24c12e0ac858e7524cdd7d51d1eee753c1e9c32e5b915dc9f1d767870","README.md":"4fcc5d9b5db444ad3c33c79aebe18de4fa6351982646da0b78830fbad67f9e8f","src/lib.rs":"402f647c80ccaa86f43a5571cc081aa66777d2a538ce9d9dc52a7e588ef137c9","src/unicode.rs":"66cc902b4dd323cef600890b6cd99186592919b03e55e3535adf0c5b95a8ce45"},"package":"6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"}

422
vendor/cesu8/COPYRIGHT-RUST.txt vendored Normal file
View File

@@ -0,0 +1,422 @@
Short version for non-lawyers:
The Rust Project is dual-licensed under Apache 2.0 and MIT
terms.
Longer version:
The Rust Project is copyright 2014, The Rust Project
Developers (given in the file AUTHORS.txt).
Licensed under the Apache License, Version 2.0
<LICENSE-APACHE or
http://www.apache.org/licenses/LICENSE-2.0> or the MIT
license <LICENSE-MIT or http://opensource.org/licenses/MIT>,
at your option. All files in the project carrying such
notice may not be copied, modified, or distributed except
according to those terms.
The Rust Project includes packages written by third parties.
The following third party packages are included, and carry
their own copyright notices and license terms:
* Two header files that are part of the Valgrind
package. These files are found at src/rt/vg/valgrind.h and
src/rt/vg/memcheck.h, within this distribution. These files
are redistributed under the following terms, as noted in
them:
for src/rt/vg/valgrind.h:
This file is part of Valgrind, a dynamic binary
instrumentation framework.
Copyright (C) 2000-2010 Julian Seward. All rights
reserved.
Redistribution and use in source and binary forms, with
or without modification, are permitted provided that the
following conditions are met:
1. Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
2. The origin of this software must not be
misrepresented; you must not claim that you wrote the
original software. If you use this software in a
product, an acknowledgment in the product
documentation would be appreciated but is not
required.
3. Altered source versions must be plainly marked as
such, and must not be misrepresented as being the
original software.
4. The name of the author may not be used to endorse or
promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
OF SUCH DAMAGE.
for src/rt/vg/memcheck.h:
This file is part of MemCheck, a heavyweight Valgrind
tool for detecting memory errors.
Copyright (C) 2000-2010 Julian Seward. All rights
reserved.
Redistribution and use in source and binary forms, with
or without modification, are permitted provided that the
following conditions are met:
1. Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
2. The origin of this software must not be
misrepresented; you must not claim that you wrote the
original software. If you use this software in a
product, an acknowledgment in the product
documentation would be appreciated but is not
required.
3. Altered source versions must be plainly marked as
such, and must not be misrepresented as being the
original software.
4. The name of the author may not be used to endorse or
promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
OF SUCH DAMAGE.
* The auxiliary file src/etc/pkg/modpath.iss contains a
library routine compiled, by Inno Setup, into the Windows
installer binary. This file is licensed under the LGPL,
version 3, but, in our legal interpretation, this does not
affect the aggregate "collected work" license of the Rust
distribution (MIT/ASL2) nor any other components of it. We
believe that the terms governing distribution of the
binary Windows installer built from modpath.iss are
therefore LGPL, but not the terms governing distribution
of any of the files installed by such an installer (such
as the Rust compiler or runtime libraries themselves).
* The src/rt/miniz.c file, carrying an implementation of
RFC1950/RFC1951 DEFLATE, by Rich Geldreich
<richgel99@gmail.com>. All uses of this file are
permitted by the embedded "unlicense" notice
(effectively: public domain with warranty disclaimer).
* LLVM. Code for this package is found in src/llvm.
Copyright (c) 2003-2013 University of Illinois at
Urbana-Champaign. All rights reserved.
Developed by:
LLVM Team
University of Illinois at Urbana-Champaign
http://llvm.org
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal with the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
* Redistributions of source code must retain the
above copyright notice, this list of conditions
and the following disclaimers.
* Redistributions in binary form must reproduce the
above copyright notice, this list of conditions
and the following disclaimers in the documentation
and/or other materials provided with the
distribution.
* Neither the names of the LLVM Team, University of
Illinois at Urbana-Champaign, nor the names of its
contributors may be used to endorse or promote
products derived from this Software without
specific prior written permission.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE
FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS WITH THE SOFTWARE.
* Additional libraries included in LLVM carry separate
BSD-compatible licenses. See src/llvm/LICENSE.txt for
details.
* compiler-rt, in src/compiler-rt is dual licensed under
LLVM's license and MIT:
Copyright (c) 2009-2014 by the contributors listed in
CREDITS.TXT
All rights reserved.
Developed by:
LLVM Team
University of Illinois at Urbana-Champaign
http://llvm.org
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal with the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
* Redistributions of source code must retain the
above copyright notice, this list of conditions
and the following disclaimers.
* Redistributions in binary form must reproduce the
above copyright notice, this list of conditions
and the following disclaimers in the documentation
and/or other materials provided with the
distribution.
* Neither the names of the LLVM Team, University of
Illinois at Urbana-Champaign, nor the names of its
contributors may be used to endorse or promote
products derived from this Software without
specific prior written permission.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE
FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS WITH THE SOFTWARE.
========================================================
Copyright (c) 2009-2014 by the contributors listed in
CREDITS.TXT
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
* Portions of the FFI code for interacting with the native ABI
is derived from the Clay programming language, which carries
the following license.
Copyright (C) 2008-2010 Tachyon Technologies.
All rights reserved.
Redistribution and use in source and binary forms, with
or without modification, are permitted provided that the
following conditions are met:
1. Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
2. Redistributions in binary form must reproduce the
above copyright notice, this list of conditions and
the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
DEVELOPERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
OF SUCH DAMAGE.
* Hoedown, the markdown parser, under src/rt/hoedown, is
licensed as follows.
Copyright (c) 2008, Natacha Porté
Copyright (c) 2011, Vicent Martí
Copyright (c) 2013, Devin Torres and the Hoedown authors
Permission to use, copy, modify, and distribute this
software for any purpose with or without fee is hereby
granted, provided that the above copyright notice and
this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR
DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR
ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
* libbacktrace, under src/libbacktrace:
Copyright (C) 2012-2014 Free Software Foundation, Inc.
Written by Ian Lance Taylor, Google.
Redistribution and use in source and binary forms, with
or without modification, are permitted provided that the
following conditions are met:
(1) Redistributions of source code must retain the
above copyright notice, this list of conditions and
the following disclaimer.
(2) Redistributions in binary form must reproduce
the above copyright notice, this list of conditions
and the following disclaimer in the documentation
and/or other materials provided with the
distribution.
(3) The name of the author may not be used to
endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
OF SUCH DAMAGE.
* jemalloc, under src/jemalloc:
Copyright (C) 2002-2014 Jason Evans
<jasone@canonware.com>. All rights reserved.
Copyright (C) 2007-2012 Mozilla Foundation.
All rights reserved.
Copyright (C) 2009-2014 Facebook, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice(s),
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice(s),
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S)
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S)
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
OF SUCH DAMAGE.
* Additional copyright may be retained by contributors other
than Mozilla, the Rust Project Developers, or the parties
enumerated in this file. Such copyright can be determined
on a case-by-case basis by examining the author of each
portion of a file in the revision-control commit records
of the project, or by consulting representative comments
claiming copyright ownership for a file.
For example, the text:
"Copyright (c) 2011 Google Inc."
appears in some files, and these files thereby denote
that their author and copyright-holder is Google Inc.
In all such cases, the absence of explicit licensing text
indicates that the contributor chose to license their work
for distribution under identical terms to those Mozilla
has chosen for the collective work, enumerated at the top
of this file. The only difference is the retention of
copyright itself, held by the contributor.

17
vendor/cesu8/Cargo.toml vendored Normal file
View File

@@ -0,0 +1,17 @@
[package]
name = "cesu8"
version = "1.1.0"
authors = ["Eric Kidd <git@randomhacks.net>"]
description = "Convert to and from CESU-8 encoding (similar to UTF-8)"
license = "Apache-2.0/MIT"
readme = "README.md"
repository = "https://github.com/emk/cesu8-rs"
documentation = "http://emk.github.io/cesu8-rs/cesu8/index.html"
[features]
# Allow access to unstable features when being built with a nightly compiler,
# to keep travis-cargo happy and enable access to benchmarks if we want them.
unstable = []

33
vendor/cesu8/README.md vendored Normal file
View File

@@ -0,0 +1,33 @@
# CESU-8 encoder/decoder for Rust
[![Build Status](https://travis-ci.org/emk/cesu8-rs.svg)](https://travis-ci.org/emk/cesu8-rs) [![Latest version](https://img.shields.io/crates/v/cesu8.svg)](https://crates.io/crates/cesu8) [![License](https://img.shields.io/crates/l/cesu8.svg)](https://crates.io/crates/cesu8)
[Documentation][apidoc].
[apidoc]: http://emk.github.io/cesu8-rs/cesu8/index.html
Convert between ordinary UTF-8 and [CESU-8][] encodings.
CESU-8 encodes characters outside the Basic Multilingual Plane as two
UTF-16 surrogate characters, which are then further re-encoded as invalid,
3-byte UTF-8 characters. This means that 4-byte UTF-8 sequences become
6-byte CESU-8 sequences.
**Note that CESU-8 is only intended for internal use within tightly-coupled
systems, and not for data interchange.**
This encoding is sometimes needed when working with Java, Oracle or MySQL,
and when trying to store emoji, hieroglyphs, or other characters on the
Supplementary Multilingual Plane or the Supplementary Ideographic Plane.
[CESU-8]: http://www.unicode.org/reports/tr26/tr26-2.html
## License
Some of this code is adapted from Rust's [`src/libcore/str.rs` file][str.rs].
This code is covered by COPYRIGHT-RUST.txt and copyright by The Rust Project
Developers and individual Rust contributors, as described in that file.
The new code in this project is distributed under the same terms.
[str.rs]: https://github.com/rust-lang/rust/blob/master/src/libcore/str.rs

453
vendor/cesu8/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,453 @@
// Copyright 2012-2014 The Rust Project Developers and Eric Kidd. See the
// COPYRIGHT-RUST.txt file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed except
// according to those terms.
//! A simple library implementing the [CESU-8 compatibility encoding
//! scheme](http://www.unicode.org/reports/tr26/tr26-2.html). This is a
//! non-standard variant of UTF-8 that is used internally by some systems
//! that need to represent UTF-16 data as 8-bit characters. Yes, this is
//! ugly.
//!
//! Use of this encoding is discouraged by the Unicode Consortium. It's OK
//! for working with existing internal APIs, but it should not be used for
//! transmitting or storing data.
//!
//! ```
//! use std::borrow::Cow;
//! use cesu8::{from_cesu8, to_cesu8};
//!
//! // 16-bit Unicode characters are the same in UTF-8 and CESU-8.
//! assert_eq!(Cow::Borrowed("aé日".as_bytes()),
//! to_cesu8("aé日"));
//! assert_eq!(Cow::Borrowed("aé日"),
//! from_cesu8("aé日".as_bytes()).unwrap());
//!
//! // This string is CESU-8 data containing a 6-byte surrogate pair,
//! // which decodes to a 4-byte UTF-8 string.
//! let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
//! assert_eq!(Cow::Borrowed("\u{10401}"),
//! from_cesu8(data).unwrap());
//! ```
//!
//! ### A note about security
//!
//! As a general rule, this library is intended to fail on malformed or
//! unexpected input. CESU-8 is supposed to be an internal-only format,
//! and if we're seeing malformed data, we assume that it's either a bug in
//! somebody's code, or an attacker is trying to improperly encode data to
//! evade security checks.
//!
//! If you have a use case for lossy conversion to UTF-8, or conversion
//! from mixed UTF-8/CESU-8 data, please feel free to submit a pull request
//! for `from_cesu8_lossy_permissive` with appropriate behavior.
//!
//! ### Java and U+0000, and other variants
//!
//! Java uses the CESU-8 encoding as described above, but with one
//! difference: The null character U+0000 is represented as an overlong
//! UTF-8 sequence `C0 80`. This is supported by the `from_java_cesu8` and
//! `to_java_cesu8` methods.
//!
//! ### Surrogate pairs and UTF-8
//!
//! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code
//! points in the range from U+10000 to U+10FFFF. These are 16-bit numbers
//! in the range 0xD800 to 0xDFFF.
//!
//! * 0xD800 to 0xDBFF: First half of surrogate pair. When encoded as
//! CESU-8, these become **1110**1101 **10**100000 **10**000000 to
//! **1110**1101 **10**101111 **10**111111.
//!
//! * 0xDC00 to 0xDFFF: Second half of surrogate pair. These become
//! **1110**1101 **10**110000 **10**000000 to
//! **1110**1101 **10**111111 **10**111111.
//!
//! Wikipedia [explains](http://en.wikipedia.org/wiki/UTF-16) the
//! code point to UTF-16 conversion process:
//!
//! > Consider the encoding of U+10437 (𐐷):
//! >
//! > * Subtract 0x10000 from 0x10437. The result is 0x00437, 0000 0000 0100
//! > 0011 0111.
//! > * Split this into the high 10-bit value and the low 10-bit value:
//! > 0000000001 and 0000110111.
//! > * Add 0xD800 to the high value to form the high surrogate: 0xD800 +
//! > 0x0001 = 0xD801.
//! > * Add 0xDC00 to the low value to form the low surrogate: 0xDC00 +
//! > 0x0037 = 0xDC37.
#![warn(missing_docs)]
use std::borrow::Cow;
use std::error::Error;
use std::fmt;
use std::result::Result;
use std::slice;
use std::str::{from_utf8, from_utf8_unchecked};
use unicode::utf8_char_width;
mod unicode;
/// Selects the six payload bits (`xx_xxxx`) of a UTF-8 continuation byte.
const CONT_MASK: u8 = 0b0011_1111;
/// The `10` tag carried in the top two bits of every continuation byte.
/// The tag bits themselves are selected with `!CONT_MASK`.
const TAG_CONT_U8: u8 = 0b1000_0000;
/// The CESU-8 data could not be decoded as valid UTF-8 data.
///
/// This is a field-less marker type: it reports *that* decoding failed but
/// carries no information about where in the input the problem was found.
#[derive(Clone, Copy, Debug)]
pub struct Cesu8DecodingError;

impl Error for Cesu8DecodingError {
    /// Short, static description of the failure.
    fn description(&self) -> &str { "decoding error" }

    /// A decoding error never wraps another error, so this is always `None`.
    //
    // Spelled `&dyn Error`: the bare `&Error` trait-object syntax is
    // deprecated and rejected outright by newer editions.
    fn cause(&self) -> Option<&dyn Error> { None }
}

impl fmt::Display for Cesu8DecodingError {
    /// Writes the fixed, human-readable failure message.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "could not convert CESU-8 data to UTF-8")
    }
}
/// Selects the flavour of the encoding handled by the internal
/// encode/decode routines.
#[derive(PartialEq, Eq)]
enum Variant {
    /// Plain CESU-8, in which '\0' is stored as itself.
    Standard,
    /// Java's "Modified UTF-8": like CESU-8, except that '\0' is stored
    /// as the two-byte sequence `C0 80` rather than as a raw zero byte.
    Java,
}
/// Decode CESU-8 bytes into a Rust string.
///
/// Input that is already valid UTF-8 is returned as a borrow without
/// allocating; otherwise the data is re-encoded into an owned `String`.
/// An error is returned when the bytes are neither valid UTF-8 nor
/// accepted by the CESU-8 decoder.
///
/// ```
/// use std::borrow::Cow;
/// use cesu8::from_cesu8;
///
/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
/// // and we can convert it without allocating memory.
/// assert_eq!(Cow::Borrowed("aé日"),
///            from_cesu8("aé日".as_bytes()).unwrap());
///
/// // This string is CESU-8 data containing a 6-byte surrogate pair,
/// // which becomes a 4-byte UTF-8 string.
/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
/// assert_eq!(Cow::Borrowed("\u{10401}"),
///            from_cesu8(data).unwrap());
/// ```
pub fn from_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
    let variant = Variant::Standard;
    from_cesu8_internal(bytes, variant)
}
/// Decode Java's modified UTF-8 bytes into a Rust string.
///
/// Input that is already valid UTF-8 is returned as a borrow without
/// allocating; otherwise the data is re-encoded into an owned `String`,
/// with the two-byte sequence `C0 80` decoding to U+0000. An error is
/// returned when the bytes are neither valid UTF-8 nor accepted by the
/// modified UTF-8 decoder.
///
/// ```
/// use std::borrow::Cow;
/// use cesu8::from_java_cesu8;
///
/// // This string is valid as UTF-8 or modified UTF-8, so it doesn't change,
/// // and we can convert it without allocating memory.
/// assert_eq!(Cow::Borrowed("aé日"),
///            from_java_cesu8("aé日".as_bytes()).unwrap());
///
/// // This string is modified UTF-8 data containing a 6-byte surrogate pair,
/// // which becomes a 4-byte UTF-8 string.
/// let data = &[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
/// assert_eq!(Cow::Borrowed("\u{10401}"),
///            from_java_cesu8(data).unwrap());
///
/// // This string is modified UTF-8 data containing null code-points.
/// let data = &[0xC0, 0x80, 0xC0, 0x80];
/// assert_eq!(Cow::Borrowed("\0\0"),
///            from_java_cesu8(data).unwrap());
/// ```
pub fn from_java_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {
    let variant = Variant::Java;
    from_cesu8_internal(bytes, variant)
}
/// Shared decoding routine behind `from_cesu8` and `from_java_cesu8`.
///
/// Bytes that already form valid UTF-8 are borrowed as-is. Anything else is
/// run through `decode_from_iter` into a freshly allocated buffer, and a
/// `Cesu8DecodingError` is returned if that decoder rejects the input.
fn from_cesu8_internal(bytes: &[u8], variant: Variant) ->
    Result<Cow<str>, Cesu8DecodingError>
{
    // Fast path: nothing to re-encode, so hand back a borrow.
    if let Ok(text) = from_utf8(bytes) {
        return Ok(Cow::Borrowed(text));
    }

    // Slow path: re-encode into a buffer sized to the input length.
    let mut buffer = Vec::with_capacity(bytes.len());
    let mut remaining = bytes.iter();
    if !decode_from_iter(&mut buffer, &mut remaining, variant) {
        return Err(Cesu8DecodingError);
    }

    // Checked in debug builds only. It matters that this holds, because
    // Rust assumes every `String` contains valid UTF-8.
    debug_assert!(from_utf8(&buffer[..]).is_ok());
    // SAFETY: relies on `decode_from_iter` having written only well-formed
    // UTF-8 into `buffer`, which the debug assertion above verifies.
    Ok(Cow::Owned(unsafe { String::from_utf8_unchecked(buffer) }))
}
#[test]
fn test_from_cesu8() {
    // The surrogate-encoded character in this input comes from the ICU
    // library's icu/source/test/testdata/conversion.txt test case.
    let encoded = &[0x4D, 0xE6, 0x97, 0xA5, 0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81, 0x7F];
    let decoded = from_cesu8(encoded).unwrap();
    assert_eq!(Cow::Borrowed("M日\u{10401}\u{7F}"), decoded);

    // Test data from the CESU-8 specification used to live here, but
    // working it through by hand produced the wrong answer:
    //
    //   Input: [0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80]
    //   Binary: 11101101 10101110 10000000 11101101 10110000 10000000
    //
    //   0b1101_101110_000000 -> 0xDB80
    //   0b1101_110000_000000 -> 0xDC00
    //
    //   ((0xDB80 - 0xD800) << 10) | (0xDC00 - 0xDC00) -> 0xE0000
    //   0x10000 + 0xE0000 -> 0xF0000
    //
    // The spec claims the result should be 0x10000, not 0xF0000. Because
    // that example cannot be reconciled with the text of the
    // specification, a test character from ICU is used instead.
}
/// Our internal decoder, based on Rust's is_utf8 implementation.
///
/// Reads CESU-8 bytes from `iter` and appends their UTF-8 re-encoding to
/// `decoded`. Characters of up to three bytes are copied through unchanged;
/// a six-byte surrogate pair is rewritten as one four-byte UTF-8 sequence.
/// With `Variant::Java`, a raw zero byte is rejected and `C0 80` decodes to
/// a zero byte.
///
/// Returns `true` when the iterator ends on a character boundary and
/// `false` as soon as malformed input is seen. On failure `decoded` holds
/// whatever was produced before the error.
fn decode_from_iter(
    decoded: &mut Vec<u8>, iter: &mut slice::Iter<u8>, variant: Variant)
    -> bool
{
    // Bail out of the whole function, reporting malformed input.
    macro_rules! err {
        () => { return false }
    }
    // Fetch the next byte; running out of data mid-character is an error.
    macro_rules! next {
        () => {
            match iter.next() {
                Some(a) => *a,
                // We needed data, but there was none: error!
                None => err!()
            }
        }
    }
    // Fetch the next byte and require it to be a `10xxxxxx` continuation.
    macro_rules! next_cont {
        () => {
            {
                let byte = next!();
                if (byte) & !CONT_MASK == TAG_CONT_U8 { byte } else { err!() }
            }
        }
    }

    loop {
        let first = match iter.next() {
            Some(&b) => b,
            // We're at the end of the iterator and a codepoint boundary at
            // the same time, so this string is valid.
            None => return true
        };

        if variant == Variant::Java && first == 0 {
            // Java's modified UTF-8 should never contain \0 directly.
            err!();
        } else if first < 128 {
            // Pass ASCII through directly.
            decoded.push(first);
        } else if first == 0xc0 && variant == Variant::Java {
            // Java's two-byte encoding of the zero byte.
            match next!() {
                0x80 => decoded.push(0),
                _ => err!(),
            }
        } else {
            let w = utf8_char_width(first);
            // Every accepted multi-byte form has a continuation byte in
            // second position, so read it before dispatching on the width.
            let second = next_cont!();
            match w {
                // Two-byte sequences can be used directly.
                2 => decoded.extend_from_slice(&[first, second]),
                3 => {
                    let third = next_cont!();
                    match (first, second) {
                        // These are valid UTF-8, so pass them through.
                        (0xE0        , 0xA0..=0xBF) |
                        (0xE1..=0xEC, 0x80..=0xBF) |
                        (0xED        , 0x80..=0x9F) |
                        (0xEE..=0xEF, 0x80..=0xBF) => {
                            decoded.extend_from_slice(&[first, second, third]);
                        }
                        // First half of a surrogate pair, so decode.
                        (0xED        , 0xA0..=0xAF) => {
                            // The second half must follow immediately:
                            // ED, then a byte in B0..BF, then a
                            // continuation byte.
                            if next!() != 0xED { err!() }
                            let fifth = next_cont!();
                            if fifth < 0xB0 || 0xBF < fifth { err!() }
                            let sixth = next_cont!();
                            let s = dec_surrogates(second, third, fifth, sixth);
                            decoded.extend_from_slice(&s);
                        }
                        _ => err!()
                    }
                }
                // Any other width (invalid lead byte, or a four-byte
                // sequence) is rejected.
                _ => err!()
            }
        }
    }
}
/// Rebuild a 16-bit surrogate value from the two trailing bytes of its
/// three-byte CESU-8 encoding.
///
/// The constant `0xD000` supplies the top four bits; each trailing byte
/// contributes its six payload bits.
fn dec_surrogate(second: u8, third: u8) -> u32 {
    let upper = u32::from(second & CONT_MASK) << 6;
    let lower = u32::from(third & CONT_MASK);
    0xD000u32 | upper | lower
}
/// Turn the trailing bytes of a CESU-8 surrogate pair into the equivalent
/// four-byte UTF-8 sequence. Assumes the input is valid.
///
/// `second`/`third` belong to the first surrogate and `fifth`/`sixth` to
/// the second. Panics if the combined code point falls outside
/// U+10000..=U+10FFFF.
fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
    // Combine both surrogates into a single 32-bit code point.
    let high = dec_surrogate(second, third);
    let low = dec_surrogate(fifth, sixth);
    let code_point = 0x10000 + (((high - 0xD800) << 10) | (low - 0xDC00));
    assert!(0x010000 <= code_point && code_point <= 0x10FFFF);

    // Re-encode as UTF-8:
    //   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    [0b1111_0000u8 | ((code_point >> 18) & 0b0000_0111) as u8,
     TAG_CONT_U8   | ((code_point >> 12) & 0b0011_1111) as u8,
     TAG_CONT_U8   | ((code_point >>  6) & 0b0011_1111) as u8,
     TAG_CONT_U8   | ( code_point        & 0b0011_1111) as u8]
}
/// Encode a Rust `&str` as CESU-8 bytes.
///
/// Text for which `is_valid_cesu8` holds is returned as a borrow of the
/// string's own bytes; otherwise a re-encoded, owned vector is returned.
///
/// ```
/// use std::borrow::Cow;
/// use cesu8::to_cesu8;
///
/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
/// // and we can convert it without allocating memory.
/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_cesu8("aé日"));
///
/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte CESU-8
/// // vector.
/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
///            to_cesu8("\u{10401}"));
/// ```
pub fn to_cesu8(text: &str) -> Cow<[u8]> {
    if !is_valid_cesu8(text) {
        return Cow::Owned(to_cesu8_internal(text, Variant::Standard));
    }
    Cow::Borrowed(text.as_bytes())
}
/// Encode a Rust `&str` as Java's modified UTF-8 bytes.
///
/// Text for which `is_valid_java_cesu8` holds is returned as a borrow of
/// the string's own bytes; otherwise a re-encoded, owned vector is returned.
///
/// ```
/// use std::borrow::Cow;
/// use cesu8::to_java_cesu8;
///
/// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
/// // and we can convert it without allocating memory.
/// assert_eq!(Cow::Borrowed("aé日".as_bytes()), to_java_cesu8("aé日"));
///
/// // This string is a 4-byte UTF-8 string, which becomes a 6-byte modified
/// // UTF-8 vector.
/// assert_eq!(Cow::Borrowed(&[0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81]),
///            to_java_cesu8("\u{10401}"));
///
/// // This string contains null, which becomes 2-byte modified UTF-8 encoding
/// assert_eq!(Cow::Borrowed(&[0xC0, 0x80, 0xC0, 0x80]),
///            to_java_cesu8("\0\0"));
/// ```
pub fn to_java_cesu8(text: &str) -> Cow<[u8]> {
    if !is_valid_java_cesu8(text) {
        return Cow::Owned(to_cesu8_internal(text, Variant::Java));
    }
    Cow::Borrowed(text.as_bytes())
}
/// Re-encode `text` as CESU-8, or as Java's modified UTF-8 when `variant`
/// is `Variant::Java`.
///
/// ASCII and two- or three-byte UTF-8 sequences are copied unchanged. Each
/// four-byte sequence is replaced by a surrogate pair written as two
/// three-byte sequences. For the Java variant a zero byte becomes `C0 80`.
fn to_cesu8_internal(text: &str, variant: Variant) -> Vec<u8> {
    let bytes = text.as_bytes();
    // Reserve the input length plus a quarter as headroom for the sequences
    // that grow. The parentheses are required: `+` binds tighter than `>>`,
    // so the unparenthesised form reserved only half the input length.
    let mut encoded = Vec::with_capacity(bytes.len() + (bytes.len() >> 2));
    let mut i = 0;
    while i < bytes.len() {
        let b = bytes[i];
        if variant == Variant::Java && b == 0 {
            // Java never stores a raw zero byte.
            encoded.push(0xc0);
            encoded.push(0x80);
            i += 1;
        } else if b < 128 {
            // Pass ASCII through quickly.
            encoded.push(b);
            i += 1;
        } else {
            // Figure out how many bytes we need for this character.
            let w = utf8_char_width(b);
            assert!(w <= 4);
            assert!(i + w <= bytes.len());
            if w != 4 {
                // Pass through short UTF-8 sequences unmodified.
                encoded.extend_from_slice(&bytes[i..i + w]);
            } else {
                // Encode 4-byte sequences as 6 bytes.
                // SAFETY: `bytes` comes from a `&str`, so it is valid UTF-8,
                // and `i` only ever advances by whole character widths, so
                // `bytes[i..i + w]` is exactly one complete character.
                let s = unsafe { from_utf8_unchecked(&bytes[i..i + w]) };
                // Offset into the supplementary planes (a 20-bit value).
                let c = s.chars().next().unwrap() as u32 - 0x10000;
                // Upper ten bits form the first surrogate, lower ten bits
                // the second.
                let mut s: [u16; 2] = [0; 2];
                s[0] = ((c >> 10) as u16) | 0xD800;
                s[1] = ((c & 0x3FF) as u16) | 0xDC00;
                encoded.extend_from_slice(&enc_surrogate(s[0]));
                encoded.extend_from_slice(&enc_surrogate(s[1]));
            }
            i += w;
        }
    }
    encoded
}
/// Check whether a Rust string contains valid CESU-8 data.
///
/// Returns `false` as soon as a lead byte of a sequence longer than three
/// bytes is found, and `true` otherwise.
pub fn is_valid_cesu8(text: &str) -> bool {
    // Rust strings are guaranteed to be valid UTF-8, so only the lead
    // bytes need inspecting: continuation bytes are skipped, and every
    // other byte must start a sequence of at most three bytes.
    text.bytes().all(|byte| {
        let is_continuation = (byte & !CONT_MASK) == TAG_CONT_U8;
        is_continuation || utf8_char_width(byte) <= 3
    })
}
/// Check whether a Rust string contains valid Java's modified UTF-8 data.
///
/// On top of the `is_valid_cesu8` requirements, the text must not contain
/// any '\0' character.
pub fn is_valid_java_cesu8(text: &str) -> bool {
    if text.contains('\0') {
        return false;
    }
    is_valid_cesu8(text)
}
#[test]
fn test_valid_cesu8() {
    // (input, valid as CESU-8, valid as Java's modified UTF-8)
    let cases = [
        ("aé日", true, true),
        ("\u{10401}", false, false),
        ("\0\0", true, false),
    ];
    for &(text, standard, java) in cases.iter() {
        assert_eq!(is_valid_cesu8(text), standard);
        assert_eq!(is_valid_java_cesu8(text), java);
    }
}
/// Encode a single surrogate as CESU-8.
///
/// The 16-bit value is laid out in the three-byte pattern
/// `1110xxxx 10xxxxxx 10xxxxxx`. Panics if `surrogate` lies outside
/// 0xD800..=0xDFFF.
fn enc_surrogate(surrogate: u16) -> [u8; 3] {
    assert!(0xD800 <= surrogate && surrogate <= 0xDFFF);
    // Top four bits go in the lead byte, then six bits per trailing byte.
    let lead = 0b1110_0000 | (surrogate >> 12) as u8;
    let middle = TAG_CONT_U8 | ((surrogate >> 6) & 0b0011_1111) as u8;
    let last = TAG_CONT_U8 | (surrogate & 0b0011_1111) as u8;
    [lead, middle, last]
}

37
vendor/cesu8/src/unicode.rs vendored Normal file
View File

@@ -0,0 +1,37 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// (This used to be in the Rust unicode crate, which is now gone, so we'll
// just include it inline.)
// https://tools.ietf.org/html/rfc3629
/// Lookup table indexed by the first byte of a UTF-8 sequence. Each entry
/// is the total length of that sequence in bytes, or 0 when the byte cannot
/// begin a sequence.
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];

/// Given a first byte, determine how many bytes are in this UTF-8 character.
///
/// Returns 0 for bytes that cannot start a character.
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
    let index = usize::from(b);
    usize::from(UTF8_CHAR_WIDTH[index])
}