561 lines
19 KiB
Rust
561 lines
19 KiB
Rust
//! This crate exposes the Unicode `Script` and `Script_Extension`
|
|
//! properties from [UAX #24](http://www.unicode.org/reports/tr24/)
|
|
|
|
#![cfg_attr(not(test), no_std)]
|
|
#![cfg_attr(feature = "bench", feature(test))]
|
|
|
|
mod tables;
|
|
|
|
use core::convert::TryFrom;
|
|
use core::fmt;
|
|
use core::u64;
|
|
pub use tables::script_extensions;
|
|
use tables::{get_script, get_script_extension, NEXT_SCRIPT};
|
|
pub use tables::{Script, UNICODE_VERSION};
|
|
|
|
impl Script {
|
|
/// Get the full name of a script.
|
|
pub fn full_name(self) -> &'static str {
|
|
self.inner_full_name()
|
|
}
|
|
|
|
/// Attempts to parse script name from the provided string.
|
|
/// Returns `None` if the provided string does not represent a valid
|
|
/// script full name.
|
|
pub fn from_full_name(input: &str) -> Option<Self> {
|
|
Self::inner_from_full_name(input)
|
|
}
|
|
|
|
/// Get the four-character short name of a script.
|
|
pub fn short_name(self) -> &'static str {
|
|
self.inner_short_name()
|
|
}
|
|
|
|
/// Attempts to parse script name from the provided string.
|
|
/// Returns `None` if the provided string does not represent a valid
|
|
/// script four-character short name.
|
|
pub fn from_short_name(input: &str) -> Option<Self> {
|
|
Self::inner_from_short_name(input)
|
|
}
|
|
|
|
/// Is this script "Recommended" according to
|
|
/// [UAX #31](www.unicode.org/reports/tr31/#Table_Recommended_Scripts)?
|
|
pub fn is_recommended(self) -> bool {
|
|
use Script::*;
|
|
match self {
|
|
Common | Inherited | Arabic | Armenian | Bengali | Bopomofo | Cyrillic | Devanagari
|
|
| Ethiopic | Georgian | Greek | Gujarati | Gurmukhi | Han | Hangul | Hebrew
|
|
| Hiragana | Kannada | Katakana | Khmer | Lao | Latin | Malayalam | Myanmar | Oriya
|
|
| Sinhala | Tamil | Telugu | Thaana | Thai | Tibetan => true,
|
|
_ => false,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl From<Script> for ScriptExtension {
|
|
fn from(script: Script) -> Self {
|
|
if script == Script::Common {
|
|
ScriptExtension::new_common()
|
|
} else if script == Script::Inherited {
|
|
ScriptExtension::new_inherited()
|
|
} else if script == Script::Unknown {
|
|
ScriptExtension::new_unknown()
|
|
} else {
|
|
let mut first = 0;
|
|
let mut second = 0;
|
|
let mut third = 0;
|
|
let bit = script as u8;
|
|
// Find out which field it's in, and set the appropriate bit there
|
|
if bit < 64 {
|
|
first = 1 << bit as u64;
|
|
} else if bit < 128 {
|
|
// offset by 64 since `bit` is an absolute number,
|
|
// not relative to the chunk
|
|
second = 1 << (bit - 64) as u64;
|
|
} else {
|
|
third = 1 << (bit - 128) as u32;
|
|
}
|
|
ScriptExtension::new(first, second, third)
|
|
}
|
|
}
|
|
}
|
|
|
|
impl TryFrom<ScriptExtension> for Script {
|
|
type Error = ();
|
|
fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
|
|
if ext.is_common_or_inherited() {
|
|
if ext.common {
|
|
Ok(Script::Common)
|
|
} else {
|
|
Ok(Script::Inherited)
|
|
}
|
|
} else if ext.is_empty() {
|
|
Ok(Script::Unknown)
|
|
} else {
|
|
// filled elements will have set ones
|
|
let fo = ext.first.count_ones();
|
|
let so = ext.second.count_ones();
|
|
let to = ext.third.count_ones();
|
|
// only one bit set, in the first chunk
|
|
if fo == 1 && so == 0 && to == 0 {
|
|
// use trailing_zeroes() to figure out which bit it is
|
|
Ok(Script::for_integer(ext.first.trailing_zeros() as u8))
|
|
// only one bit set, in the second chunk
|
|
} else if fo == 0 && so == 1 && to == 0 {
|
|
Ok(Script::for_integer(64 + ext.second.trailing_zeros() as u8))
|
|
// only one bit set, in the third chunk
|
|
} else if fo == 0 && so == 0 && to == 1 {
|
|
Ok(Script::for_integer(128 + ext.third.trailing_zeros() as u8))
|
|
} else {
|
|
Err(())
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Default for Script {
|
|
fn default() -> Self {
|
|
Script::Common
|
|
}
|
|
}
|
|
|
|
impl From<char> for Script {
|
|
fn from(o: char) -> Self {
|
|
o.script()
|
|
}
|
|
}
|
|
|
|
impl fmt::Display for Script {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
write!(f, "{}", self.full_name())
|
|
}
|
|
}
|
|
|
|
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
|
|
#[non_exhaustive]
|
|
/// A value for the `Script_Extension` property
|
|
///
|
|
/// [`ScriptExtension`] is one or more [`Script`]
|
|
///
|
|
/// This is essentially an optimized version of `Vec<Script>` that uses bitfields
|
|
pub struct ScriptExtension {
|
|
// A bitset for the first 64 scripts
|
|
first: u64,
|
|
// A bitset for the scripts 65-128
|
|
second: u64,
|
|
// A bitset for scripts after 128
|
|
third: u64,
|
|
// Both Common and Inherited are represented by all used bits being set,
|
|
// this flag lets us distinguish the two.
|
|
common: bool,
|
|
}
|
|
|
|
impl ScriptExtension {
|
|
// We don't use the complete u64 of `third`, so the "all" value is not just u32::MAX
|
|
// Instead, we take the number of the next (unused) script bit, subtract 128 to bring
|
|
// it in the range of `third`, create a u64 with just that bit set, and subtract 1
|
|
// to create one with all the lower bits set.
|
|
const THIRD_MAX: u64 = ((1 << (NEXT_SCRIPT - 128)) - 1);
|
|
|
|
pub(crate) const fn new(first: u64, second: u64, third: u64) -> Self {
|
|
ScriptExtension {
|
|
first,
|
|
second,
|
|
third,
|
|
common: false,
|
|
}
|
|
}
|
|
|
|
pub(crate) const fn new_common() -> Self {
|
|
ScriptExtension {
|
|
first: u64::MAX,
|
|
second: u64::MAX,
|
|
third: Self::THIRD_MAX,
|
|
common: true,
|
|
}
|
|
}
|
|
|
|
pub(crate) const fn new_inherited() -> Self {
|
|
ScriptExtension {
|
|
first: u64::MAX,
|
|
second: u64::MAX,
|
|
third: Self::THIRD_MAX,
|
|
common: false,
|
|
}
|
|
}
|
|
|
|
pub(crate) const fn new_unknown() -> Self {
|
|
ScriptExtension {
|
|
first: 0,
|
|
second: 0,
|
|
third: 0,
|
|
common: false,
|
|
}
|
|
}
|
|
|
|
const fn is_common_or_inherited(self) -> bool {
|
|
(self.first == u64::MAX) & (self.second == u64::MAX) & (self.third == Self::THIRD_MAX)
|
|
}
|
|
|
|
/// Checks if the script extension is Common
|
|
pub const fn is_common(self) -> bool {
|
|
self.is_common_or_inherited() & self.common
|
|
}
|
|
|
|
/// Checks if the script extension is Inherited
|
|
pub const fn is_inherited(self) -> bool {
|
|
self.is_common_or_inherited() & !self.common
|
|
}
|
|
|
|
/// Checks if the script extension is empty (unknown)
|
|
pub const fn is_empty(self) -> bool {
|
|
(self.first == 0) & (self.second == 0) & (self.third == 0)
|
|
}
|
|
|
|
/// Returns the number of scripts in the script extension
|
|
pub fn len(self) -> usize {
|
|
if self.is_common_or_inherited() {
|
|
1
|
|
} else {
|
|
(self.first.count_ones() + self.second.count_ones() + self.third.count_ones()) as usize
|
|
}
|
|
}
|
|
|
|
/// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things
|
|
/// do not intersect. This is equivalent to [`ScriptExtension::intersection`] but it stores the result
|
|
/// in `self`
|
|
///
|
|
/// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
|
|
/// everything, the intersection of `Common` and `Inherited` is `Inherited`
|
|
pub fn intersect_with(&mut self, other: Self) {
|
|
*self = self.intersection(other)
|
|
}
|
|
|
|
/// Find the intersection between two ScriptExtensions. Returns Unknown if things
|
|
/// do not intersect.
|
|
///
|
|
/// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
|
|
/// everything, the intersection of `Common` and `Inherited` is `Inherited`
|
|
pub const fn intersection(self, other: Self) -> Self {
|
|
let first = self.first & other.first;
|
|
let second = self.second & other.second;
|
|
let third = self.third & other.third;
|
|
let common = self.common & other.common;
|
|
ScriptExtension {
|
|
first,
|
|
second,
|
|
third,
|
|
common,
|
|
}
|
|
}
|
|
|
|
/// Find the union between two ScriptExtensions.
|
|
///
|
|
/// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
|
|
/// everything, the union of `Common` and `Inherited` is `Common`
|
|
pub const fn union(self, other: Self) -> Self {
|
|
let first = self.first | other.first;
|
|
let second = self.second | other.second;
|
|
let third = self.third | other.third;
|
|
let common = self.common | other.common;
|
|
ScriptExtension {
|
|
first,
|
|
second,
|
|
third,
|
|
common,
|
|
}
|
|
}
|
|
|
|
/// Check if this ScriptExtension contains the given script
|
|
///
|
|
/// Should be used with specific scripts only, this will
|
|
/// return `true` if `self` is not `Unknown` and `script` is
|
|
/// `Common` or `Inherited`
|
|
pub fn contains_script(self, script: Script) -> bool {
|
|
!self.intersection(script.into()).is_empty()
|
|
}
|
|
|
|
/// Get the intersection of script extensions of all characters
|
|
/// in a string.
|
|
pub fn for_str(x: &str) -> Self {
|
|
let mut ext = ScriptExtension::default();
|
|
for ch in x.chars() {
|
|
ext.intersect_with(ch.into());
|
|
}
|
|
ext
|
|
}
|
|
|
|
/// Iterate over the scripts in this script extension
|
|
///
|
|
/// Will never yield Script::Unknown
|
|
pub fn iter(self) -> ScriptIterator {
|
|
ScriptIterator { ext: self }
|
|
}
|
|
}
|
|
|
|
impl Default for ScriptExtension {
|
|
fn default() -> Self {
|
|
ScriptExtension::new_common()
|
|
}
|
|
}
|
|
|
|
impl From<char> for ScriptExtension {
|
|
fn from(o: char) -> Self {
|
|
o.script_extension()
|
|
}
|
|
}
|
|
|
|
impl From<&'_ str> for ScriptExtension {
|
|
fn from(o: &'_ str) -> Self {
|
|
Self::for_str(o)
|
|
}
|
|
}
|
|
|
|
impl fmt::Debug for ScriptExtension {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
write!(f, "ScriptExtension(")?;
|
|
fmt::Display::fmt(self, f)?;
|
|
write!(f, ")")
|
|
}
|
|
}
|
|
|
|
impl fmt::Display for ScriptExtension {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
if self.is_common() {
|
|
write!(f, "Common")?;
|
|
} else if self.is_inherited() {
|
|
write!(f, "Inherited")?;
|
|
} else if self.is_empty() {
|
|
write!(f, "Unknown")?;
|
|
} else {
|
|
let mut first = true;
|
|
for script in self.iter() {
|
|
if !first {
|
|
write!(f, " + ")?;
|
|
first = false;
|
|
}
|
|
script.full_name().fmt(f)?;
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// Extension trait on `char` for calculating script properties
|
|
pub trait UnicodeScript {
|
|
/// Get the script for a given character
|
|
fn script(&self) -> Script;
|
|
/// Get the Script_Extension for a given character
|
|
fn script_extension(&self) -> ScriptExtension;
|
|
}
|
|
|
|
impl UnicodeScript for char {
|
|
fn script(&self) -> Script {
|
|
get_script(*self).unwrap_or(Script::Unknown)
|
|
}
|
|
|
|
fn script_extension(&self) -> ScriptExtension {
|
|
get_script_extension(*self).unwrap_or_else(|| self.script().into())
|
|
}
|
|
}
|
|
|
|
/// Iterator over scripts in a [ScriptExtension].
|
|
///
|
|
/// Can be obtained ia [ScriptExtension::iter()]
|
|
pub struct ScriptIterator {
|
|
ext: ScriptExtension,
|
|
}
|
|
|
|
impl Iterator for ScriptIterator {
|
|
type Item = Script;
|
|
|
|
fn next(&mut self) -> Option<Script> {
|
|
if self.ext.is_common_or_inherited() {
|
|
let common = self.ext.common;
|
|
self.ext = ScriptExtension::new_unknown();
|
|
if common {
|
|
Some(Script::Common)
|
|
} else {
|
|
Some(Script::Inherited)
|
|
}
|
|
// Are there bits left in the first chunk?
|
|
} else if self.ext.first != 0 {
|
|
// Find the next bit
|
|
let bit = self.ext.first.trailing_zeros();
|
|
// unset just that bit
|
|
self.ext.first &= !(1 << bit);
|
|
Some(Script::for_integer(bit as u8))
|
|
// Are there bits left in the second chunk?
|
|
} else if self.ext.second != 0 {
|
|
let bit = self.ext.second.trailing_zeros();
|
|
self.ext.second &= !(1 << bit);
|
|
Some(Script::for_integer(64 + bit as u8))
|
|
// Are there bits left in the third chunk?
|
|
} else if self.ext.third != 0 {
|
|
let bit = self.ext.third.trailing_zeros();
|
|
self.ext.third &= !(1 << bit);
|
|
Some(Script::for_integer(128 + bit as u8))
|
|
} else {
|
|
// Script::Unknown
|
|
None
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use crate::*;
|
|
use std::collections::HashSet;
|
|
use std::convert::TryInto;
|
|
|
|
#[cfg(feature = "bench")]
|
|
use test::bench::Bencher;
|
|
#[cfg(feature = "bench")]
|
|
extern crate test;
|
|
|
|
#[test]
|
|
fn test_conversion() {
|
|
let mut seen_scripts = HashSet::new();
|
|
let mut seen_exts = HashSet::new();
|
|
for bit in 0..NEXT_SCRIPT {
|
|
let script = Script::for_integer(bit);
|
|
let ext = script.into();
|
|
if seen_scripts.contains(&script) {
|
|
panic!("Found script {:?} twice!", script)
|
|
}
|
|
if seen_exts.contains(&ext) {
|
|
panic!("Found extension {:?} twice!", ext)
|
|
}
|
|
seen_scripts.insert(script);
|
|
seen_exts.insert(ext);
|
|
assert_eq!(script as u8, bit);
|
|
assert!(!ScriptExtension::new_common().intersection(ext).is_empty());
|
|
assert!(!ScriptExtension::new_inherited()
|
|
.intersection(ext)
|
|
.is_empty());
|
|
assert!(ScriptExtension::new_unknown().intersection(ext).is_empty());
|
|
assert_eq!(ext.iter().collect::<Vec<_>>(), vec![script]);
|
|
assert_eq!(Ok(script), ext.try_into());
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_specific() {
|
|
let s = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.";
|
|
let ext = ScriptExtension::for_str(s);
|
|
assert_eq!(ext, script_extensions::DEVA);
|
|
println!(
|
|
"{:?}",
|
|
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
|
|
);
|
|
println!(
|
|
"{:?}",
|
|
ext.intersection(
|
|
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
|
|
)
|
|
);
|
|
assert!(!ext
|
|
.intersection(script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH)
|
|
.is_empty());
|
|
|
|
let u = ext.union(Script::Dogra.into());
|
|
assert_eq!(
|
|
u.intersection(
|
|
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
|
|
),
|
|
u
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_specific_ext() {
|
|
let ext = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH;
|
|
|
|
let all: HashSet<_> = ext.iter().collect();
|
|
|
|
for bit in 0..NEXT_SCRIPT {
|
|
let script = Script::for_integer(bit);
|
|
|
|
if all.contains(&script) {
|
|
assert!(ext.contains_script(script))
|
|
} else {
|
|
assert!(!ext.contains_script(script))
|
|
}
|
|
}
|
|
|
|
assert!(ext.contains_script(Script::Devanagari));
|
|
assert!(ext.contains_script(Script::Dogra));
|
|
assert!(ext.contains_script(Script::Gujarati));
|
|
assert!(ext.contains_script(Script::Gurmukhi));
|
|
assert!(ext.contains_script(Script::Khojki));
|
|
assert!(ext.contains_script(Script::Kaithi));
|
|
assert!(ext.contains_script(Script::Mahajani));
|
|
assert!(ext.contains_script(Script::Modi));
|
|
assert!(ext.contains_script(Script::Khudawadi));
|
|
assert!(ext.contains_script(Script::Takri));
|
|
assert!(ext.contains_script(Script::Tirhuta));
|
|
|
|
let scr: Result<Script, _> = ext.try_into();
|
|
assert!(scr.is_err());
|
|
}
|
|
|
|
#[cfg(feature = "bench")]
|
|
#[bench]
|
|
fn bench_script_intersection(b: &mut Bencher) {
|
|
b.iter(|| {
|
|
let script = test::black_box(Script::Devanagari);
|
|
let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
|
|
test::black_box(ext.intersection(script.into()));
|
|
})
|
|
}
|
|
|
|
#[cfg(feature = "bench")]
|
|
#[bench]
|
|
fn bench_ext_to_script(b: &mut Bencher) {
|
|
let ext: ScriptExtension = Script::Devanagari.into();
|
|
b.iter(|| {
|
|
let ext = test::black_box(ext);
|
|
let script: Result<Script, _> = ext.try_into();
|
|
let _ = test::black_box(script);
|
|
})
|
|
}
|
|
|
|
#[cfg(feature = "bench")]
|
|
#[bench]
|
|
fn bench_script_to_ext(b: &mut Bencher) {
|
|
b.iter(|| {
|
|
let script = test::black_box(Script::Devanagari);
|
|
let ext: ScriptExtension = script.into();
|
|
test::black_box(ext);
|
|
})
|
|
}
|
|
|
|
#[cfg(feature = "bench")]
|
|
#[bench]
|
|
fn bench_ext_intersection(b: &mut Bencher) {
|
|
b.iter(|| {
|
|
let e1 = test::black_box(script_extensions::ARAB_GARA_NKOO_ROHG_SYRC_THAA_YEZI);
|
|
let e2 = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
|
|
test::black_box(e2.intersection(e1));
|
|
})
|
|
}
|
|
|
|
#[cfg(feature = "bench")]
|
|
#[bench]
|
|
fn bench_to_vec(b: &mut Bencher) {
|
|
b.iter(|| {
|
|
let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
|
|
test::black_box(ext.iter().collect::<Vec<_>>());
|
|
})
|
|
}
|
|
|
|
#[cfg(feature = "bench")]
|
|
#[bench]
|
|
fn bench_string_ext(b: &mut Bencher) {
|
|
b.iter(|| {
|
|
let s = test::black_box("सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.");
|
|
test::black_box(ScriptExtension::for_str(s));
|
|
})
|
|
}
|
|
}
|