Vendor dependencies for 0.3.0 release

This commit is contained in:
2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

8
vendor/rustybuzz/scripts/README.md vendored Normal file
View File

@@ -0,0 +1,8 @@
## Usage
```sh
python3 gen-universal-table.py > ../src/hb/ot_shape_complex_use_table.rs
python3 ./gen-vowel-constraints.py > ../src/complex/vowel_constraints.rs
rustfmt ../src/complex/vowel_constraints.rs
```

View File

@@ -0,0 +1,179 @@
#!/usr/bin/env python3
# Based on harfbuzz/src/gen-arabic-table.py
#
# Generates the Rust Arabic joining-type table for rustybuzz from the
# Unicode 14.0.0 character database files.
import os
import urllib.request

# UCD files this generator needs; downloaded once and cached in the CWD.
DEPENDENCIES = [
    "ArabicShaping.txt",
    "UnicodeData.txt",
    "Blocks.txt",
]
for dep in DEPENDENCIES:
    if not os.path.exists(dep):
        urllib.request.urlretrieve("https://unicode.org/Public/14.0.0/ucd/" + dep, dep)

files = [open(x, encoding="utf-8") for x in DEPENDENCIES]

# First two header lines of ArabicShaping.txt and Blocks.txt, kept for reference;
# UnicodeData.txt has no header line.
headers = [
    [files[0].readline(), files[0].readline()],
    [files[2].readline(), files[2].readline()],
    ["UnicodeData.txt does not have a header."],
]

# Skip the rest of ArabicShaping.txt's preamble, which ends with a rule of '#'s.
while files[0].readline().find("##################") < 0:
    pass

# Codepoint -> Unicode block name; filled in by read_blocks().
blocks = {}
def read_blocks(f):
    """Populate the global `blocks` map (codepoint -> block name) from Blocks.txt."""
    global blocks
    for raw in f:
        # Drop a trailing '#' comment, if present.
        hash_pos = raw.find("#")
        text = raw[:hash_pos] if hash_pos >= 0 else raw
        parts = [p.strip() for p in text.split(";")]
        if len(parts) < 2:
            # Blank or comment-only line.
            continue
        span = parts[0].split("..")
        lo = int(span[0], 16)
        hi = int(span[1], 16) if len(span) > 1 else lo
        name = parts[1]
        for cp in range(lo, hi + 1):
            blocks[cp] = name
def print_joining_table(f):
    """Emit the Rust joining-type table and lookup fn from ArabicShaping.txt.

    Reads the open ArabicShaping.txt handle `f`, then prints (to stdout):
    `JOINING_TABLE`, one `JOINING_OFFSET_0X....` const per packed range, and
    the generated `joining_type()` Rust function. Relies on the module-level
    `blocks` map being populated first (see read_blocks).
    """
    # Codepoint -> "JOINING_TYPE_x" / "JOINING_GROUP_x" value name.
    values = {}
    for line in f:
        if line[0] == "#":
            continue
        fields = [x.strip() for x in line.split(";")]
        if len(fields) == 1:
            continue
        u = int(fields[0], 16)
        # ALAPH and DALATH RISH are the only joining groups the shaper needs.
        if fields[3] in ["ALAPH", "DALATH RISH"]:
            value = "JOINING_GROUP_" + fields[3].replace(" ", "_")
        else:
            value = "JOINING_TYPE_" + fields[2]
        values[u] = value

    # Build unique one/two-letter aliases from the trailing name components.
    short_value = {}
    for value in sorted(set([v for v in values.values()] + ["JOINING_TYPE_X"])):
        short = "".join(x[0] for x in value.split("_")[2:])
        assert short not in short_value.values()
        short_value[value] = short

    uu = sorted(values.keys())
    num = len(values)
    all_blocks = set([blocks[u] for u in uu])

    # Pack codepoints into ranges, tolerating gaps up to 1 + 16*5 entries.
    last = -100000
    ranges = []
    for u in uu:
        if u - last <= 1 + 16 * 5:
            ranges[-1][-1] = u
        else:
            ranges.append([u, u])
        last = u

    print("#[rustfmt::skip]")
    print("pub const JOINING_TABLE: &[hb_arabic_joining_type_t] = &[")
    last_block = None
    offset = 0
    join_offsets = []
    for start, end in ranges:
        # Each range's starting index inside the flat table.
        join_offsets.append(
            "const JOINING_OFFSET_0X%04X: usize = %d;" % (start, offset)
        )
        for u in range(start, end + 1):
            block = blocks.get(u, last_block)
            value = values.get(u, "JOINING_TYPE_X")
            if block != last_block or u == start:
                if u != start:
                    print()
                if block in all_blocks:
                    print("\n /* %s */" % block)
                else:
                    # Gap entries carried inside a packed range.
                    print("\n /* FILLER */")
                last_block = block
                if u % 32 != 0:
                    print()
                    print(" /* %04X */" % (u // 32 * 32), " " * (u % 32), end="")
            if u % 32 == 0:
                print()
                print(" /* %04X */ " % u, end="")
            val = short_value[value]
            # NOTE(review): 'C' is folded into 'D' (JOIN_CAUSING treated as
            # DUAL_JOINING) — presumably intentional, mirroring harfbuzz.
            if val == "C":
                val = "D"
            print("%s," % val, end="")
        print()
        offset += end - start + 1
    print("];")
    print()
    for offset in join_offsets:
        print(offset)

    # Emit the page-dispatched lookup function.
    page_bits = 12
    print()
    print("pub fn joining_type(u: char) -> hb_arabic_joining_type_t {")
    print(" let u = u as u32;")
    print(" match u >> %d {" % page_bits)
    pages = set(
        [u >> page_bits for u in [s for s, e in ranges] + [e for s, e in ranges]]
    )
    for p in sorted(pages):
        print(" 0x%0X => {" % p)
        for start, end in ranges:
            if p not in [start >> page_bits, end >> page_bits]:
                continue
            offset = "JOINING_OFFSET_0X%04X" % start
            print(" if (0x%04X..=0x%04X).contains(&u) {" % (start, end))
            print(
                " return JOINING_TABLE[u as usize - 0x%04X + %s]"
                % (start, offset)
            )
            print(" }")
        print(" }")
    print(" _ => {}")
    print(" }")
    print()
    print(" X")
    print("}")
print()
print("// WARNING: this file was generated by scripts/gen-arabic-table.py")
print()
# Import the category aliases used by the generated table.
print(
    "use super::ot_shape_complex_arabic::hb_arabic_joining_type_t::{\n"
    " self, GroupAlaph as A, GroupDalathRish as DR, D, L, R, T, U, X,\n"
    "};"
)
print()
# Populate the block map from Blocks.txt, then emit the table from
# ArabicShaping.txt.
read_blocks(files[2])
print_joining_table(files[0])

251
vendor/rustybuzz/scripts/gen-indic-table.py vendored Executable file
View File

@@ -0,0 +1,251 @@
#!/usr/bin/env python3
# Based on harfbuzz/src/gen-indic-table.py
import io
import os
import urllib.request
DEPENDENCIES = [
'IndicSyllabicCategory.txt',
'IndicPositionalCategory.txt',
'Blocks.txt',
]
for dep in DEPENDENCIES:
if not os.path.exists(dep):
urllib.request.urlretrieve('https://unicode.org/Public/14.0.0/ucd/' + dep, dep)
ALLOWED_SINGLES = [0x00A0, 0x25CC]
ALLOWED_BLOCKS = [
'Basic Latin',
'Latin-1 Supplement',
'Devanagari',
'Bengali',
'Gurmukhi',
'Gujarati',
'Oriya',
'Tamil',
'Telugu',
'Kannada',
'Malayalam',
'Sinhala',
'Myanmar',
'Khmer',
'Vedic Extensions',
'General Punctuation',
'Superscripts and Subscripts',
'Devanagari Extended',
'Myanmar Extended-B',
'Myanmar Extended-A',
]
files = [io.open(x, encoding='utf-8') for x in DEPENDENCIES]
headers = [[f.readline() for i in range(2)] for f in files]
data = [{} for f in files]
values = [{} for f in files]
for i, f in enumerate(files):
for line in f:
j = line.find('#')
if j >= 0:
line = line[:j]
fields = [x.strip() for x in line.split(';')]
if len(fields) == 1:
continue
uu = fields[0].split('..')
start = int(uu[0], 16)
if len(uu) == 1:
end = start
else:
end = int(uu[1], 16)
t = fields[1]
for u in range(start, end + 1):
data[i][u] = t
values[i][t] = values[i].get(t, 0) + end - start + 1
# Merge data into one dict:
defaults = ('Other', 'Not_Applicable', 'No_Block')
for i, v in enumerate(defaults):
values[i][v] = values[i].get(v, 0) + 1
combined = {}
for i, d in enumerate(data):
for u, v in d.items():
if i == 2 and u not in combined:
continue
if u not in combined:
combined[u] = list(defaults)
combined[u][i] = v
combined = {k: v for k, v in combined.items() if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS}
data = combined
del combined
num = len(data)
# Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out
singles = {}
for u in ALLOWED_SINGLES:
singles[u] = data[u]
del data[u]
print('// WARNING: this file was generated by scripts/gen-indic-table.py')
print()
print('#![allow(non_camel_case_types)]')
print('#![allow(unused_imports)]')
print()
print('use super::ot_shape_complex_indic::{MatraCategory, SyllabicCategory};')
# Shorten values
short = [{
'Bindu': 'Bi',
'Cantillation_Mark': 'Ca',
'Joiner': 'ZWJ',
'Non_Joiner': 'ZWNJ',
'Number': 'Nd',
'Visarga': 'Vs',
'Vowel': 'Vo',
'Vowel_Dependent': 'M',
'Consonant_Prefixed': 'CPrf',
'Other': 'x',
}, {
'Not_Applicable': 'x',
}]
all_shorts = [{}, {}]
# Add some of the values, to make them more readable, and to avoid duplicates
for i in range(2):
for v, s in short[i].items():
all_shorts[i][s] = v
what = ['SyllabicCategory', 'MatraCategory']
what_short = ['ISC', 'IMC']
cat_defs = []
for i in range(2):
vv = sorted(values[i].keys())
for v in vv:
v_no_and = v.replace('_And_', '_')
if v in short[i]:
s = short[i][v]
else:
s = ''.join([c for c in v_no_and if ord('A') <= ord(c) <= ord('Z')])
if s in all_shorts[i]:
raise Exception('Duplicate short value alias', v, all_shorts[i][s])
all_shorts[i][s] = v
short[i][v] = s
cat_defs.append((what_short[i] + '_' + s, what[i] + '::' + v.replace('_', ''), str(values[i][v]), v))
maxlen_s = max([len(c[0]) for c in cat_defs])
maxlen_l = max([len(c[1]) for c in cat_defs])
maxlen_n = max([len(c[2]) for c in cat_defs])
for s in what_short:
print()
for c in [c for c in cat_defs if s in c[0]]:
print('use %s as %s;' % (c[1].ljust(maxlen_l), c[0]))
print()
print()
total = 0
used = 0
last_block = None
def print_block(block, start, end, data):
    """Emit one Unicode block's rows of (ISC, IMC) pairs, 8 codepoints per row.

    Updates the module-level `total`/`used` occupancy counters and
    `last_block` (so repeated calls for the same block print one header).
    `start`/`end` must be aligned to 8-codepoint rows.
    """
    global total, used, last_block
    if block and block != last_block:
        print()
        print()
        print(' /* %s */' % block)
    num = 0
    assert start % 8 == 0
    assert (end + 1) % 8 == 0
    for u in range(start, end + 1):
        if u % 8 == 0:
            print()
            print(' /* %04X */' % u, end='')
        if u in data:
            num += 1
        # Missing codepoints fall back to the module-level `defaults` tuple.
        d = data.get(u, defaults)
        print('%16s' % ('(ISC_%s,IMC_%s),' % (short[0][d[0]], short[1][d[1]])), end='')
    total += end - start + 1
    used += num
    if block:
        last_block = block
uu = sorted(data.keys())
last = -100000
num = 0
offset = 0
starts = []
ends = []
print('#[rustfmt::skip]')
print('const TABLE: &[(SyllabicCategory, MatraCategory)] = &[')
offsets = []
for u in uu:
if u <= last:
continue
block = data[u][2]
start = u // 8 * 8
end = start + 1
while end in uu and block == data[end][2]:
end += 1
end = (end - 1) // 8 * 8 + 7
if start != last + 1:
if start - last <= 1 + 16 * 3:
print_block(None, last + 1, start - 1, data)
last = start - 1
else:
if last >= 0:
ends.append(last + 1)
offset += ends[-1] - starts[-1]
# print()
# print()
offsets.append('const OFFSET_0X%04X: usize = %d;' % (start, offset))
starts.append(start)
print_block(block, start, end, data)
last = end
ends.append(last + 1)
offset += ends[-1] - starts[-1]
print()
print()
occupancy = used * 100. / total
page_bits = 12
print('];')
print()
for o in offsets:
print(o)
print()
print('#[rustfmt::skip]')
print('pub fn get_categories(u: u32) -> (SyllabicCategory, MatraCategory) {')
print(' match u >> %d {' % page_bits)
pages = set([u >> page_bits for u in starts + ends + list(singles.keys())])
for p in sorted(pages):
print(' 0x%0X => {' % p)
for u, d in singles.items():
if p != u >> page_bits: continue
print(' if u == 0x%04X { return (ISC_%s, IMC_%s); }' % (u, short[0][d[0]], short[1][d[1]]))
for (start, end) in zip(starts, ends):
if p not in [start >> page_bits, end >> page_bits]: continue
offset = 'OFFSET_0X%04X' % start
print(' if (0x%04X..=0x%04X).contains(&u) { return TABLE[u as usize - 0x%04X + %s]; }' % (start, end - 1, start, offset))
print(' }')
print(' _ => {}')
print(' }')
print()
print(' (ISC_x, IMC_x)')
print('}')
# Maintain at least 30% occupancy in the table */
if occupancy < 30:
raise Exception('Table too sparse, please investigate: ', occupancy)

204
vendor/rustybuzz/scripts/gen-shaping-tests.py vendored Executable file
View File

@@ -0,0 +1,204 @@
#!/usr/bin/env python3
import os
import shutil
import sys
import subprocess
from pathlib import Path
# There is no sane way to test them.
IGNORE_TESTS = [
'macos.tests',
'coretext.tests',
'directwrite.tests',
'uniscribe.tests',
]
IGNORE_TEST_CASES = [
# aots tests
# in-house tests
# --shaper=fallback is not supported.
'simple_002',
# Not possible to implement without shaping.
'arabic_fallback_shaping_001',
# `dfont` is not supported.
'collections_001',
'collections_002',
'collections_003',
# Face index out of bounds. ttf-parser doesn't permit this.
'collections_006',
# no `hhea` table.
'indic_decompose_001',
# ttf-parser doesn't support phantom points
'variations_003',
# Resource exhaustion tests with large outputs
'morx_34_001',
'morx_36_001',
# ttf-parser uses different rounding, not a bug
'fallback_positioning_001',
]
def update_relative_path(tests_name, fontfile):
    """Rewrite harfbuzz's `../fonts/...` font path as a repo-root-relative one."""
    stripped = fontfile.replace('../fonts/', '')
    return 'tests/fonts/%s/%s' % (tests_name, stripped)
# Converts `U+0041,U+0078` or `0041,0078` into `\u{0041}\u{0078}`
def convert_unicodes(unicodes):
    """Turn `U+0041,U+0078` (or bare hex `0041,0078`) into Rust escapes
    `\\u{0041}\\u{0078}`, inserting a line continuation every 10 codepoints."""
    pieces = []
    for i, code in enumerate(unicodes.split(',')):
        if i > 0 and i % 10 == 0:
            pieces.append('\\\n ')
        if code.startswith("U+"):
            code = code[2:]
        pieces.append('\\u{%s}' % code)
    return ''.join(pieces)
def convert_test(hb_dir, hb_shape_exe, tests_name, file_name, idx, data, fonts):
    """Convert one harfbuzz `.tests` line into a Rust `#[test]` function.

    Runs the provided `hb-shape` binary to obtain the expected glyph string,
    records the font used into `fonts`, and returns the Rust test source
    (or '' for skipped files/cases).
    """
    if file_name == 'emoji-clusters.tests':
        return ''  # There are a lot of these; let's skip them

    fontfile, options, unicodes, glyphs_expected = data.split(';')
    fontfile_rs = update_relative_path(tests_name, fontfile)
    unicodes_rs = convert_unicodes(unicodes)

    # e.g. `arabic-fallback.tests`, idx 3 -> `arabic_fallback_003`.
    test_name = file_name.replace(
        '.tests', '').replace('-', '_') + f'_{idx:03d}'
    test_name = test_name.lower()

    # Strip options rustybuzz doesn't support / always implies.
    options = options.replace('--shaper=ot', '')
    options = options.replace(
        ' --font-funcs=ft', '').replace('--font-funcs=ft', '')
    options = options.replace(
        ' --font-funcs=ot', '').replace('--font-funcs=ot', '')
    # we don't support font scaling
    options = options.replace('--font-size=1000', '')
    options = options.strip()

    # We have to actually run hb-shape instead of using predefined results,
    # because hb sometimes stores results for freetype and not for embedded OpenType
    # engine, which we are using.
    # Right now, it only affects 'text-rendering-tests'.
    if len(options) != 0:
        options_list = options.split(' ')
    else:
        options_list = []
    options_list.insert(0, str(hb_shape_exe))
    # Force OT functions, since this is the only one we support in rustybuzz.
    options_list.append('--font-funcs=ot')
    abs_font_path = hb_dir.joinpath('test/shape/data') \
        .joinpath(tests_name) \
        .joinpath('tests') \
        .joinpath(fontfile)
    options_list.append(str(abs_font_path))
    options_list.append(f'--unicodes={unicodes}')  # no need to escape it

    glyphs_expected = subprocess.run(options_list, check=True, stdout=subprocess.PIPE) \
        .stdout.decode()
    glyphs_expected = glyphs_expected.strip()[1:-1]  # remove leading and trailing whitespaces and `[..]`
    glyphs_expected = glyphs_expected.replace('|', '|\\\n ')

    # Escape quotes so the options survive inside a Rust string literal.
    options = options.replace('"', '\\"')
    options = options.replace(' --single-par', '')

    fonts.add(os.path.split(fontfile_rs)[1])

    if test_name in IGNORE_TEST_CASES:
        return ''

    return (f'#[test]\n'
            f'fn {test_name}() {{\n'
            f' assert_eq!(\n'
            f' shape(\n'
            f' "{fontfile_rs}",\n'
            f' "{unicodes_rs}",\n'
            f' "{options}",\n'
            f' ),\n'
            f' "{glyphs_expected}"\n'
            f' );\n'
            f'}}\n'
            '\n')
def convert(hb_dir, hb_shape_exe, tests_dir, tests_name):
    """Convert every `.tests` file in `tests_dir` into one Rust test module.

    Writes `../tests/shaping/<tests_name>.rs` and returns the set of font
    file names the generated tests reference.
    """
    files = sorted(os.listdir(tests_dir))
    files = [f for f in files if f.endswith('.tests')]

    fonts = set()
    rust_code = ('// WARNING: this file was generated by ../scripts/gen-shaping-tests.py\n'
                 '\n'
                 'use crate::shape;\n'
                 '\n')
    for file in files:
        if file in IGNORE_TESTS:
            continue
        with open(tests_dir / file, 'r') as f:
            for idx, test in enumerate(f.read().splitlines()):
                # skip comments and empty lines
                if test.startswith('#') or len(test) == 0:
                    continue
                rust_code += convert_test(hb_dir, hb_shape_exe, tests_name,
                                          file, idx + 1, test, fonts)

    tests_name_snake_case = tests_name.replace('-', '_')
    with open(f'../tests/shaping/{tests_name_snake_case}.rs', 'w') as f:
        f.write(rust_code)
    return fonts
# --- Script entry point -------------------------------------------------
if len(sys.argv) != 2:
    print('Usage: gen-shaping-tests.py /path/to/harfbuzz-src')
    exit(1)

hb_dir = Path(sys.argv[1])
assert hb_dir.exists()

# Check that harfbuzz was built.
hb_shape_exe = hb_dir.joinpath('builddir/util/hb-shape')
if not hb_shape_exe.exists():
    print('Build harfbuzz first using:')
    print(' meson builddir')
    print(' ninja -Cbuilddir')
    exit(1)

used_fonts = []
font_files = []
test_dir_names = ['aots', 'in-house', 'text-rendering-tests']
for test_dir_name in test_dir_names:
    tests_dir = hb_dir / f'test/shape/data/{test_dir_name}/tests'
    dir_used_fonts = convert(hb_dir, hb_shape_exe, tests_dir, test_dir_name)
    for filename in dir_used_fonts:
        # BUG FIX: copy the font named by the loop variable. The previous code
        # used the literal path `.../fonts/(unknown)`, which ignored `filename`
        # entirely and could never resolve to an existing file.
        shutil.copy(
            hb_dir / f'test/shape/data/{test_dir_name}/fonts/{filename}',
            f'../tests/fonts/{test_dir_name}')
    used_fonts += dir_used_fonts
    font_files += os.listdir(hb_dir /
                             f'test/shape/data/{test_dir_name}/fonts')

# Check for unused fonts. Just for debugging.
# unused_fonts = sorted(list(set(font_files).difference(used_fonts)))
# if len(unused_fonts) != 0:
#     print('Unused fonts:')
#     for font in unused_fonts:
#         print(font)

1093
vendor/rustybuzz/scripts/gen-tag-table.py vendored Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env python3
# Generates Rust `match` arms for the Extended_Pictographic property from
# the Unicode emoji-data.txt file (Emoji 12.1).
import urllib.request
import os

URL = 'https://www.unicode.org/Public/emoji/12.1/emoji-data.txt'
FILE_NAME = 'emoji-data.txt'

if not os.path.exists(FILE_NAME):
    urllib.request.urlretrieve(URL, FILE_NAME)

is_ext_pict_section = False
ranges = []  # list of [start_hex, end_hex] string pairs
with open(FILE_NAME) as f:
    for line in f:
        line = line.strip()
        # Skip everything up to and including the Extended_Pictographic marker.
        if not is_ext_pict_section:
            if line == '# All omitted code points have Extended_Pictographic=No':
                is_ext_pict_section = True
            continue
        if not line:
            continue
        if line.startswith('#'):
            continue
        # FIX: renamed `range` -> `span`; the original shadowed the builtin.
        span, _ = line.split(' ; ')
        span = span.strip()
        if '..' in span:
            start, end = span.split('..')
            ranges.append([start, end])
        else:
            ranges.append([span, span])

# Merge adjacent ranges.
idx = 0
while idx < len(ranges) - 1:
    if int(ranges[idx][1], 16) + 1 == int(ranges[idx + 1][0], 16):
        ranges[idx][1] = ranges[idx + 1][1]
        del ranges[idx + 1]
    else:
        idx += 1  # FIX: dropped stray trailing semicolon

for span in ranges:
    if span[0] == span[1]:
        # FIX: the single-codepoint arm passed a dead second argument to format().
        print('0x{} => true,'.format(span[0]))
    else:
        print('0x{}..=0x{} => true,'.format(span[0], span[1]))

View File

@@ -0,0 +1,81 @@
#!/usr/bin/env python3
import urllib.request
import os
URL = 'https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt'
FILE_NAME = 'UnicodeData.txt'
def hex_to_char_rs(c):
    """Format a hex codepoint string as a Rust char literal: 0041 -> '\\u{0041}'."""
    return "'\\u{" + c + "}'"
if not os.path.exists(FILE_NAME):
    urllib.request.urlretrieve(URL, FILE_NAME)

print('// WARNING: this file was generated by ../scripts/gen-unicode-norm-table.py')
print()
print('//! This module provides Unicode tables for canonical (de)composition.')
print('//!')
print('//! The current implementation is not the fastest one. Just good enough.')
print()
print('#[allow(dead_code)]')
print('pub const UNICODE_VERSION: (u8, u8, u8) = (14, 0, 0);')
print()
print('// Rust support `Option<char>` layout optimization, so it will take only 4 bytes.')
print('pub const DECOMPOSITION_TABLE: &[(char, char, Option<char>)] = &[')

# Pairs [first, second, composed] collected while printing decompositions.
compose_data = []
with open(FILE_NAME) as f:
    for line in f:
        parts = line.split(';')
        if len(parts[5]) == 0:
            continue
        # Skip codepoints with compatibility formatting tags
        # since we care only about canonical mapping.
        if parts[5][0] == '<':
            continue
        # Print the decomposition table as is, since `UnicodeData` is already sorted.
        c = parts[0]
        mapping = parts[5].split(' ')
        if len(mapping) == 2:
            print(f" ({hex_to_char_rs(c)}, {hex_to_char_rs(mapping[0])}, Some({hex_to_char_rs(mapping[1])})),")
            # Remember only codepoints that should be decomposed into two codepoints.
            compose_data.append([mapping[0], mapping[1], c])
        elif len(mapping) == 1:
            print(f' ({hex_to_char_rs(c)}, {hex_to_char_rs(mapping[0])}, None),')
        else:
            # BUG FIX: `raise 'invalid unicode data'` raised a plain string,
            # which is a TypeError in Python 3; raise a real exception.
            raise ValueError('invalid unicode data')
print('];')
print()
print('// The first value is `a << 32 | b`.')
print('// Sorted by the first value.')
print('pub const COMPOSITION_TABLE: &[(u64, char)] = &[')

pairs = []
for mapping in compose_data:
    # Pack the decomposition pair into one u64 search key.
    needle = int(mapping[0], 16) << 32 | int(mapping[1], 16)
    pairs.append((needle, mapping[2]))
pairs.sort(key=lambda x: x[0])

# Make sure that needles are unique.
needles = set()
for pair in pairs:
    needles.add(pair[0])
assert len(pairs) == len(needles)

for pair in pairs:
    print(f' ({pair[0]}, {hex_to_char_rs(pair[1])}),')
print('];')

View File

@@ -0,0 +1,584 @@
#!/usr/bin/env python3
# Based on harfbuzz/src/gen-use-table.py
import io
import os
import urllib.request
DISABLED_SCRIPTS = {
'Arabic',
'Lao',
'Samaritan',
'Syriac',
'Thai',
}
files = ['IndicSyllabicCategory.txt', 'IndicPositionalCategory.txt', 'ArabicShaping.txt',
'DerivedCoreProperties.txt', 'UnicodeData.txt', 'Blocks.txt', 'Scripts.txt',
'ms-use/IndicSyllabicCategory-Additional.txt', 'ms-use/IndicPositionalCategory-Additional.txt']
for f in files:
if not os.path.exists(f):
urllib.request.urlretrieve(
'https://unicode.org/Public/14.0.0/ucd/' + f, f)
files = [io.open(x, encoding='utf-8') for x in files]
headers = [[f.readline() for i in range(2)]
for j, f in enumerate(files) if j != 2]
for j in range(7, 9):
for line in files[j]:
line = line.rstrip()
if not line:
break
headers[j - 1].append(line)
headers.append(["UnicodeData.txt does not have a header."])
unicode_data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate(files):
for line in f:
j = line.find('#')
if j >= 0:
line = line[:j]
fields = [x.strip() for x in line.split(';')]
if len(fields) == 1:
continue
uu = fields[0].split('..')
start = int(uu[0], 16)
if len(uu) == 1:
end = start
else:
end = int(uu[1], 16)
t = fields[1 if i not in [2, 4] else 2]
if i == 2:
t = 'jt_' + t
elif i == 3 and t != 'Default_Ignorable_Code_Point':
continue
elif i == 7 and t == 'Consonant_Final_Modifier':
# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
t = 'Syllable_Modifier'
elif i == 8 and t == 'NA':
t = 'Not_Applicable'
i0 = i if i < 7 else i - 7
for u in range(start, end + 1):
unicode_data[i0][u] = t
values[i0][t] = values[i0].get(t, 0) + end - start + 1
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')
# Merge data into one dict:
for i,v in enumerate (defaults):
values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i,d in enumerate (unicode_data):
for u,v in d.items ():
if not u in combined:
if i >= 4:
continue
combined[u] = list (defaults)
combined[u][i] = v
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
property_names = [
# General_Category
'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
# Indic_Syllabic_Category
'Other',
'Bindu',
'Visarga',
'Avagraha',
'Nukta',
'Virama',
'Pure_Killer',
'Invisible_Stacker',
'Vowel_Independent',
'Vowel_Dependent',
'Vowel',
'Consonant_Placeholder',
'Consonant',
'Consonant_Dead',
'Consonant_With_Stacker',
'Consonant_Prefixed',
'Consonant_Preceding_Repha',
'Consonant_Succeeding_Repha',
'Consonant_Subjoined',
'Consonant_Medial',
'Consonant_Final',
'Consonant_Head_Letter',
'Consonant_Initial_Postfixed',
'Modifying_Letter',
'Tone_Letter',
'Tone_Mark',
'Gemination_Mark',
'Cantillation_Mark',
'Register_Shifter',
'Syllable_Modifier',
'Consonant_Killer',
'Non_Joiner',
'Joiner',
'Number_Joiner',
'Number',
'Brahmi_Joining_Number',
'Hieroglyph',
'Hieroglyph_Joiner',
'Hieroglyph_Segment_Begin',
'Hieroglyph_Segment_End',
# Indic_Positional_Category
'Not_Applicable',
'Right',
'Left',
'Visual_Order_Left',
'Left_And_Right',
'Top',
'Bottom',
'Top_And_Bottom',
'Top_And_Bottom_And_Left',
'Top_And_Right',
'Top_And_Left',
'Top_And_Left_And_Right',
'Bottom_And_Left',
'Bottom_And_Right',
'Top_And_Bottom_And_Right',
'Overstruck',
# Joining_Type
'jt_C',
'jt_D',
'jt_L',
'jt_R',
'jt_T',
'jt_U',
'jt_X',
]
class PropertyValue(object):
    """Symbolic name for a Unicode property value.

    Compares equal to plain strings with the same name as well as to other
    PropertyValue instances, and hashes like its name, so instances and
    strings are interchangeable as set members and dict keys.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __eq__(self, other):
        if isinstance(other, str):
            return self.name == other
        return self.name == other.name

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash(str(self))
property_values = {}
for name in property_names:
value = PropertyValue(name)
assert value not in property_values
assert value not in globals()
property_values[name] = value
globals().update(property_values)
def is_BASE(U, UISC, UDI, UGC, AJT):
return (UISC in [Number, Consonant, Consonant_Head_Letter,
Tone_Letter,
Vowel_Independent,
] or
# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
(UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
Consonant_Subjoined, Vowel, Vowel_Dependent]))
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
if UISC == Consonant_Placeholder:
return True
return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
def is_CGJ(U, UISC, UDI, UGC, AJT):
# Also includes VARIATION_SELECTOR and ZWJ
return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
return ((UISC == Consonant_Final and UGC != Lo) or
UISC == Consonant_Succeeding_Repha)
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
return (UISC == Consonant_Medial and UGC != Lo or
UISC == Consonant_Initial_Postfixed)
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
return (UISC in [Nukta, Gemination_Mark, Consonant_Killer] and
not is_SYM_MOD(U, UISC, UDI, UGC, AJT))
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
return UISC == Consonant_Subjoined and UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
return UISC == Virama
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
return UISC == Hieroglyph_Segment_Begin
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
return UISC == Hieroglyph_Segment_End
def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
# Split off of HALANT
return (UISC == Invisible_Stacker
and not is_SAKOT(U, UISC, UDI, UGC, AJT)
)
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
# Also includes BASE_IND, and SYM
return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
and not is_BASE(U, UISC, UDI, UGC, AJT)
and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
and not is_CGJ(U, UISC, UDI, UGC, AJT)
and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
)
def is_REPHA(U, UISC, UDI, UGC, AJT):
return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
def is_SAKOT(U, UISC, UDI, UGC, AJT):
# Split off of HALANT
return U == 0x1A60
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
def is_VOWEL(U, UISC, UDI, UGC, AJT):
return (UISC == Pure_Killer or
UGC != Lo and UISC in [Vowel, Vowel_Dependent])
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
UGC != Lo and UISC == Bindu)
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
# Also includes Rsv
return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
and UISC == Other
and not is_CGJ(U, UISC, UDI, UGC, AJT)
) or UGC == Cn
use_mapping = {
'B': is_BASE,
'N': is_BASE_NUM,
'GB': is_BASE_OTHER,
'CGJ': is_CGJ,
'F': is_CONS_FINAL,
'FM': is_CONS_FINAL_MOD,
'M': is_CONS_MED,
'CM': is_CONS_MOD,
'SUB': is_CONS_SUB,
'CS': is_CONS_WITH_STACKER,
'H': is_HALANT,
'HN': is_HALANT_NUM,
'IS': is_INVISIBLE_STACKER,
'G': is_HIEROGLYPH,
'J': is_HIEROGLYPH_JOINER,
'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
'SE': is_HIEROGLYPH_SEGMENT_END,
'ZWNJ': is_ZWNJ,
'O': is_OTHER,
'R': is_REPHA,
'SK': is_SAKOT,
'SM': is_SYM_MOD,
'V': is_VOWEL,
'VM': is_VOWEL_MOD,
'WJ': is_Word_Joiner,
}
use_positions = {
'F': {
'ABV': [Top],
'BLW': [Bottom],
'PST': [Right],
},
'M': {
'ABV': [Top],
'BLW': [Bottom, Bottom_And_Left, Bottom_And_Right],
'PST': [Right],
'PRE': [Left, Top_And_Bottom_And_Left],
},
'CM': {
'ABV': [Top],
'BLW': [Bottom, Overstruck],
},
'V': {
'ABV': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
'BLW': [Bottom, Overstruck, Bottom_And_Right],
'PST': [Right],
'PRE': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
},
'VM': {
'ABV': [Top],
'BLW': [Bottom, Overstruck],
'PST': [Right],
'PRE': [Left],
},
'SM': {
'ABV': [Top],
'BLW': [Bottom],
},
'H': None,
'IS': None,
'B': None,
'FM': {
'ABV': [Top],
'BLW': [Bottom],
'PST': [Not_Applicable],
},
'R': None,
'SUB': None,
}
def map_to_use(data):
    """Assign every codepoint its USE category (plus position suffix) and block.

    `data` maps codepoint -> (UISC, UIPC, AJT, UDI, UGC, UBlock, script)
    as built from the merged UCD files; returns codepoint -> (USE, UBlock).
    Raises AssertionError if a codepoint matches zero or several categories.
    """
    out = {}
    items = use_mapping.items()
    for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items():
        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
        if 0x1CE2 <= U <= 0x1CE8:
            UISC = Cantillation_Mark

        # Tibetan:
        # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
        if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F:
            UISC = Vowel_Dependent

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
        if 0x1BF2 <= U <= 0x1BF3:
            UISC = Nukta
            UIPC = Bottom

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED:
            UISC = Tone_Mark

        # Exactly one is_* predicate from use_mapping must accept the codepoint.
        values = [k for k, v in items if v(U, UISC, UDI, UGC, AJT)]
        assert len(values) == 1, "%s %s %s %s %s %s" % (
            hex(U), UISC, UDI, UGC, AJT, values)
        USE = values[0]

        # Resolve Indic_Positional_Category

        # TODO: These should die, but have UIPC in Unicode 13.0.0
        if U in [0x953, 0x954]:
            UIPC = Not_Applicable
        # TODO: These are not in USE's override list that we have, nor are they in Unicode 13.0.0
        if 0xA926 <= U <= 0xA92A:
            UIPC = Top
        # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
        # and https://github.com/harfbuzz/harfbuzz/issues/1631
        if U in [0x11302, 0x11303, 0x114C1]:
            UIPC = Top
        if 0x1CF8 <= U <= 0x1CF9:
            UIPC = Top
        # TODO: https://github.com/harfbuzz/harfbuzz/issues/3550
        if U == 0x10A38: UIPC = Bottom
        # TODO: https://github.com/harfbuzz/harfbuzz/pull/982
        # also https://github.com/harfbuzz/harfbuzz/issues/1012
        if 0x1112A <= U <= 0x1112B:
            UIPC = Top
        if 0x11131 <= U <= 0x11132:
            UIPC = Top

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or U == 0x0F7F or
                USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)

        # Append the positional suffix (ABV/BLW/PST/PRE) where the category
        # distinguishes positions.
        pos_mapping = use_positions.get(USE, None)
        if pos_mapping:
            values = [k for k, v in pos_mapping.items() if v and UIPC in v]
            assert len(values) == 1, "%s %s %s %s %s %s %s %s" % (
                hex(U), UIPC, USE, UISC, UDI, UGC, AJT, values)
            USE = USE + values[0]

        out[U] = (USE, UBlock)
    return out
use_data = map_to_use(combined)
print('// WARNING: this file was generated by scripts/gen-universal-table.py')
print()
print('use super::hb_glyph_info_t;')
print('use super::ot_shape_complex_use::{category::*, Category};')
print('use unicode_properties::GeneralCategory;')
total = 0
used = 0
last_block = None
def print_block(block, start, end, use_data):
    """Emit one Unicode block's rows of USE categories, 16 codepoints per row.

    Updates the module-level `total`/`used` occupancy counters and
    `last_block`. `start`/`end` must be aligned to 8-codepoint rows.
    """
    global total, used, last_block
    if block and block != last_block:
        print()
        print()
        print(' /* %s */' % block)
        # Pad a partial first row so category columns stay aligned.
        if start % 16:
            print(' ' * (20 + (start % 16 * 6)), end='')
    num = 0
    assert start % 8 == 0
    assert (end + 1) % 8 == 0
    for u in range(start, end + 1):
        if u % 16 == 0:
            print()
            print(' /* %04X */' % u, end='')
        if u in use_data:
            num += 1
        d = use_data.get(u)
        if d is not None:
            d = d[0]
        elif u in unicode_data[4]:
            # Assigned in UnicodeData.txt but not mapped -> Other.
            d = 'O'
        else:
            # Unassigned codepoints -> Word_Joiner-ish placeholder.
            d = 'WJ'
        print("%6s," % d, end='')
    total += end - start + 1
    used += num
    if block:
        last_block = block
uu = sorted(use_data.keys())
last = -100000
num = 0
offset = 0
starts = []
ends = []
print()
print('#[rustfmt::skip]')
print('const USE_TABLE: &[Category] = &[')
offsets = []
for u in uu:
if u <= last:
continue
if use_data[u][0] == 'O':
continue
block = use_data[u][1]
start = u // 8 * 8
end = start + 1
while end in uu and block == use_data[end][1]:
end += 1
end = (end - 1) // 8 * 8 + 7
if start != last + 1:
if start - last <= 1 + 16 * 3:
print_block(None, last + 1, start - 1, use_data)
last = start - 1
else:
if last >= 0:
ends.append(last + 1)
offset += ends[-1] - starts[-1]
offsets.append('const USE_OFFSET_0X%04X: usize = %d;' %
(start, offset))
starts.append(start)
print_block(block, start, end, use_data)
last = end
ends.append(last + 1)
offset += ends[-1] - starts[-1]
print()
print()
occupancy = used * 100. / total
page_bits = 12
print('];')
print()
for o in offsets:
print(o)
print()
print('#[rustfmt::skip]')
print('pub fn get_category(info: &hb_glyph_info_t) -> Category {')
print(' let u = info.glyph_id;')
print(' match u >> %d {' % page_bits)
pages = set([u >> page_bits for u in starts + ends])
for p in sorted(pages):
print(' 0x%0X => {' % p)
for (start, end) in zip(starts, ends):
if p not in [start >> page_bits, end >> page_bits]:
continue
offset = 'USE_OFFSET_0X%04X' % start
print(' if (0x%04X..=0x%04X).contains(&u) { return USE_TABLE[u as usize - 0x%04X + %s]; }' % (
start, end - 1, start, offset))
print(' }')
print(' _ => {}')
print(' }')
print()
print(' if crate::hb::ot_layout::_hb_glyph_info_get_general_category(info) == GeneralCategory::Unassigned {')
print(' return WJ;')
print(' }')
print()
print(' O')
print('}')
# Maintain at least 50% occupancy in the table */
if occupancy < 50:
raise Exception('Table too sparse, please investigate: ', occupancy)

View File

@@ -0,0 +1,192 @@
#!/usr/bin/env python3
"""
Generator of the function to prohibit certain vowel sequences.
It creates ``preprocess_text_vowel_constraints``, which inserts dotted
circles into sequences prohibited by the USE script development spec.
Based on harfbuzz/src/gen-vowel-constraints.py
"""
import collections
import io
import os
import urllib.request
# Fetch Scripts.txt (Unicode 14.0.0) on first run; later runs reuse the
# cached local copy.
if not os.path.exists('Scripts.txt'):
    urllib.request.urlretrieve('https://unicode.org/Public/14.0.0/ucd/Scripts.txt', 'Scripts.txt')
with io.open('Scripts.txt', encoding='utf-8') as f:
    scripts_header = [f.readline() for i in range(2)]
    # scripts: code point -> script name.
    # script_order: script name -> start of the first range it was seen in;
    # used later to give the generated match arms a stable order.
    scripts = {}
    script_order = {}
    for line in f:
        # Strip the trailing comment, if any; blank/comment-only lines then
        # split into a single empty field and are skipped.
        j = line.find('#')
        if j >= 0:
            line = line[:j]
        fields = [x.strip() for x in line.split(';')]
        if len(fields) == 1:
            continue
        # First field is either a single code point or a XXXX..YYYY range.
        uu = fields[0].split('..')
        start = int(uu[0], 16)
        if len(uu) == 1:
            end = start
        else:
            end = int(uu[1], 16)
        script = fields[1]
        for u in range(start, end + 1):
            scripts[u] = script
        if script not in script_order:
            script_order[script] = start
class ConstraintSet:
    """A set of prohibited code point sequences.

    Stores one or more prohibited sequences and renders them as the body
    of generated Rust matching code via ``__str__``.

    Args:
        constraint (List[int]): A prohibited code point sequence.
    """

    def __init__(self, constraint):
        # Either a list or a dictionary. As a list of code points, it
        # represents a prohibited code point sequence. As a dictionary,
        # it represents a set of prohibited sequences, where each item
        # represents the set of prohibited sequences starting with the
        # key (a code point) concatenated with any of the values
        # (ConstraintSets).
        self._c = constraint

    def add(self, constraint):
        """Add a constraint to this set."""
        if not constraint:
            return
        first = constraint[0]
        rest = constraint[1:]
        if isinstance(self._c, list):
            if constraint == self._c[:len(constraint)]:
                # The new sequence is a prefix of the stored one: the
                # shorter sequence subsumes the longer.
                self._c = constraint
            elif self._c != constraint[:len(self._c)]:
                # The sequences diverge: switch to the dict (trie) form so
                # both can be kept.
                self._c = {self._c[0]: ConstraintSet(self._c[1:])}
        if isinstance(self._c, dict):
            if first in self._c:
                self._c[first].add(rest)
            else:
                self._c[first] = ConstraintSet(rest)

    def __str__(self, index=0, depth=4):
        """Render this set as generated Rust matching code.

        Args:
            index (int): Buffer offset of the code point tested at this
                level of the trie.
            depth (int): Nominal indentation depth; only threaded through
                recursive calls here (the output is re-indented by rustfmt).
        """
        s = []
        if isinstance(self._c, list):
            if len(self._c) == 0:
                # Empty tail: the full sequence already matched upstream.
                assert index == 2, 'Cannot use `matched` for this constraint; the general case has not been implemented'
                s.append('matched = true;\n')
            elif len(self._c) == 1:
                assert index == 1, 'Cannot use `matched` for this constraint; the general case has not been implemented'
                s.append('matched = 0x{:04X} == buffer.cur({}).glyph_id;\n'.format(next(
                    iter(self._c)), index))
            else:
                # General case: emit an `if` chain testing every remaining
                # code point, then skip the matched glyphs and insert the
                # dotted circle.
                s.append('if 0x{:04X} == buffer.cur({}).glyph_id &&\n'.format(self._c[0], index))
                if index:
                    s.append('buffer.idx + {} < buffer.len &&\n'.format(index + 1))
                for i, cp in enumerate(self._c[1:], start=1):
                    s.append('0x{:04X} == buffer.cur({}).glyph_id{}\n'.format(
                        cp, index + i, '' if i == len(self._c) - 1 else ' &&'))
                s.append('{\n')
                for _ in range(index + 1):
                    s.append('buffer.next_glyph();\n')
                s.append('output_dotted_circle(buffer);\n')
                s.append('}\n')
        else:
            s.append('match buffer.cur({}).glyph_id {{\n'.format(index))
            # Group code points whose continuations render identically so
            # they can share a single match arm.
            cases = collections.defaultdict(set)
            for first, rest in sorted(self._c.items()):
                cases[rest.__str__(index + 1, depth + 2)].add(first)
            for ii, (body, labels) in enumerate(sorted(cases.items(), key=lambda b_ls: sorted(b_ls[1])[0])):
                for i, cp in enumerate(sorted(labels)):
                    if i == len(labels) - 1:
                        s.append(' 0x{:04X} => {{ {}'.format(cp, '\n' if i % 4 == 3 else ''))
                    else:
                        s.append(' 0x{:04X} | {}'.format(cp, '\n' if i % 4 == 3 else ''))
                s.append(body)
                s.append('}')
                if ii == len(cases) - 1:
                    s.append('_ => {}')
            s.append('}\n')
        return ''.join(s)
# Build one ConstraintSet per script from the prohibited sequences listed
# in ms-use/IndicShapingInvalidCluster.txt; each sequence is filed under
# the script of its first code point.
constraints = {}
with io.open('ms-use/IndicShapingInvalidCluster.txt', encoding='utf-8') as f:
    constraints_header = []
    while True:
        line = f.readline().strip()
        if line == '#':
            # A bare '#' line terminates the file header.
            break
        constraints_header.append(line)
    for line in f:
        # Strip trailing comments; the data is space-separated hex code
        # points before the first ';'.
        j = line.find('#')
        if j >= 0:
            line = line[:j]
        constraint = [int(cp, 16) for cp in line.split(';')[0].split()]
        if not constraint:
            continue
        assert 2 <= len(constraint), 'Prohibited sequence is too short: {}'.format(constraint)
        script = scripts[constraint[0]]
        if script in constraints:
            constraints[script].add(constraint)
        else:
            constraints[script] = ConstraintSet(constraint)
    assert constraints, 'No constraints found'
# Emit the generated Rust source: helper functions, then one match arm per
# script that walks the buffer and inserts U+25CC DOTTED CIRCLE into
# prohibited vowel sequences.  The exact text printed here IS the output
# file, so every string below must stay verbatim.
print('// WARNING: this file was generated by scripts/gen-vowel-constraints.py')
print()
print('use super::buffer::hb_buffer_t;')
print('use super::ot_layout::*;')
print('use super::script;')
print('use crate::BufferFlags;')
print()
print('fn output_dotted_circle(buffer: &mut hb_buffer_t) {')
print('    buffer.output_glyph(0x25CC);')
print('    {')
print('        let out_idx = buffer.out_len - 1;')
print('        _hb_glyph_info_reset_continuation(&mut buffer.out_info_mut()[out_idx]);')
print('    }')
print('}')
print()
print('fn output_with_dotted_circle(buffer: &mut hb_buffer_t) {')
print('    output_dotted_circle(buffer);')
print('    buffer.next_glyph();')
print('}')
print()
print('pub fn preprocess_text_vowel_constraints(buffer: &mut hb_buffer_t) {')
print('    if buffer.flags.contains(BufferFlags::DO_NOT_INSERT_DOTTED_CIRCLE) {')
print('        return;')
print('    }')
print()
print('    // UGLY UGLY UGLY business of adding dotted-circle in the middle of')
print('    // vowel-sequences that look like another vowel. Data for each script')
print('    // collected from the USE script development spec.')
print('    //')
print('    // https://github.com/harfbuzz/harfbuzz/issues/1019')
print('    buffer.clear_output();')
print('    match buffer.script {')
# Arms are ordered by each script's first code point (script_order).
# NOTE: the loop target rebinds the module-level `constraints` name to each
# script's ConstraintSet; harmless here since this is its last use.
for script, constraints in sorted(constraints.items(), key=lambda s_c: script_order[s_c[0]]):
    print('        Some(script::{}) => {{'.format(script.upper()))
    print('            buffer.idx = 0;')
    print('            while buffer.idx + 1 < buffer.len {')
    print('                #[allow(unused_mut)]')
    print('                let mut matched = false;')
    print(str(constraints), end='')
    print('                buffer.next_glyph();')
    print('                if matched { output_with_dotted_circle(buffer); }')
    print('            }')
    print('        }')
    print()
print('        _ => {}')
print('    }')
print('    buffer.sync();')
print('}')
print()

21
vendor/rustybuzz/scripts/ms-use/COPYING vendored Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) Microsoft Corporation.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,109 @@
# Override values For Indic_Positional_Category
# Not derivable
# Initial version based on Unicode 7.0 by Andrew Glass 2014-03-17
# Updated for Unicode 10.0 by Andrew Glass 2017-07-25
# Amended for Unicode 10.0 by Andrew Glass 2018-09-21
# Updated for L2/19-083 by Andrew Glass 2019-05-06
# Updated for Unicode 12.1 by Andrew Glass 2019-05-30
# Updated for Unicode 13.0 by Andrew Glass 2020-07-28
# Updated for Unicode 14.0 by Andrew Glass 2021-09-28
# ================================================
# ================================================
# OVERRIDES TO ASSIGNED VALUES
# ================================================
# ================================================
# Indic_Positional_Category=Bottom
0F72 ; Bottom # Mn TIBETAN VOWEL SIGN I # Not really below, but need to override to fit into Universal model
0F7A..0F7D ; Bottom # Mn [4] TIBETAN VOWEL SIGN E..TIBETAN VOWEL SIGN OO # Not really below, but need to override to fit into Universal model
0F80 ; Bottom # Mn TIBETAN VOWEL SIGN REVERSED I # Not really below, but need to override to fit into Universal model
A9BF ; Bottom # Mc JAVANESE CONSONANT SIGN CAKRA
11127..11129 ; Bottom # Mn [3] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN II
1112D ; Bottom # Mn CHAKMA VOWEL SIGN AI
11130 ; Bottom # Mn CHAKMA VOWEL SIGN OI
# ================================================
# Indic_Positional_Category=Left
1C29 ; Left # Mc LEPCHA VOWEL SIGN OO # Reduced from Top_And_Left
# ================================================
# Indic_Positional_Category=Right
A9BE ; Right # Mc JAVANESE CONSONANT SIGN PENGKAL # Reduced from Bottom_And_Right
10A0C ; Right # Mn KHAROSHTHI VOWEL LENGTH MARK # Follows vowels and precedes vowel modifiers
11942 ; Right # Mc DIVES AKURU MEDIAL RA # Reduced from Bottom_And_Right
# ================================================
# Indic_Positional_Category=Top
0F74 ; Top # Mn TIBETAN VOWEL SIGN U # Not really above, but need to override to fit into Universal model
1A18 ; Top # Mn BUGINESE VOWEL SIGN U # Workaround to allow below to occur before above by treating all below marks as above
AA35   ; Top # Mn       CHAM CONSONANT SIGN
# ================================================
# Indic_Positional_Category=Top_And_Right
0E33 ; Top_And_Right # Lo THAI CHARACTER SARA AM # IMC has Right, which seems to be a mistake.
0EB3 ; Top_And_Right # Lo LAO VOWEL SIGN AM # IMC has Right, which seems to be a mistake.
# ================================================
# ================================================
# VALUES NOT ASSIGNED IN Indic_Positional_Category
# ================================================
# ================================================
# Indic_Positional_Category=Bottom
0859..085B ; Bottom # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
18A9 ; Bottom # Mn MONGOLIAN LETTER ALI GALI DAGALGA
10AE5 ; Bottom # Mn MANICHAEAN ABBREVIATION MARK ABOVE # Overriden, ccc controls order
10AE6 ; Bottom # Mn MANICHAEAN ABBREVIATION MARK BELOW
10F46..10F47 ; Bottom # Mn [2] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING TWO DOTS BELOW
10F48..10F4A ; Bottom # Mn [3] SOGDIAN COMBINING DOT ABOVE..SOGDIAN COMBINING CURVE ABOVE # Overriden, ccc controls order
10F4B ; Bottom # Mn SOGDIAN COMBINING CURVE BELOW
10F4C ; Bottom # Mn SOGDIAN COMBINING HOOK ABOVE # Overriden, ccc controls order
10F4D..10F50 ; Bottom # Mn [4] SOGDIAN COMBINING HOOK BELOW..SOGDIAN COMBINING STROKE BELOW
10F82 ; Bottom # Mn OLD UYGHUR COMBINING DOT ABOVE # Overriden, ccc controls order
10F83 ; Bottom # Mn OLD UYGHUR COMBINING DOT BELOW
10F84 ; Bottom # Mn OLD UYGHUR COMBINING TWO DOTS ABOVE # Overriden, ccc controls order
10F85 ; Bottom # Mn OLD UYGHUR COMBINING TWO DOTS BELOW
16F4F ; Bottom # Mn MIAO SIGN CONSONANT MODIFIER BAR
16F51..16F87 ; Bottom # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI
16F8F..16F92 ; Bottom # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW
# ================================================
# Indic_Positional_Category=Left
103C ; Left # Mc MYANMAR CONSONANT SIGN MEDIAL RA
# ================================================
# Indic_Positional_Category=Top
07EB..07F3 ; Top # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE
07FD ; Top # Mn NKO DANTAYALAN # Not really top, but assigned here to allow ccc to control mark order
1885..1886 ; Top # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
10D24..10D27 ; Top # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
10EAB..10EAC ; Top # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
16B30..16B36 ; Top # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
1E130..1E136 ; Top # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
1E2AE ; Top # Mn TOTO SIGN RISING TONE
1E2EC..1E2EF ; Top # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
1E944..1E94A ; Top # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
# ================================================
# Indic_Positional_Category=Overstruck
1BC9D..1BC9E ; Overstruck # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
# ================================================
# ================================================
# Deliberately suppressed
# ================================================
# ================================================
# Indic_Positional_Category=NA
180B..180D ; NA # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
180F ; NA # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR
2D7F ; NA # Mn TIFINAGH CONSONANT JOINER

View File

@@ -0,0 +1,105 @@
# IndicShapingInvalidCluster.txt
# Date: 2015-03-12, 21:17:00 GMT [AG]
# Date: 2019-11-08, 23:22:00 GMT [AG]
#
# This file defines the following property:
#
# Indic_Shaping_Invalid_Cluster
#
# Scope: This file enumerates sequences of characters that should be treated as invalid clusters
0905 0946 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN SHORT E
0905 093E ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN AA
0930 094D 0907 ; # DEVANAGARI LETTER RA, DEVANAGARI SIGN VIRAMA, DEVANAGARI LETTER I
0909 0941 ; # DEVANAGARI LETTER U, DEVANAGARI VOWEL SIGN U
090F 0945 ; # DEVANAGARI LETTER E, DEVANAGARI VOWEL SIGN CANDRA E
090F 0946 ; # DEVANAGARI LETTER E, DEVANAGARI VOWEL SIGN SHORT E
090F 0947 ; # DEVANAGARI LETTER E, DEVANAGARI VOWEL SIGN E
0905 0949 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN CANDRA O
0906 0945 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN CANDRA E
0905 094A ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN SHORT O
0906 0946 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN SHORT E
0905 094B ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN O
0906 0947 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN E
0905 094C ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN AU
0906 0948 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN AI
0905 0945 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN CANDRA E
0905 093A ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN OE
0905 093B ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN OOE
0906 093A ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN OE
0905 094F ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN AW
0905 0956 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN UE
0905 0957 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN UUE
0985 09BE ; # BENGALI LETTER A, BENGALI VOWEL SIGN AA
098B 09C3 ; # BENGALI LETTER VOCALIC R, BENGALI VOWEL SIGN VOCALIC R
098C 09E2 ; # BENGALI LETTER VOCALIC L, BENGALI VOWEL SIGN VOCALIC L
0A05 0A3E ; # GURMUKHI LETTER A, GURMUKHI VOWEL SIGN AA
0A72 0A3F ; # GURMUKHI IRI, GURMUKHI VOWEL SIGN I
0A72 0A40 ; # GURMUKHI IRI, GURMUKHI VOWEL SIGN II
0A73 0A41 ; # GURMUKHI URA, GURMUKHI VOWEL SIGN U
0A73 0A42 ; # GURMUKHI URA, GURMUKHI VOWEL SIGN UU
0A72 0A47 ; # GURMUKHI IRI, GURMUKHI VOWEL SIGN EE
0A05 0A48 ; # GURMUKHI LETTER A, GURMUKHI VOWEL SIGN AI
0A73 0A4B ; # GURMUKHI URA, GURMUKHI VOWEL SIGN OO
0A05 0A4C ; # GURMUKHI LETTER A, GURMUKHI VOWEL SIGN AU
0A85 0ABE ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AA
0A85 0AC5 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN CANDRA E
0A85 0AC7 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN E
0A85 0AC8 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AI
0A85 0AC9 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN CANDRA O
0A85 0ACB ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN O
0A85 0ABE 0AC5 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AA, GUJARATI VOWEL SIGN CANDRA E
0A85 0ACC ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AU
0A85 0ABE 0AC8 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AA, GUJARATI VOWEL SIGN AI
0AC5 0ABE ; # GUJARATI VOWEL SIGN CANDRA E, GUJARATI VOWEL SIGN AA
0B05 0B3E ; # ORIYA LETTER A, ORIYA VOWEL SIGN AA
0B0F 0B57 ; # ORIYA LETTER E, ORIYA AU LENGTH MARK
0B13 0B57 ; # ORIYA LETTER O, ORIYA AU LENGTH MARK
0B85 0BC2 ; # TAMIL LETTER A, TAMIL VOWEL SIGN UU
0C12 0C55 ; # TELUGU LETTER O, TELUGU LENGTH MARK
0C12 0C4C ; # TELUGU LETTER O, TELUGU VOWEL SIGN AU
0C3F 0C55 ; # TELUGU VOWEL SIGN I, TELUGU LENGTH MARK
0C46 0C55 ; # TELUGU VOWEL SIGN E, TELUGU LENGTH MARK
0C4A 0C55 ; # TELUGU VOWEL SIGN O, TELUGU LENGTH MARK
0C89 0CBE ; # KANNADA LETTER U, KANNADA VOWEL SIGN AA
0C92 0CCC ; # KANNADA LETTER O, KANNADA VOWEL SIGN AU
0C8B 0CBE ; # KANNADA LETTER VOCALIC R, KANNADA VOWEL SIGN AA
0D07 0D57 ; # MALAYALAM LETTER I, MALAYALAM AU LENGTH MARK
0D09 0D57 ; # MALAYALAM LETTER U, MALAYALAM AU LENGTH MARK
0D0E 0D46 ; # MALAYALAM LETTER E, MALAYALAM VOWEL SIGN E
0D12 0D3E ; # MALAYALAM LETTER O, MALAYALAM VOWEL SIGN AA
0D12 0D57 ; # MALAYALAM LETTER O, MALAYALAM AU LENGTH MARK
0D85 0DCF ; # SINHALA LETTER AYANNA, SINHALA VOWEL SIGN AELA-PILLA
0D85 0DD0 ; # SINHALA LETTER AYANNA, SINHALA VOWEL SIGN KETTI AEDA-PILLA
0D85 0DD1 ; # SINHALA LETTER AYANNA, SINHALA VOWEL SIGN DIGA AEDA-PILLA
0D8B 0DDF ; # SINHALA LETTER UYANNA, SINHALA VOWEL SIGN GAYANUKITTA
0D8D 0DD8 ; # SINHALA LETTER IRUYANNA, SINHALA VOWEL SIGN GAETTA-PILLA
0D8F 0DDF ; # SINHALA LETTER ILUYANNA, SINHALA VOWEL SIGN GAYANUKITTA
0D91 0DCA ; # SINHALA LETTER EYANNA, SINHALA SIGN AL-LAKUNA
0D91 0DD9 ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA
0D91 0DDA ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN DIGA KOMBUVA
0D91 0DDC ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA
0D91 0DDD ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA
0D91 0DDE ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA
0D94 0DDF ; # SINHALA LETTER OYANNA, SINHALA VOWEL SIGN GAYANUKITTA
11005 11038 ; # BRAHMI LETTER A, BRAHMI VOWEL SIGN AA
1100B 1103E ; # BRAHMI LETTER VOCALIC R, BRAHMI VOWEL SIGN VOCALIC R
1100F 11042 ; # BRAHMI LETTER E, BRAHMI VOWEL SIGN E
11680 116AD ; # TAKRI LETTER A, TAKRI VOWEL SIGN AA
11686 116B2 ; # TAKRI LETTER E, TAKRI VOWEL SIGN E
11680 116B4 ; # TAKRI LETTER A, TAKRI VOWEL SIGN O
11680 116B5 ; # TAKRI LETTER A, TAKRI VOWEL SIGN AU
112B0 112E0 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN AA
112B0 112E5 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN E
112B0 112E6 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN AI
112B0 112E7 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN O
112B0 112E8 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN AU
11481 114B0 ; # TIRHUTA LETTER A, TIRHUTA VOWEL SIGN AA
114AA 114B5 ; # TIRHUTA LETTER LA, TIRHUTA VOWEL SIGN VOCALIC R
114AA 114B6 ; # TIRHUTA LETTER LA, TIRHUTA VOWEL SIGN VOCALIC RR
1148B 114BA ; # TIRHUTA LETTER E, TIRHUTA VOWEL SIGN SHORT E
1148D 114BA ; # TIRHUTA LETTER O, TIRHUTA VOWEL SIGN SHORT E
11600 11639 ; # MODI LETTER A, MODI VOWEL SIGN E
11600 1163A ; # MODI LETTER A, MODI VOWEL SIGN AI
11601 11639 ; # MODI LETTER AA, MODI VOWEL SIGN E
11601 1163A ; # MODI LETTER AA, MODI VOWEL SIGN AI

View File

@@ -0,0 +1,221 @@
# Override values For Indic_Syllabic_Category
# Not derivable
# Initial version based on Unicode 7.0 by Andrew Glass 2014-03-17
# Updated for Unicode 10.0 by Andrew Glass 2017-07-25
# Updated for Unicode 12.1 by Andrew Glass 2019-05-24
# Updated for Unicode 13.0 by Andrew Glass 2020-07-28
# Updated for Unicode 14.0 by Andrew Glass 2021-09-25
# ================================================
# OVERRIDES TO ASSIGNED VALUES
# ================================================
# Indic_Syllabic_Category=Bindu
193A ; Bindu # Mn LIMBU SIGN KEMPHRENG
AA29 ; Bindu # Mn  CHAM VOWEL SIGN AA
10A0D ; Bindu # Mn KHAROSHTHI SIGN DOUBLE RING BELOW
# ================================================
# Indic_Syllabic_Category=Consonant
0840..0858 ; Consonant # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN
0F00..0F01 ; Consonant # Lo [2] TIBETAN SYLLABLE OM..TIBETAN MARK GTER YIG MGO TRUNCATED
0F04..0F06 ; Consonant # Po TIBETAN MARK INITIAL YIG MGO MDUN MA..TIBETAN MARK CARET YIG MGO PHUR SHAD MA
19C1..19C7 ; Consonant # Lo [7] NEW TAI LUE LETTER FINAL V..NEW TAI LUE LETTER FINAL B # Reassigned to avoid clustering with a base consonant
25CC ; Consonant # So DOTTED CIRCLE
# ================================================
# Indic_Syllabic_Category=Consonant_Dead
0F7F ; Consonant_Dead # Mc TIBETAN SIGN RNAM BCAD # reassigned so that visarga will form an independent cluster
# ================================================
# Indic_Syllabic_Category=Consonant_Final
0F35 ; Consonant_Final # Mn TIBETAN MARK NGAS BZUNG NYI ZLA
0F37 ; Consonant_Final # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS
0FC6 ; Consonant_Final # Mn TIBETAN SYMBOL PADMA GDAN
# ================================================
# Indic_Syllabic_Category=Consonant_Final_Modifier
1C36 ; Consonant_Final_Modifier # Mn LEPCHA SIGN RAN
# ================================================
# Indic_Syllabic_Category=Gemination_Mark
11134 ; Gemination_Mark # Mc CHAKMA MAAYYAA
# ================================================
# Indic_Syllabic_Category=Nukta
0F71 ; Nukta # Mn TIBETAN VOWEL SIGN AA # Reassigned to get this before an above vowel
10A38..10A3A ; Nukta # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW
# ================================================
# Indic_Syllabic_Category=Tone_Mark
1A7B..1A7C ; Tone_Mark # Mn [2] TAI THAM SIGN MAI SAM..TAI THAM SIGN KHUEN-LUE KARAN
1A7F ; Tone_Mark # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT
# ================================================
# Indic_Syllabic_Category=Vowel_Independent
AAB1 ; Vowel_Independent # Lo TAI VIET VOWEL AA
AABA ; Vowel_Independent # Lo TAI VIET VOWEL UA
AABD ; Vowel_Independent # Lo TAI VIET VOWEL AN
# ================================================
# ================================================
# VALUES NOT ASSIGNED IN Indic_Syllabic_Category
# ================================================
# ================================================
# Indic_Syllabic_Category=Consonant
0800..0815 ; Consonant # Lo [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF
1800 ; Consonant # Po MONGOLIAN BIRGA # Reassigned so that legacy Birga + MFVS sequences still work
1807 ; Consonant # Po MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER
180A ; Consonant # Po MONGOLIAN NIRUGU
1820..1878 ; Consonant # Lo [88] MONGOLIAN LETTER A..MONGOLIAN LETTER CHA WITH TWO DOTS
1843 ; Consonant # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN
2D30..2D67 ; Consonant # Lo [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO
2D6F ; Consonant # Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK
10570..1057A ; Consonant # Lo [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA
1057C..1058A ; Consonant # Lo [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE
1058C..10592 ; Consonant # Lo [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE
10594..10595 ; Consonant # Lo [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE
10597..105A1 ; Consonant # Lo [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA
105A3..105B1 ; Consonant # Lo [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE
105B3..105B9 ; Consonant # Lo [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE
105BB..105BC ; Consonant # Lo [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE
10AC0..10AC7 ; Consonant # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW
10AC9..10AE4 ; Consonant # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW
10D00..10D23 ; Consonant # Lo [36] HANIFI ROHINGYA LETTER A..HANIFI ROHINGYA MARK NA KHONNA
10E80..10EA9 ; Consonant # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
10EB0..10EB1 ; Consonant # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
10F30..10F45 ; Consonant # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
111DA ; Consonant # Lo SHARADA EKAM
#HIEROGLYPHS to be moved to new category
13000..1342E ; Consonant # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
#For the Begin and End segment to be handled fully correctly, the cluster model needs to be modified.
13437..13438 ; Consonant # Lo [2] EGYPTIAN HIEROGLYPH BEGIN SEGMENT..EGYPTIAN HIEROGLYPH END SEGMENT
16B00..16B2F ; Consonant # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU
16F00..16F4A ; Consonant # Lo [75] MIAO LETTER PA..MIAO LETTER RTE
16FE4 ; Consonant # Mn KHITAN SMALL SCRIPT FILLER # Avoids Mn pushing this into VOWEL class
18B00..18CD5 ; Consonant # Lo [470] KHITAN SMALL SCRIPT CHARACTER-18B00..KHITAN SMALL SCRIPT CHARACTER-18CD5
1BC00..1BC6A ; Consonant # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
1BC70..1BC7C ; Consonant # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK
1BC80..1BC88 ; Consonant # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL
1BC90..1BC99 ; Consonant # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW
1E100..1E12C ; Consonant # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
1E137..1E13D ; Consonant # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
1E14E ; Consonant # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
1E14F ; Consonant # So NYIAKENG PUACHUE HMONG CIRCLED CA
1E290..1E2AD ; Consonant # Lo [30] TOTO LETTER PA..TOTO LETTER A
1E2C0..1E2EB ; Consonant # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH
1E900..1E921 ; Consonant # Lu [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA
1E922..1E943 ; Consonant # Ll [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA
1E94B ; Consonant # Lm ADLAM NASALIZATION MARK
# ================================================
# Indic_Syllabic_Category=Consonant_Placeholder
1880..1884 ; Consonant_Placeholder # Lo [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA
# ================================================
# Indic_Syllabic_Category=Gemination_Mark
10D27 ; Gemination_Mark # Mn HANIFI ROHINGYA SIGN TASSI
# ================================================
# Indic_Syllabic_Category=Modifying_Letter
FE00..FE0F ; Modifying_Letter # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16# Need to treat them as isolated bases so they don't merge with a cluster in invalid scenarios
16F50 ; Modifying_Letter # Lo MIAO LETTER NASALIZATION
# ================================================
# Indic_Syllabic_Category=Nukta
0859..085B ; Nukta # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
0F39 ; Nukta # Mn TIBETAN MARK TSA -PHRU # NOW IN UNICODE 10.0
1885..1886 ; Nukta # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
18A9 ; Nukta # Mn MONGOLIAN LETTER ALI GALI DAGALGA
1B6B..1B73 ; Nukta # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG
10AE5..10AE6 ; Nukta # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
16F4F ; Nukta # Mn MIAO SIGN CONSONANT MODIFIER BAR
1BC9D..1BC9E ; Nukta # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
1E944..1E94A ; Nukta # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
10F82..10F85 ; Nukta # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
# ================================================
# Indic_Syllabic_Category=Number
10D30..10D39 ; Number # Nd [10] HANIFI ROHINGYA DIGIT ZERO..HANIFI ROHINGYA DIGIT NINE
10F51..10F54 ; Number # No [4] SOGDIAN NUMBER ONE..SOGDIAN NUMBER ONE HUNDRED
16AC0..16AC9 ; Number # Nd [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE
1E140..1E149 ; Number # Nd [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
1E2F0..1E2F9 ; Number # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
1E950..1E959 ; Number # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
# ================================================
# Indic_Syllabic_Category=Tone_Mark
07EB..07F3 ; Tone_Mark # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE
07FD ; Tone_Mark # Mn NKO DANTAYALAN
0F86..0F87 ; Tone_Mark # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS
17CF ; Tone_Mark # Mn KHMER SIGN AHSDA
10D24..10D26 ; Tone_Mark # Mn [3] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TANA
10F46..10F50 ; Tone_Mark # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
16B30..16B36 ; Tone_Mark # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
16F8F..16F92 ; Tone_Mark # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW
1E130..1E136 ; Tone_Mark # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
1E2AE ; Tone_Mark # Mn TOTO SIGN RISING TONE
1E2EC..1E2EF ; Tone_Mark # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
# ================================================
# Indic_Syllabic_Category=Virama
2D7F ; Virama # Mn TIFINAGH CONSONANT JOINER
13430..13436 ; Virama # Cf [7] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH OVERLAY MIDDLE
# ================================================
# Indic_Syllabic_Category=Vowel_Independent
AAB1 ; Vowel_Independent # Lo TAI VIET VOWEL AA
AABA ; Vowel_Independent # Lo TAI VIET VOWEL UA
AABD ; Vowel_Independent # Lo TAI VIET VOWEL AN
# ================================================
# Indic_Syllabic_Category=Vowel_Dependent
0B55 ; Vowel_Dependent # Mn ORIYA SIGN OVERLINE
10EAB..10EAC ; Vowel_Dependent # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
16F51..16F87 ; Vowel_Dependent # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI
# ================================================
# ================================================
# PROPERTIES NOT ASSIGNED IN Indic_Syllabic_Category
# ================================================
# ================================================
# USE_Syllabic_Category=Hieroglyph
# 13000..1342E ; Hieroglyph # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
# ================================================
# USE_Syllabic_Category=Hieroglyph_Joiner
# 13430..13436 ; Hieroglyph_Joiner # Cf EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH OVERLAY MIDDLE
# ================================================
# USE_Syllabic_Category= Hieroglyph_Segment_Begin
# 13437 ; Hieroglyph_Segment_Begin # Cf EGYPTIAN HIEROGLYPH BEGIN SEGMENT
# ================================================
# USE_Syllabic_Category= Hieroglyph_Segment_End
# 13438 ; Hieroglyph_Segment_End # Cf EGYPTIAN HIEROGLYPH END SEGMENT
# ================================================
# eof