193 lines
7.1 KiB
Python
Executable File
193 lines
7.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
"""
|
|
Generator of the function to prohibit certain vowel sequences.
|
|
|
|
It creates ``preprocess_text_vowel_constraints``, which inserts dotted
|
|
circles into sequences prohibited by the USE script development spec.
|
|
|
|
Based on harfbuzz/src/gen-vowel-constraints.py
|
|
"""
|
|
|
|
import collections
|
|
import io
|
|
import os
|
|
import urllib.request
|
|
|
|
# Fetch the Unicode ``Scripts.txt`` data file into the current working
# directory if it is not already there. The download goes to a temporary name
# and is renamed only once it has completed, so an interrupted transfer cannot
# leave behind a truncated ``Scripts.txt`` that later runs would trust.
if not os.path.exists('Scripts.txt'):
    urllib.request.urlretrieve('https://unicode.org/Public/14.0.0/ucd/Scripts.txt', 'Scripts.txt.part')
    os.replace('Scripts.txt.part', 'Scripts.txt')

# Build two lookup tables from the file:
#   scripts       -- code point (int) -> script name (str)
#   script_order  -- script name -> first code point of the first range listed
#                    for that script (used later as a sort key)
with io.open('Scripts.txt', encoding='utf-8') as f:
    # The first two lines of the file are kept aside as a header.
    scripts_header = [f.readline() for i in range(2)]
    scripts = {}
    script_order = {}
    for line in f:
        # Everything from '#' to the end of the line is a comment.
        line = line.partition('#')[0]
        fields = [x.strip() for x in line.split(';')]
        if len(fields) == 1:
            # No ';' on the line: blank or comment-only, nothing to record.
            continue
        # The first field is a single code point or a "START..END" range,
        # both in hexadecimal.
        uu = fields[0].split('..')
        start = int(uu[0], 16)
        end = start if len(uu) == 1 else int(uu[1], 16)
        script = fields[1]
        for u in range(start, end + 1):
            scripts[u] = script
        # Only the first occurrence of a script determines its order.
        script_order.setdefault(script, start)
|
|
|
|
|
|
class ConstraintSet:
    """A set of prohibited code point sequences.

    The set is stored as a prefix tree and can render itself, via
    ``str()``, as the Rust statements that detect the prohibited sequences
    at the current buffer position.

    Args:
        constraint (List[int]): A prohibited code point sequence.

    """

    # Raised (as an assertion message) when the tree has a shape for which
    # the short ``matched = ...`` form cannot be generated.
    _NOT_GENERAL = 'Cannot use `matched` for this constraint; the general case has not been implemented'

    def __init__(self, constraint):
        # Either a list or a dictionary. As a list of code points, it
        # represents a prohibited code point sequence. As a dictionary,
        # it represents a set of prohibited sequences, where each item
        # represents the set of prohibited sequences starting with the
        # key (a code point) concatenated with any of the values
        # (ConstraintSets).
        self._c = constraint

    def add(self, constraint):
        """Add a constraint to this set.

        Args:
            constraint (List[int]): A prohibited code point sequence. An
                empty sequence is ignored.
        """
        if not constraint:
            return
        first = constraint[0]
        rest = constraint[1:]
        if isinstance(self._c, list):
            if constraint == self._c[:len(constraint)]:
                # The new sequence is a prefix of the stored one: the
                # shorter sequence replaces the longer one.
                self._c = constraint
            elif self._c != constraint[:len(self._c)]:
                # Neither sequence is a prefix of the other: switch to the
                # dictionary form so that both can be stored.
                self._c = {self._c[0]: ConstraintSet(self._c[1:])}
            # Otherwise the stored sequence is a prefix of the new one and
            # nothing changes.
        if isinstance(self._c, dict):
            if first in self._c:
                self._c[first].add(rest)
            else:
                self._c[first] = ConstraintSet(rest)

    def __str__(self, index=0, depth=4):
        """Render this set as Rust source text.

        Args:
            index (int): Offset from the buffer's current position at which
                this (sub)set starts matching.
            depth (int): Nesting depth. It is passed down to nested sets but
                does not influence the generated text.

        Returns:
            str: The generated Rust statements.

        Raises:
            AssertionError: If a zero- or one-element sequence occurs at an
                offset other than 2 or 1 respectively.
        """
        s = []
        if isinstance(self._c, list):
            if len(self._c) == 0:
                assert index == 2, self._NOT_GENERAL
                s.append('matched = true;\n')
            elif len(self._c) == 1:
                assert index == 1, self._NOT_GENERAL
                s.append('matched = 0x{:04X} == buffer.cur({}).glyph_id;\n'.format(self._c[0], index))
            else:
                s.append('if 0x{:04X} == buffer.cur({}).glyph_id &&\n'.format(self._c[0], index))
                if index:
                    # Emit a bounds check before looking past the glyph at
                    # offset ``index``.
                    s.append('buffer.idx + {} < buffer.len &&\n'.format(index + 1))
                last = len(self._c) - 1
                for i, cp in enumerate(self._c[1:], start=1):
                    s.append('0x{:04X} == buffer.cur({}).glyph_id{}\n'.format(
                        cp, index + i, '' if i == last else ' &&'))
                s.append('{\n')
                # Advance past the glyphs up to and including offset
                # ``index`` before the dotted circle is output.
                s.extend(['buffer.next_glyph();\n'] * (index + 1))
                s.append('output_dotted_circle(buffer);\n')
                s.append('}\n')
        else:
            s.append('match buffer.cur({}).glyph_id {{\n'.format(index))
            # Group the first code points by the code generated for what
            # follows them, so that identical bodies share one match arm.
            cases = collections.defaultdict(set)
            for first, rest in sorted(self._c.items()):
                cases[rest.__str__(index + 1, depth + 2)].add(first)
            # Arms are ordered by their smallest label.
            for body, labels in sorted(cases.items(), key=lambda b_ls: min(b_ls[1])):
                ordered = sorted(labels)
                for i, cp in enumerate(ordered):
                    if i == len(ordered) - 1:
                        template = ' 0x{:04X} => {{ {}'
                    else:
                        template = ' 0x{:04X} | {}'
                    # A line break follows every fourth label.
                    s.append(template.format(cp, '\n' if i % 4 == 3 else ''))
                s.append(body)
                s.append('}')
            if cases:
                # Catch-all arm after the last explicit arm.
                s.append('_ => {}')
            s.append('}\n')
        return ''.join(s)
|
|
|
|
|
|
# Read the prohibited sequences and group them by the script of their first
# code point: script name -> ConstraintSet.
constraints = {}
with io.open('ms-use/IndicShapingInvalidCluster.txt', encoding='utf-8') as f:
    # The header is every line up to (not including) the first line that
    # consists of a lone '#'.
    constraints_header = []
    while True:
        raw = f.readline()
        if not raw:
            # End of file reached without a lone '#' line. readline() keeps
            # returning '' at EOF, so stop here instead of looping forever.
            break
        line = raw.strip()
        if line == '#':
            break
        constraints_header.append(line)
    for line in f:
        # Everything from '#' to the end of the line is a comment.
        line = line.partition('#')[0]
        # The sequence is the whitespace-separated hexadecimal code points
        # before the first ';'.
        constraint = [int(cp, 16) for cp in line.split(';')[0].split()]
        if not constraint:
            continue
        assert 2 <= len(constraint), 'Prohibited sequence is too short: {}'.format(constraint)
        script = scripts[constraint[0]]
        if script in constraints:
            constraints[script].add(constraint)
        else:
            constraints[script] = ConstraintSet(constraint)
assert constraints, 'No constraints found'
|
|
|
|
# Emit the generated Rust source on standard output.

# File header and imports.
print('// WARNING: this file was generated by scripts/gen-vowel-constraints.py')
print()
print('use super::buffer::hb_buffer_t;')
print('use super::ot_layout::*;')
print('use super::script;')
print('use crate::BufferFlags;')
print()

# Helper functions called by the generated matching code.
print('fn output_dotted_circle(buffer: &mut hb_buffer_t) {')
print(' buffer.output_glyph(0x25CC);')
print(' {')
print(' let out_idx = buffer.out_len - 1;')
print(' _hb_glyph_info_reset_continuation(&mut buffer.out_info_mut()[out_idx]);')
print(' }')
print('}')
print()
print('fn output_with_dotted_circle(buffer: &mut hb_buffer_t) {')
print(' output_dotted_circle(buffer);')
print(' buffer.next_glyph();')
print('}')
print()

# Opening of the generated entry point.
print('pub fn preprocess_text_vowel_constraints(buffer: &mut hb_buffer_t) {')
print(' if buffer.flags.contains(BufferFlags::DO_NOT_INSERT_DOTTED_CIRCLE) {')
print(' return;')
print(' }')
print()
print(' // UGLY UGLY UGLY business of adding dotted-circle in the middle of')
print(' // vowel-sequences that look like another vowel. Data for each script')
print(' // collected from the USE script development spec.')
print(' //')
print(' // https://github.com/harfbuzz/harfbuzz/issues/1019')
print(' buffer.clear_output();')
print(' match buffer.script {')

# One match arm per script, in the order given by ``script_order``. The loop
# variable has its own name so that the ``constraints`` dictionary is not
# rebound while its items are being iterated.
for script, constraint_set in sorted(constraints.items(), key=lambda s_c: script_order[s_c[0]]):
    print(' Some(script::{}) => {{'.format(script.upper()))
    print(' buffer.idx = 0;')
    print(' while buffer.idx + 1 < buffer.len {')
    print(' #[allow(unused_mut)]')
    print(' let mut matched = false;')
    # The rendered constraint set already ends with a newline.
    print(str(constraint_set), end='')
    print(' buffer.next_glyph();')
    print(' if matched { output_with_dotted_circle(buffer); }')
    print(' }')
    print(' }')
    print()

# Fallback arm and closing of the generated entry point.
print(' _ => {}')
print(' }')
print(' buffer.sync();')
print('}')
print()
|