cheep-crator-2/vendor/unicode-linebreak/build.rs

460 lines
18 KiB
Rust
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

//! Parses the rules into a state machine using a pair table. Each value in the table specifies the
//! next state and whether it's an forced/allowed break. To handles rules such as
//!
//! B SP* ÷ A
//!
//! the extra state BSP is employed in the pair table friendly equivalent rules
//!
//! (B | BSP) ÷ A, Treat (B | BSP) SP as if it were BSP, Treat BSP as if it were SP
#![recursion_limit = "512"]
use regex::Regex;
use std::env;
use std::error::Error;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::iter;
use std::path::Path;
use std::str::FromStr;
include!("src/shared.rs");
impl FromStr for BreakClass {
type Err = &'static str;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(match s {
"BK" => BK,
"CR" => CR,
"LF" => LF,
"CM" => CM,
"NL" => NL,
"SG" => SG,
"WJ" => WJ,
"ZW" => ZW,
"GL" => GL,
"SP" => SP,
"ZWJ" => ZWJ,
"B2" => B2,
"BA" => BA,
"BB" => BB,
"HY" => HY,
"CB" => CB,
"CL" => CL,
"CP" => CP,
"EX" => EX,
"IN" => IN,
"NS" => NS,
"OP" => OP,
"QU" => QU,
"IS" => IS,
"NU" => NU,
"PO" => PO,
"PR" => PR,
"SY" => SY,
"AI" => AI,
"AL" => AL,
"CJ" => CJ,
"EB" => EB,
"EM" => EM,
"H2" => H2,
"H3" => H3,
"HL" => HL,
"ID" => ID,
"JL" => JL,
"JV" => JV,
"JT" => JT,
"RI" => RI,
"SA" => SA,
"XX" => XX,
_ => return Err("Invalid break class"),
})
}
}
const NUM_CLASSES: usize = 43;
static BREAK_CLASS_TABLE: [&str; NUM_CLASSES] = [
"BK", "CR", "LF", "CM", "NL", "SG", "WJ", "ZW", "GL", "SP", "ZWJ", "B2", "BA", "BB", "HY",
"CB", "CL", "CP", "EX", "IN", "NS", "OP", "QU", "IS", "NU", "PO", "PR", "SY", "AI", "AL", "CJ",
"EB", "EM", "H2", "H3", "HL", "ID", "JL", "JV", "JT", "RI", "SA", "XX",
];
fn default_value(codepoint: u32) -> BreakClass {
match codepoint {
// The unassigned code points in the following blocks default to "ID"
0x3400..=0x4DBF | 0x4E00..=0x9FFF | 0xF900..=0xFAFF => ID,
// All undesignated code points in Planes 2 and 3, whether inside or outside of allocated blocks, default to "ID"
0x20000..=0x2FFFD | 0x30000..=0x3FFFD => ID,
// All unassigned code points in the following Plane 1 range, whether inside or outside of allocated blocks, also default to "ID"
0x1F000..=0x1FAFF | 0x1FC00..=0x1FFFD => ID,
// The unassigned code points in the following block default to "PR"
0x20A0..=0x20CF => PR,
// All code points, assigned and unassigned, that are not listed explicitly are given the value "XX"
_ => XX,
}
}
#[derive(Copy, Clone)]
#[repr(u8)]
enum ExtraState {
ZWSP = sot + 1,
OPSP,
QUSP,
CLSP,
CPSP,
B2SP,
HLHYBA,
RIRI,
}
use ExtraState::*;
/// The number of classes plus the eot state.
const NUM_CLASSES_EOT: usize = NUM_CLASSES + 1;
const NUM_STATES: usize = NUM_CLASSES + 10;
/// Separate implementation to prevent infinite recursion.
#[doc(hidden)]
macro_rules! rules2table_impl {
// Operators
(($len:ident $($args:tt)*) '÷' $($tt:tt)+) => {rules2table_impl! {(NUM_CLASSES_EOT $($args)* '÷') $($tt)+}};
(($len:ident $($args:tt)*) '×' $($tt:tt)+) => {rules2table_impl! {(NUM_CLASSES_EOT $($args)* '×') $($tt)+}};
(($len:ident $($args:tt)*) '!' $($tt:tt)+) => {rules2table_impl! {(NUM_CLASSES_EOT $($args)* '!') $($tt)+}};
// Perform operator
(($len:ident $pair_table:ident $($first:ident)? $operator:literal $($second:ident)?) $(, $($tt:tt)*)?) => {
$(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})?
#[allow(unused)] let first = 0..NUM_STATES; // Default to ALL
$(let first = $first;)?
#[allow(unused)] let second = 0..NUM_CLASSES_EOT; // Default to ALL
$(let second = $second;)?
for i in first {
for j in second.clone() {
let cell = &mut $pair_table[i][j];
match $operator {
'!' => *cell |= ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT,
'÷' => *cell |= ALLOWED_BREAK_BIT,
'×' => *cell &= !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT),
_ => unreachable!("Bad operator"),
}
}
}
};
(($len:ident $($args:tt)*) Treat X $($tt:tt)*) => {
rules2table_impl! {(NUM_CLASSES_EOT $($args)* treat_x) $($tt)*}
};
(($len:ident $($args:tt)*) Treat $($tt:tt)*) => {
rules2table_impl! {(NUM_STATES $($args)* treat) $($tt)*}
};
(($len:ident $($args:tt)*) * as if it were X where X = $($tt:tt)*) => {
rules2table_impl! {(NUM_STATES $($args)* as_if_it_were_x_where_x_is) $($tt)*}
};
(($len:ident $pair_table:ident treat_x $second:ident as_if_it_were_x_where_x_is $X:ident) $(, $($tt:tt)*)?) => {
$(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})?
for i in $X {
for j in $second.clone() {
$pair_table[i][j] = i as u8;
}
}
};
(($len:ident $pair_table:ident treat $first:ident $second:ident) as if it were $cls:ident $(, $($tt:tt)*)?) => {
$(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})?
let cls = $cls as u8;
for i in $first {
for j in $second.clone() {
$pair_table[i][j] = cls;
}
}
};
(($len:ident $pair_table:ident treat $first:ident) as if it were $cls:ident $(, $($tt:tt)*)?) => {
$(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})?
for j in $first.clone().filter(|&j| j < NUM_CLASSES_EOT) {
for row in $pair_table.iter_mut() {
row[j] = row[$cls as usize];
}
}
for i in $first {
$pair_table.copy_within($cls as usize..$cls as usize + 1, i);
}
};
// All classes pattern
(($len:ident $($args:tt)*) ALL $($tt:tt)*) => {
let indices = 0..$len;
rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*}
};
// Single class pattern
(($len:ident $($args:tt)*) $cls:ident $($tt:tt)*) => {
let indices = iter::once($cls as usize);
rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*}
};
// Parse (X | ...) patterns
(($len:ident $($args:tt)*) ($($cls:ident)|+) $($tt:tt)*) => {
let indices = [$($cls as usize),+].iter().cloned();
rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*}
};
// Parse [^ ...] patterns
(($len:ident $($args:tt)*) [^$($cls:ident)+] $($tt:tt)*) => {
let excluded = [$($cls as usize),+];
let indices = (0..$len).filter(|i| !excluded.contains(i)).collect::<Vec<_>>();
let indices = indices.iter().cloned();
rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*}
};
(($len:ident $pair_table:ident)) => {}; // Exit condition
}
/// Returns a pair table conforming to the specified rules.
///
/// The rule syntax is a modified subset of the one in Unicode Standard Annex #14.
macro_rules! rules2table {
($($tt:tt)+) => {{
let mut pair_table = [[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
]; NUM_STATES];
rules2table_impl! {(NUM_STATES pair_table) $($tt)+}
pair_table
}};
}
trait IteratorExt: Iterator {
/// Tests if all elements of the iterator are equal.
fn all_equal(&mut self) -> bool
where
<Self as Iterator>::Item: PartialEq,
Self: Sized,
{
if let Some(first) = self.next() {
self.all(|x| x == first)
} else {
true
}
}
}
impl<I: Iterator> IteratorExt for I {}
fn main() -> Result<(), Box<dyn Error>> {
println!("cargo:rerun-if-changed=LineBreak.txt");
assert!(NUM_STATES <= 0x3F, "Too many states");
let pair_table = rules2table! {
// Non-tailorable Line Breaking Rules
// LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ,
// SA, SG, and XX into other line breaking classes depending on criteria outside the scope
// of this algorithm.
Treat (AI | SG | XX | SA) as if it were AL, Treat CJ as if it were NS,
// Start and end of text:
sot '×', // LB2 Never break at the start of text.
'!' eot, // LB3 Always break at the end of text.
// Mandatory breaks:
BK '!', // LB4 Always break after hard line breaks.
// LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
CR '×' LF, CR '!', LF '!', NL '!',
'×' (BK | CR | LF | NL), // LB6 Do not break before hard line breaks.
// Explicit breaks and non-breaks:
'×' SP, '×' ZW, // LB7 Do not break before spaces or zero width space.
// LB8 Break before any character following a zero-width space, even if one or more spaces
// intervene.
(ZW | ZWSP) '÷', Treat (ZW | ZWSP) SP as if it were ZWSP, Treat ZWSP as if it were SP,
// ZWJ '×', // XXX Handled explicitly // LB8a Do not break after a zero width joiner.
// Combining marks:
// LB9 Do not break a combining character sequence; treat it as if it has the line breaking
// class of the base character in all of the following rules. Treat ZWJ as if it were CM.
Treat X (CM | ZWJ)* as if it were X where X = [^BK CR LF NL SP ZW sot eot ZWSP OPSP QUSP CLSP CPSP B2SP],
Treat (CM | ZWJ) as if it were AL, // LB10 Treat any remaining combining mark or ZWJ as AL.
// Word joiner:
'×' WJ, WJ '×', // LB11 Do not break before or after Word joiner and related characters.
// Non-breaking characters:
GL '×', // LB12 Do not break after NBSP and related characters.
// Tailorable Line Breaking Rules
[^SP BA HY sot eot ZWSP OPSP QUSP CLSP CPSP B2SP] '×' GL, // LB12a Do not break before NBSP and related characters, except after spaces and hyphens.
// LB13 Do not break before ] or ! or ; or /, even after spaces.
'×' CL, '×' CP, '×' EX, '×' IS, '×' SY,
// LB14 Do not break after [, even after spaces.
(OP | OPSP) '×', Treat (OP | OPSP) SP as if it were OPSP, Treat ZWSP as if it were SP,
// LB15 Do not break within ‘”[, even with intervening spaces.
(QU | QUSP) '×' OP, Treat (QU | QUSP) SP as if it were QUSP, Treat QUSP as if it were SP,
// LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
// intervening spaces.
(CL | CLSP | CP | CPSP) '×' NS,
Treat (CL | CLSP) SP as if it were CLSP, Treat CLSP as if it were SP,
Treat (CP | CPSP) SP as if it were CPSP, Treat CPSP as if it were SP,
// LB17 Do not break within ‘——’, even with intervening spaces.
(B2 | B2SP) '×' B2, Treat (B2 | B2SP) SP as if it were B2SP, Treat B2SP as if it were SP,
// Spaces:
SP '÷', // LB18 Break after spaces.
// Special case rules:
'×' QU, QU '×', // LB19 Do not break before or after quotation marks, such as ‘”’.
'÷' CB, CB '÷', // LB20 Break before and after unresolved CB.
// LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana,
// and other non-starters, or after acute accents.
'×' BA, '×' HY, '×' NS, BB '×',
// LB21a Don't break after Hebrew + Hyphen. // XXX Use a single state, HLHYBA, for HLHY and HLBA
HLHYBA '×', Treat HL (HY | BA) as if it were HLHYBA, Treat HLHYBA as if it were HY,
SY '×' HL, // LB21b Dont break between Solidus and Hebrew letters.
'×' IN, // LB22 Do not break before ellipses.
// Numbers:
(AL | HL) '×' NU, NU '×' (AL | HL), // LB23 Do not break between digits and letters.
// LB23a Do not break between numeric prefixes and ideographs, or between ideographs and
// numeric postfixes.
PR '×' (ID | EB | EM), (ID | EB | EM) '×' PO,
// LB24 Do not break between numeric prefix/postfix and letters, or between letters and
// prefix/postfix.
(PR | PO) '×' (AL | HL), (AL | HL) '×' (PR | PO),
// LB25 Do not break between the following pairs of classes relevant to numbers:
CL '×' PO, CP '×' PO, CL '×' PR, CP '×' PR, NU '×' PO, NU '×' PR, PO '×' OP, PO '×' NU, PR '×' OP, PR '×' NU, HY '×' NU, IS '×' NU, NU '×' NU, SY '×' NU,
// Korean syllable blocks
// LB26 Do not break a Korean syllable.
JL '×' (JL | JV | H2 | H3), (JV | H2) '×' (JV | JT), (JT | H3) '×' JT,
// LB27 Treat a Korean Syllable Block the same as ID.
(JL | JV | JT | H2 | H3) '×' IN, (JL | JV | JT | H2 | H3) '×' PO, PR '×' (JL | JV | JT | H2 | H3),
// Finally, join alphabetic letters into words and break everything else.
(AL | HL) '×' (AL | HL), // LB28 Do not break between alphabetics (“at”).
IS '×' (AL | HL), // LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
// LB30 Do not break between letters, numbers, or ordinary symbols and opening or closing
// parentheses.
(AL | HL | NU) '×' OP, CP '×' (AL | HL | NU),
// LB30a Break between two regional indicator symbols if and only if there are an even
// number of regional indicators preceding the position of the break.
RI '×' RI, Treat RI RI as if it were RIRI, Treat RIRI as if it were RI,
EB '×' EM, // LB30b Do not break between an emoji base and an emoji modifier.
'÷' ALL, ALL '÷', // LB31 Break everywhere else.
};
// Synthesize all non-"safe" pairs from pair table. There are generally more safe pairs.
let unsafe_pairs = (0..NUM_CLASSES).into_iter().flat_map(|j| {
(0..NUM_CLASSES).into_iter().filter_map(move |i| {
// All states that could have resulted from break class "i"
let possible_states = pair_table
.iter()
.map(|row| (row[i] & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT)) as usize);
// Check if all state transitions due to "j" are the same
if possible_states.map(|s| pair_table[s][j]).all_equal() {
None
} else {
Some((i, j))
}
})
});
let out_dir = env::var("OUT_DIR")?;
let dest_path = Path::new(&out_dir).join("tables.rs");
let mut stream = BufWriter::new(File::create(&dest_path)?);
stream.write_all(b"static BREAK_PROP_DATA: [[BreakClass; 256]; PAGE_COUNT] = [")?;
let re = Regex::new(
r"(?x)^
(?P<start>[[:xdigit:]]{4,}) # Unicode code point
(?:\.{2}(?P<end>[[:xdigit:]]{4,}))? # End range
;
(?P<lb>\w{2,3}) # Line_Break property",
)?;
let mut values = BufReader::new(File::open("LineBreak.txt")?)
.lines()
.map(Result::unwrap)
.filter(|l| !(l.starts_with('#') || l.is_empty()))
.scan(0, |next, l| {
let caps = re.captures(&l).unwrap();
let start = u32::from_str_radix(&caps["start"], 16).unwrap();
let end = caps
.name("end")
.map(|m| u32::from_str_radix(m.as_str(), 16).unwrap())
.unwrap_or(start);
let lb = caps["lb"].parse().unwrap();
let iter = (*next..=end).map(move |code| {
if code < start {
default_value(code)
} else {
lb
}
});
*next = end + 1;
Some(iter)
})
.flatten();
let mut page = Vec::with_capacity(256);
let mut page_count = 0;
let mut page_indices = Vec::new();
loop {
page.clear();
page.extend(values.by_ref().take(256));
if let Some(&first) = page.first() {
page_indices.push(if page.iter().all_equal() {
first as usize | UNIFORM_PAGE
} else {
writeln!(
stream,
"[{}],",
page.iter()
.copied()
.chain(iter::repeat(XX))
.take(256)
.map(|v| BREAK_CLASS_TABLE[v as usize])
.collect::<Vec<_>>()
.join(",")
)?;
let page_index = page_count;
page_count += 1;
page_index
});
} else {
break;
}
}
writeln!(
stream,
r"];
const PAGE_COUNT: usize = {};
static PAGE_INDICES: [usize; {}] = [",
page_count,
page_indices.len()
)?;
for page_idx in page_indices {
write!(stream, "{},", page_idx)?;
}
write!(
stream,
r"];
static PAIR_TABLE: [[u8; {}]; {}] = [",
NUM_CLASSES_EOT, NUM_STATES
)?;
for row in &pair_table {
write!(stream, "[")?;
for x in row {
write!(stream, "{},", x)?;
}
write!(stream, "],")?;
}
writeln!(
stream,
r"];
fn is_safe_pair(a: BreakClass, b: BreakClass) -> bool {{
!matches!((a, b), {})
}}",
unsafe_pairs
.map(|(i, j)| format!("({}, {})", BREAK_CLASS_TABLE[i], BREAK_CLASS_TABLE[j]))
.collect::<Vec<_>>()
.join("|")
)?;
Ok(())
}