460 lines
18 KiB
Rust
460 lines
18 KiB
Rust
//! Parses the rules into a state machine using a pair table. Each value in the table specifies the
|
||
//! next state and whether it's an forced/allowed break. To handles rules such as
|
||
//!
|
||
//! B SP* ÷ A
|
||
//!
|
||
//! the extra state BSP is employed in the pair table friendly equivalent rules
|
||
//!
|
||
//! (B | BSP) ÷ A, Treat (B | BSP) SP as if it were BSP, Treat BSP as if it were SP
|
||
#![recursion_limit = "512"]
|
||
|
||
use regex::Regex;
|
||
use std::env;
|
||
use std::error::Error;
|
||
use std::fs::File;
|
||
use std::io::{BufRead, BufReader, BufWriter, Write};
|
||
use std::iter;
|
||
use std::path::Path;
|
||
use std::str::FromStr;
|
||
|
||
include!("src/shared.rs");
|
||
|
||
impl FromStr for BreakClass {
|
||
type Err = &'static str;
|
||
|
||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||
Ok(match s {
|
||
"BK" => BK,
|
||
"CR" => CR,
|
||
"LF" => LF,
|
||
"CM" => CM,
|
||
"NL" => NL,
|
||
"SG" => SG,
|
||
"WJ" => WJ,
|
||
"ZW" => ZW,
|
||
"GL" => GL,
|
||
"SP" => SP,
|
||
"ZWJ" => ZWJ,
|
||
"B2" => B2,
|
||
"BA" => BA,
|
||
"BB" => BB,
|
||
"HY" => HY,
|
||
"CB" => CB,
|
||
"CL" => CL,
|
||
"CP" => CP,
|
||
"EX" => EX,
|
||
"IN" => IN,
|
||
"NS" => NS,
|
||
"OP" => OP,
|
||
"QU" => QU,
|
||
"IS" => IS,
|
||
"NU" => NU,
|
||
"PO" => PO,
|
||
"PR" => PR,
|
||
"SY" => SY,
|
||
"AI" => AI,
|
||
"AL" => AL,
|
||
"CJ" => CJ,
|
||
"EB" => EB,
|
||
"EM" => EM,
|
||
"H2" => H2,
|
||
"H3" => H3,
|
||
"HL" => HL,
|
||
"ID" => ID,
|
||
"JL" => JL,
|
||
"JV" => JV,
|
||
"JT" => JT,
|
||
"RI" => RI,
|
||
"SA" => SA,
|
||
"XX" => XX,
|
||
_ => return Err("Invalid break class"),
|
||
})
|
||
}
|
||
}
|
||
|
||
const NUM_CLASSES: usize = 43;
|
||
static BREAK_CLASS_TABLE: [&str; NUM_CLASSES] = [
|
||
"BK", "CR", "LF", "CM", "NL", "SG", "WJ", "ZW", "GL", "SP", "ZWJ", "B2", "BA", "BB", "HY",
|
||
"CB", "CL", "CP", "EX", "IN", "NS", "OP", "QU", "IS", "NU", "PO", "PR", "SY", "AI", "AL", "CJ",
|
||
"EB", "EM", "H2", "H3", "HL", "ID", "JL", "JV", "JT", "RI", "SA", "XX",
|
||
];
|
||
|
||
fn default_value(codepoint: u32) -> BreakClass {
|
||
match codepoint {
|
||
// The unassigned code points in the following blocks default to "ID"
|
||
0x3400..=0x4DBF | 0x4E00..=0x9FFF | 0xF900..=0xFAFF => ID,
|
||
// All undesignated code points in Planes 2 and 3, whether inside or outside of allocated blocks, default to "ID"
|
||
0x20000..=0x2FFFD | 0x30000..=0x3FFFD => ID,
|
||
// All unassigned code points in the following Plane 1 range, whether inside or outside of allocated blocks, also default to "ID"
|
||
0x1F000..=0x1FAFF | 0x1FC00..=0x1FFFD => ID,
|
||
// The unassigned code points in the following block default to "PR"
|
||
0x20A0..=0x20CF => PR,
|
||
// All code points, assigned and unassigned, that are not listed explicitly are given the value "XX"
|
||
_ => XX,
|
||
}
|
||
}
|
||
|
||
#[derive(Copy, Clone)]
|
||
#[repr(u8)]
|
||
enum ExtraState {
|
||
ZWSP = sot + 1,
|
||
OPSP,
|
||
QUSP,
|
||
CLSP,
|
||
CPSP,
|
||
B2SP,
|
||
HLHYBA,
|
||
RIRI,
|
||
}
|
||
|
||
use ExtraState::*;
|
||
|
||
/// The number of classes plus the eot state.
|
||
const NUM_CLASSES_EOT: usize = NUM_CLASSES + 1;
|
||
const NUM_STATES: usize = NUM_CLASSES + 10;
|
||
|
||
/// Separate implementation to prevent infinite recursion.
|
||
#[doc(hidden)]
|
||
macro_rules! rules2table_impl {
|
||
// Operators
|
||
(($len:ident $($args:tt)*) '÷' $($tt:tt)+) => {rules2table_impl! {(NUM_CLASSES_EOT $($args)* '÷') $($tt)+}};
|
||
(($len:ident $($args:tt)*) '×' $($tt:tt)+) => {rules2table_impl! {(NUM_CLASSES_EOT $($args)* '×') $($tt)+}};
|
||
(($len:ident $($args:tt)*) '!' $($tt:tt)+) => {rules2table_impl! {(NUM_CLASSES_EOT $($args)* '!') $($tt)+}};
|
||
|
||
// Perform operator
|
||
(($len:ident $pair_table:ident $($first:ident)? $operator:literal $($second:ident)?) $(, $($tt:tt)*)?) => {
|
||
$(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})?
|
||
|
||
#[allow(unused)] let first = 0..NUM_STATES; // Default to ALL
|
||
$(let first = $first;)?
|
||
#[allow(unused)] let second = 0..NUM_CLASSES_EOT; // Default to ALL
|
||
$(let second = $second;)?
|
||
for i in first {
|
||
for j in second.clone() {
|
||
let cell = &mut $pair_table[i][j];
|
||
match $operator {
|
||
'!' => *cell |= ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT,
|
||
'÷' => *cell |= ALLOWED_BREAK_BIT,
|
||
'×' => *cell &= !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT),
|
||
_ => unreachable!("Bad operator"),
|
||
}
|
||
}
|
||
}
|
||
};
|
||
|
||
(($len:ident $($args:tt)*) Treat X $($tt:tt)*) => {
|
||
rules2table_impl! {(NUM_CLASSES_EOT $($args)* treat_x) $($tt)*}
|
||
};
|
||
(($len:ident $($args:tt)*) Treat $($tt:tt)*) => {
|
||
rules2table_impl! {(NUM_STATES $($args)* treat) $($tt)*}
|
||
};
|
||
(($len:ident $($args:tt)*) * as if it were X where X = $($tt:tt)*) => {
|
||
rules2table_impl! {(NUM_STATES $($args)* as_if_it_were_x_where_x_is) $($tt)*}
|
||
};
|
||
|
||
(($len:ident $pair_table:ident treat_x $second:ident as_if_it_were_x_where_x_is $X:ident) $(, $($tt:tt)*)?) => {
|
||
$(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})?
|
||
|
||
for i in $X {
|
||
for j in $second.clone() {
|
||
$pair_table[i][j] = i as u8;
|
||
}
|
||
}
|
||
};
|
||
(($len:ident $pair_table:ident treat $first:ident $second:ident) as if it were $cls:ident $(, $($tt:tt)*)?) => {
|
||
$(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})?
|
||
|
||
let cls = $cls as u8;
|
||
for i in $first {
|
||
for j in $second.clone() {
|
||
$pair_table[i][j] = cls;
|
||
}
|
||
}
|
||
};
|
||
(($len:ident $pair_table:ident treat $first:ident) as if it were $cls:ident $(, $($tt:tt)*)?) => {
|
||
$(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})?
|
||
|
||
for j in $first.clone().filter(|&j| j < NUM_CLASSES_EOT) {
|
||
for row in $pair_table.iter_mut() {
|
||
row[j] = row[$cls as usize];
|
||
}
|
||
}
|
||
for i in $first {
|
||
$pair_table.copy_within($cls as usize..$cls as usize + 1, i);
|
||
}
|
||
};
|
||
|
||
// All classes pattern
|
||
(($len:ident $($args:tt)*) ALL $($tt:tt)*) => {
|
||
let indices = 0..$len;
|
||
rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*}
|
||
};
|
||
// Single class pattern
|
||
(($len:ident $($args:tt)*) $cls:ident $($tt:tt)*) => {
|
||
let indices = iter::once($cls as usize);
|
||
rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*}
|
||
};
|
||
// Parse (X | ...) patterns
|
||
(($len:ident $($args:tt)*) ($($cls:ident)|+) $($tt:tt)*) => {
|
||
let indices = [$($cls as usize),+].iter().cloned();
|
||
rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*}
|
||
};
|
||
// Parse [^ ...] patterns
|
||
(($len:ident $($args:tt)*) [^$($cls:ident)+] $($tt:tt)*) => {
|
||
let excluded = [$($cls as usize),+];
|
||
let indices = (0..$len).filter(|i| !excluded.contains(i)).collect::<Vec<_>>();
|
||
let indices = indices.iter().cloned();
|
||
rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*}
|
||
};
|
||
|
||
(($len:ident $pair_table:ident)) => {}; // Exit condition
|
||
}
|
||
|
||
/// Returns a pair table conforming to the specified rules.
|
||
///
|
||
/// The rule syntax is a modified subset of the one in Unicode Standard Annex #14.
|
||
macro_rules! rules2table {
|
||
($($tt:tt)+) => {{
|
||
let mut pair_table = [[
|
||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
|
||
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
|
||
]; NUM_STATES];
|
||
rules2table_impl! {(NUM_STATES pair_table) $($tt)+}
|
||
pair_table
|
||
}};
|
||
}
|
||
|
||
trait IteratorExt: Iterator {
|
||
/// Tests if all elements of the iterator are equal.
|
||
fn all_equal(&mut self) -> bool
|
||
where
|
||
<Self as Iterator>::Item: PartialEq,
|
||
Self: Sized,
|
||
{
|
||
if let Some(first) = self.next() {
|
||
self.all(|x| x == first)
|
||
} else {
|
||
true
|
||
}
|
||
}
|
||
}
|
||
|
||
impl<I: Iterator> IteratorExt for I {}
|
||
|
||
fn main() -> Result<(), Box<dyn Error>> {
|
||
println!("cargo:rerun-if-changed=LineBreak.txt");
|
||
assert!(NUM_STATES <= 0x3F, "Too many states");
|
||
|
||
let pair_table = rules2table! {
|
||
// Non-tailorable Line Breaking Rules
|
||
// LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ,
|
||
// SA, SG, and XX into other line breaking classes depending on criteria outside the scope
|
||
// of this algorithm.
|
||
Treat (AI | SG | XX | SA) as if it were AL, Treat CJ as if it were NS,
|
||
// Start and end of text:
|
||
sot '×', // LB2 Never break at the start of text.
|
||
'!' eot, // LB3 Always break at the end of text.
|
||
// Mandatory breaks:
|
||
BK '!', // LB4 Always break after hard line breaks.
|
||
// LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
|
||
CR '×' LF, CR '!', LF '!', NL '!',
|
||
'×' (BK | CR | LF | NL), // LB6 Do not break before hard line breaks.
|
||
// Explicit breaks and non-breaks:
|
||
'×' SP, '×' ZW, // LB7 Do not break before spaces or zero width space.
|
||
// LB8 Break before any character following a zero-width space, even if one or more spaces
|
||
// intervene.
|
||
(ZW | ZWSP) '÷', Treat (ZW | ZWSP) SP as if it were ZWSP, Treat ZWSP as if it were SP,
|
||
// ZWJ '×', // XXX Handled explicitly // LB8a Do not break after a zero width joiner.
|
||
// Combining marks:
|
||
// LB9 Do not break a combining character sequence; treat it as if it has the line breaking
|
||
// class of the base character in all of the following rules. Treat ZWJ as if it were CM.
|
||
Treat X (CM | ZWJ)* as if it were X where X = [^BK CR LF NL SP ZW sot eot ZWSP OPSP QUSP CLSP CPSP B2SP],
|
||
Treat (CM | ZWJ) as if it were AL, // LB10 Treat any remaining combining mark or ZWJ as AL.
|
||
// Word joiner:
|
||
'×' WJ, WJ '×', // LB11 Do not break before or after Word joiner and related characters.
|
||
// Non-breaking characters:
|
||
GL '×', // LB12 Do not break after NBSP and related characters.
|
||
|
||
// Tailorable Line Breaking Rules
|
||
[^SP BA HY sot eot ZWSP OPSP QUSP CLSP CPSP B2SP] '×' GL, // LB12a Do not break before NBSP and related characters, except after spaces and hyphens.
|
||
// LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.
|
||
'×' CL, '×' CP, '×' EX, '×' IS, '×' SY,
|
||
// LB14 Do not break after ‘[’, even after spaces.
|
||
(OP | OPSP) '×', Treat (OP | OPSP) SP as if it were OPSP, Treat ZWSP as if it were SP,
|
||
// LB15 Do not break within ‘”[’, even with intervening spaces.
|
||
(QU | QUSP) '×' OP, Treat (QU | QUSP) SP as if it were QUSP, Treat QUSP as if it were SP,
|
||
// LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
|
||
// intervening spaces.
|
||
(CL | CLSP | CP | CPSP) '×' NS,
|
||
Treat (CL | CLSP) SP as if it were CLSP, Treat CLSP as if it were SP,
|
||
Treat (CP | CPSP) SP as if it were CPSP, Treat CPSP as if it were SP,
|
||
// LB17 Do not break within ‘——’, even with intervening spaces.
|
||
(B2 | B2SP) '×' B2, Treat (B2 | B2SP) SP as if it were B2SP, Treat B2SP as if it were SP,
|
||
// Spaces:
|
||
SP '÷', // LB18 Break after spaces.
|
||
// Special case rules:
|
||
'×' QU, QU '×', // LB19 Do not break before or after quotation marks, such as ‘”’.
|
||
'÷' CB, CB '÷', // LB20 Break before and after unresolved CB.
|
||
// LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana,
|
||
// and other non-starters, or after acute accents.
|
||
'×' BA, '×' HY, '×' NS, BB '×',
|
||
// LB21a Don't break after Hebrew + Hyphen. // XXX Use a single state, HLHYBA, for HLHY and HLBA
|
||
HLHYBA '×', Treat HL (HY | BA) as if it were HLHYBA, Treat HLHYBA as if it were HY,
|
||
SY '×' HL, // LB21b Don’t break between Solidus and Hebrew letters.
|
||
'×' IN, // LB22 Do not break before ellipses.
|
||
// Numbers:
|
||
(AL | HL) '×' NU, NU '×' (AL | HL), // LB23 Do not break between digits and letters.
|
||
// LB23a Do not break between numeric prefixes and ideographs, or between ideographs and
|
||
// numeric postfixes.
|
||
PR '×' (ID | EB | EM), (ID | EB | EM) '×' PO,
|
||
// LB24 Do not break between numeric prefix/postfix and letters, or between letters and
|
||
// prefix/postfix.
|
||
(PR | PO) '×' (AL | HL), (AL | HL) '×' (PR | PO),
|
||
// LB25 Do not break between the following pairs of classes relevant to numbers:
|
||
CL '×' PO, CP '×' PO, CL '×' PR, CP '×' PR, NU '×' PO, NU '×' PR, PO '×' OP, PO '×' NU, PR '×' OP, PR '×' NU, HY '×' NU, IS '×' NU, NU '×' NU, SY '×' NU,
|
||
// Korean syllable blocks
|
||
// LB26 Do not break a Korean syllable.
|
||
JL '×' (JL | JV | H2 | H3), (JV | H2) '×' (JV | JT), (JT | H3) '×' JT,
|
||
// LB27 Treat a Korean Syllable Block the same as ID.
|
||
(JL | JV | JT | H2 | H3) '×' IN, (JL | JV | JT | H2 | H3) '×' PO, PR '×' (JL | JV | JT | H2 | H3),
|
||
// Finally, join alphabetic letters into words and break everything else.
|
||
(AL | HL) '×' (AL | HL), // LB28 Do not break between alphabetics (“at”).
|
||
IS '×' (AL | HL), // LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
|
||
// LB30 Do not break between letters, numbers, or ordinary symbols and opening or closing
|
||
// parentheses.
|
||
(AL | HL | NU) '×' OP, CP '×' (AL | HL | NU),
|
||
// LB30a Break between two regional indicator symbols if and only if there are an even
|
||
// number of regional indicators preceding the position of the break.
|
||
RI '×' RI, Treat RI RI as if it were RIRI, Treat RIRI as if it were RI,
|
||
EB '×' EM, // LB30b Do not break between an emoji base and an emoji modifier.
|
||
'÷' ALL, ALL '÷', // LB31 Break everywhere else.
|
||
};
|
||
|
||
// Synthesize all non-"safe" pairs from pair table. There are generally more safe pairs.
|
||
let unsafe_pairs = (0..NUM_CLASSES).into_iter().flat_map(|j| {
|
||
(0..NUM_CLASSES).into_iter().filter_map(move |i| {
|
||
// All states that could have resulted from break class "i"
|
||
let possible_states = pair_table
|
||
.iter()
|
||
.map(|row| (row[i] & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT)) as usize);
|
||
// Check if all state transitions due to "j" are the same
|
||
if possible_states.map(|s| pair_table[s][j]).all_equal() {
|
||
None
|
||
} else {
|
||
Some((i, j))
|
||
}
|
||
})
|
||
});
|
||
|
||
let out_dir = env::var("OUT_DIR")?;
|
||
let dest_path = Path::new(&out_dir).join("tables.rs");
|
||
let mut stream = BufWriter::new(File::create(&dest_path)?);
|
||
|
||
stream.write_all(b"static BREAK_PROP_DATA: [[BreakClass; 256]; PAGE_COUNT] = [")?;
|
||
|
||
let re = Regex::new(
|
||
r"(?x)^
|
||
(?P<start>[[:xdigit:]]{4,}) # Unicode code point
|
||
(?:\.{2}(?P<end>[[:xdigit:]]{4,}))? # End range
|
||
;
|
||
(?P<lb>\w{2,3}) # Line_Break property",
|
||
)?;
|
||
|
||
let mut values = BufReader::new(File::open("LineBreak.txt")?)
|
||
.lines()
|
||
.map(Result::unwrap)
|
||
.filter(|l| !(l.starts_with('#') || l.is_empty()))
|
||
.scan(0, |next, l| {
|
||
let caps = re.captures(&l).unwrap();
|
||
let start = u32::from_str_radix(&caps["start"], 16).unwrap();
|
||
let end = caps
|
||
.name("end")
|
||
.map(|m| u32::from_str_radix(m.as_str(), 16).unwrap())
|
||
.unwrap_or(start);
|
||
let lb = caps["lb"].parse().unwrap();
|
||
|
||
let iter = (*next..=end).map(move |code| {
|
||
if code < start {
|
||
default_value(code)
|
||
} else {
|
||
lb
|
||
}
|
||
});
|
||
*next = end + 1;
|
||
Some(iter)
|
||
})
|
||
.flatten();
|
||
|
||
let mut page = Vec::with_capacity(256);
|
||
let mut page_count = 0;
|
||
let mut page_indices = Vec::new();
|
||
loop {
|
||
page.clear();
|
||
page.extend(values.by_ref().take(256));
|
||
|
||
if let Some(&first) = page.first() {
|
||
page_indices.push(if page.iter().all_equal() {
|
||
first as usize | UNIFORM_PAGE
|
||
} else {
|
||
writeln!(
|
||
stream,
|
||
"[{}],",
|
||
page.iter()
|
||
.copied()
|
||
.chain(iter::repeat(XX))
|
||
.take(256)
|
||
.map(|v| BREAK_CLASS_TABLE[v as usize])
|
||
.collect::<Vec<_>>()
|
||
.join(",")
|
||
)?;
|
||
let page_index = page_count;
|
||
page_count += 1;
|
||
page_index
|
||
});
|
||
} else {
|
||
break;
|
||
}
|
||
}
|
||
|
||
writeln!(
|
||
stream,
|
||
r"];
|
||
|
||
const PAGE_COUNT: usize = {};
|
||
static PAGE_INDICES: [usize; {}] = [",
|
||
page_count,
|
||
page_indices.len()
|
||
)?;
|
||
for page_idx in page_indices {
|
||
write!(stream, "{},", page_idx)?;
|
||
}
|
||
write!(
|
||
stream,
|
||
r"];
|
||
|
||
static PAIR_TABLE: [[u8; {}]; {}] = [",
|
||
NUM_CLASSES_EOT, NUM_STATES
|
||
)?;
|
||
for row in &pair_table {
|
||
write!(stream, "[")?;
|
||
for x in row {
|
||
write!(stream, "{},", x)?;
|
||
}
|
||
write!(stream, "],")?;
|
||
}
|
||
writeln!(
|
||
stream,
|
||
r"];
|
||
|
||
fn is_safe_pair(a: BreakClass, b: BreakClass) -> bool {{
|
||
!matches!((a, b), {})
|
||
}}",
|
||
unsafe_pairs
|
||
.map(|(i, j)| format!("({}, {})", BREAK_CLASS_TABLE[i], BREAK_CLASS_TABLE[j]))
|
||
.collect::<Vec<_>>()
|
||
.join("|")
|
||
)?;
|
||
|
||
Ok(())
|
||
}
|