| //! Parses the rules into a state machine using a pair table. Each value in the table specifies the |
| //! next state and whether it's an forced/allowed break. To handles rules such as |
| //! |
| //! B SP* ÷ A |
| //! |
| //! the extra state BSP is employed in the pair table friendly equivalent rules |
| //! |
| //! (B | BSP) ÷ A, Treat (B | BSP) SP as if it were BSP, Treat BSP as if it were SP |
| #![recursion_limit = "512"] |
| |
| use regex::Regex; |
| use std::env; |
| use std::error::Error; |
| use std::fs::File; |
| use std::io::{BufRead, BufReader, BufWriter, Write}; |
| use std::iter; |
| use std::path::Path; |
| use std::str::FromStr; |
| |
| include!("src/shared.rs"); |
| |
| impl FromStr for BreakClass { |
| type Err = &'static str; |
| |
| fn from_str(s: &str) -> Result<Self, Self::Err> { |
| Ok(match s { |
| "BK" => BK, |
| "CR" => CR, |
| "LF" => LF, |
| "CM" => CM, |
| "NL" => NL, |
| "SG" => SG, |
| "WJ" => WJ, |
| "ZW" => ZW, |
| "GL" => GL, |
| "SP" => SP, |
| "ZWJ" => ZWJ, |
| "B2" => B2, |
| "BA" => BA, |
| "BB" => BB, |
| "HY" => HY, |
| "CB" => CB, |
| "CL" => CL, |
| "CP" => CP, |
| "EX" => EX, |
| "IN" => IN, |
| "NS" => NS, |
| "OP" => OP, |
| "QU" => QU, |
| "IS" => IS, |
| "NU" => NU, |
| "PO" => PO, |
| "PR" => PR, |
| "SY" => SY, |
| "AI" => AI, |
| "AL" => AL, |
| "CJ" => CJ, |
| "EB" => EB, |
| "EM" => EM, |
| "H2" => H2, |
| "H3" => H3, |
| "HL" => HL, |
| "ID" => ID, |
| "JL" => JL, |
| "JV" => JV, |
| "JT" => JT, |
| "RI" => RI, |
| "SA" => SA, |
| "XX" => XX, |
| _ => return Err("Invalid break class"), |
| }) |
| } |
| } |
| |
| const NUM_CLASSES: usize = 43; |
| static BREAK_CLASS_TABLE: [&str; NUM_CLASSES] = [ |
| "BK", "CR", "LF", "CM", "NL", "SG", "WJ", "ZW", "GL", "SP", "ZWJ", "B2", "BA", "BB", "HY", |
| "CB", "CL", "CP", "EX", "IN", "NS", "OP", "QU", "IS", "NU", "PO", "PR", "SY", "AI", "AL", "CJ", |
| "EB", "EM", "H2", "H3", "HL", "ID", "JL", "JV", "JT", "RI", "SA", "XX", |
| ]; |
| |
| fn default_value(codepoint: u32) -> BreakClass { |
| match codepoint { |
| // The unassigned code points in the following blocks default to "ID" |
| 0x3400..=0x4DBF | 0x4E00..=0x9FFF | 0xF900..=0xFAFF => ID, |
| // All undesignated code points in Planes 2 and 3, whether inside or outside of allocated blocks, default to "ID" |
| 0x20000..=0x2FFFD | 0x30000..=0x3FFFD => ID, |
| // All unassigned code points in the following Plane 1 range, whether inside or outside of allocated blocks, also default to "ID" |
| 0x1F000..=0x1FAFF | 0x1FC00..=0x1FFFD => ID, |
| // The unassigned code points in the following block default to "PR" |
| 0x20A0..=0x20CF => PR, |
| // All code points, assigned and unassigned, that are not listed explicitly are given the value "XX" |
| _ => XX, |
| } |
| } |
| |
| #[derive(Copy, Clone)] |
| #[repr(u8)] |
| enum ExtraState { |
| ZWSP = sot + 1, |
| OPSP, |
| QUSP, |
| CLSP, |
| CPSP, |
| B2SP, |
| HLHYBA, |
| RIRI, |
| } |
| |
| use ExtraState::*; |
| |
| /// The number of classes plus the eot state. |
| const NUM_CLASSES_EOT: usize = NUM_CLASSES + 1; |
| const NUM_STATES: usize = NUM_CLASSES + 10; |
| |
| /// Separate implementation to prevent infinite recursion. |
| #[doc(hidden)] |
| macro_rules! rules2table_impl { |
| // Operators |
| (($len:ident $($args:tt)*) '÷' $($tt:tt)+) => {rules2table_impl! {(NUM_CLASSES_EOT $($args)* '÷') $($tt)+}}; |
| (($len:ident $($args:tt)*) '×' $($tt:tt)+) => {rules2table_impl! {(NUM_CLASSES_EOT $($args)* '×') $($tt)+}}; |
| (($len:ident $($args:tt)*) '!' $($tt:tt)+) => {rules2table_impl! {(NUM_CLASSES_EOT $($args)* '!') $($tt)+}}; |
| |
| // Perform operator |
| (($len:ident $pair_table:ident $($first:ident)? $operator:literal $($second:ident)?) $(, $($tt:tt)*)?) => { |
| $(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})? |
| |
| #[allow(unused)] let first = 0..NUM_STATES; // Default to ALL |
| $(let first = $first;)? |
| #[allow(unused)] let second = 0..NUM_CLASSES_EOT; // Default to ALL |
| $(let second = $second;)? |
| for i in first { |
| for j in second.clone() { |
| let cell = &mut $pair_table[i][j]; |
| match $operator { |
| '!' => *cell |= ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT, |
| '÷' => *cell |= ALLOWED_BREAK_BIT, |
| '×' => *cell &= !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT), |
| _ => unreachable!("Bad operator"), |
| } |
| } |
| } |
| }; |
| |
| (($len:ident $($args:tt)*) Treat X $($tt:tt)*) => { |
| rules2table_impl! {(NUM_CLASSES_EOT $($args)* treat_x) $($tt)*} |
| }; |
| (($len:ident $($args:tt)*) Treat $($tt:tt)*) => { |
| rules2table_impl! {(NUM_STATES $($args)* treat) $($tt)*} |
| }; |
| (($len:ident $($args:tt)*) * as if it were X where X = $($tt:tt)*) => { |
| rules2table_impl! {(NUM_STATES $($args)* as_if_it_were_x_where_x_is) $($tt)*} |
| }; |
| |
| (($len:ident $pair_table:ident treat_x $second:ident as_if_it_were_x_where_x_is $X:ident) $(, $($tt:tt)*)?) => { |
| $(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})? |
| |
| for i in $X { |
| for j in $second.clone() { |
| $pair_table[i][j] = i as u8; |
| } |
| } |
| }; |
| (($len:ident $pair_table:ident treat $first:ident $second:ident) as if it were $cls:ident $(, $($tt:tt)*)?) => { |
| $(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})? |
| |
| let cls = $cls as u8; |
| for i in $first { |
| for j in $second.clone() { |
| $pair_table[i][j] = cls; |
| } |
| } |
| }; |
| (($len:ident $pair_table:ident treat $first:ident) as if it were $cls:ident $(, $($tt:tt)*)?) => { |
| $(rules2table_impl! {(NUM_STATES $pair_table) $($tt)*})? |
| |
| for j in $first.clone().filter(|&j| j < NUM_CLASSES_EOT) { |
| for row in $pair_table.iter_mut() { |
| row[j] = row[$cls as usize]; |
| } |
| } |
| for i in $first { |
| $pair_table.copy_within($cls as usize..$cls as usize + 1, i); |
| } |
| }; |
| |
| // All classes pattern |
| (($len:ident $($args:tt)*) ALL $($tt:tt)*) => { |
| let indices = 0..$len; |
| rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*} |
| }; |
| // Single class pattern |
| (($len:ident $($args:tt)*) $cls:ident $($tt:tt)*) => { |
| let indices = iter::once($cls as usize); |
| rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*} |
| }; |
| // Parse (X | ...) patterns |
| (($len:ident $($args:tt)*) ($($cls:ident)|+) $($tt:tt)*) => { |
| let indices = [$($cls as usize),+].iter().cloned(); |
| rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*} |
| }; |
| // Parse [^ ...] patterns |
| (($len:ident $($args:tt)*) [^$($cls:ident)+] $($tt:tt)*) => { |
| let excluded = [$($cls as usize),+]; |
| let indices = (0..$len).filter(|i| !excluded.contains(i)).collect::<Vec<_>>(); |
| let indices = indices.iter().cloned(); |
| rules2table_impl! {(NUM_CLASSES_EOT $($args)* indices) $($tt)*} |
| }; |
| |
| (($len:ident $pair_table:ident)) => {}; // Exit condition |
| } |
| |
| /// Returns a pair table conforming to the specified rules. |
| /// |
| /// The rule syntax is a modified subset of the one in Unicode Standard Annex #14. |
| macro_rules! rules2table { |
| ($($tt:tt)+) => {{ |
| let mut pair_table = [[ |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, |
| 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, |
| ]; NUM_STATES]; |
| rules2table_impl! {(NUM_STATES pair_table) $($tt)+} |
| pair_table |
| }}; |
| } |
| |
| trait IteratorExt: Iterator { |
| /// Tests if all elements of the iterator are equal. |
| fn all_equal(&mut self) -> bool |
| where |
| <Self as Iterator>::Item: PartialEq, |
| Self: Sized, |
| { |
| if let Some(first) = self.next() { |
| self.all(|x| x == first) |
| } else { |
| true |
| } |
| } |
| } |
| |
| impl<I: Iterator> IteratorExt for I {} |
| |
| fn main() -> Result<(), Box<dyn Error>> { |
| println!("cargo:rerun-if-changed=LineBreak.txt"); |
| assert!(NUM_STATES <= 0x3F, "Too many states"); |
| |
| let pair_table = rules2table! { |
| // Non-tailorable Line Breaking Rules |
| // LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ, |
| // SA, SG, and XX into other line breaking classes depending on criteria outside the scope |
| // of this algorithm. |
| Treat (AI | SG | XX | SA) as if it were AL, Treat CJ as if it were NS, |
| // Start and end of text: |
| sot '×', // LB2 Never break at the start of text. |
| '!' eot, // LB3 Always break at the end of text. |
| // Mandatory breaks: |
| BK '!', // LB4 Always break after hard line breaks. |
| // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks. |
| CR '×' LF, CR '!', LF '!', NL '!', |
| '×' (BK | CR | LF | NL), // LB6 Do not break before hard line breaks. |
| // Explicit breaks and non-breaks: |
| '×' SP, '×' ZW, // LB7 Do not break before spaces or zero width space. |
| // LB8 Break before any character following a zero-width space, even if one or more spaces |
| // intervene. |
| (ZW | ZWSP) '÷', Treat (ZW | ZWSP) SP as if it were ZWSP, Treat ZWSP as if it were SP, |
| // ZWJ '×', // XXX Handled explicitly // LB8a Do not break after a zero width joiner. |
| // Combining marks: |
| // LB9 Do not break a combining character sequence; treat it as if it has the line breaking |
| // class of the base character in all of the following rules. Treat ZWJ as if it were CM. |
| Treat X (CM | ZWJ)* as if it were X where X = [^BK CR LF NL SP ZW sot eot ZWSP OPSP QUSP CLSP CPSP B2SP], |
| Treat (CM | ZWJ) as if it were AL, // LB10 Treat any remaining combining mark or ZWJ as AL. |
| // Word joiner: |
| '×' WJ, WJ '×', // LB11 Do not break before or after Word joiner and related characters. |
| // Non-breaking characters: |
| GL '×', // LB12 Do not break after NBSP and related characters. |
| |
| // Tailorable Line Breaking Rules |
| [^SP BA HY sot eot ZWSP OPSP QUSP CLSP CPSP B2SP] '×' GL, // LB12a Do not break before NBSP and related characters, except after spaces and hyphens. |
| // LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. |
| '×' CL, '×' CP, '×' EX, '×' IS, '×' SY, |
| // LB14 Do not break after ‘[’, even after spaces. |
| (OP | OPSP) '×', Treat (OP | OPSP) SP as if it were OPSP, Treat ZWSP as if it were SP, |
| // LB15 Do not break within ‘”[’, even with intervening spaces. |
| (QU | QUSP) '×' OP, Treat (QU | QUSP) SP as if it were QUSP, Treat QUSP as if it were SP, |
| // LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with |
| // intervening spaces. |
| (CL | CLSP | CP | CPSP) '×' NS, |
| Treat (CL | CLSP) SP as if it were CLSP, Treat CLSP as if it were SP, |
| Treat (CP | CPSP) SP as if it were CPSP, Treat CPSP as if it were SP, |
| // LB17 Do not break within ‘——’, even with intervening spaces. |
| (B2 | B2SP) '×' B2, Treat (B2 | B2SP) SP as if it were B2SP, Treat B2SP as if it were SP, |
| // Spaces: |
| SP '÷', // LB18 Break after spaces. |
| // Special case rules: |
| '×' QU, QU '×', // LB19 Do not break before or after quotation marks, such as ‘”’. |
| '÷' CB, CB '÷', // LB20 Break before and after unresolved CB. |
| // LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana, |
| // and other non-starters, or after acute accents. |
| '×' BA, '×' HY, '×' NS, BB '×', |
| // LB21a Don't break after Hebrew + Hyphen. // XXX Use a single state, HLHYBA, for HLHY and HLBA |
| HLHYBA '×', Treat HL (HY | BA) as if it were HLHYBA, Treat HLHYBA as if it were HY, |
| SY '×' HL, // LB21b Don’t break between Solidus and Hebrew letters. |
| '×' IN, // LB22 Do not break before ellipses. |
| // Numbers: |
| (AL | HL) '×' NU, NU '×' (AL | HL), // LB23 Do not break between digits and letters. |
| // LB23a Do not break between numeric prefixes and ideographs, or between ideographs and |
| // numeric postfixes. |
| PR '×' (ID | EB | EM), (ID | EB | EM) '×' PO, |
| // LB24 Do not break between numeric prefix/postfix and letters, or between letters and |
| // prefix/postfix. |
| (PR | PO) '×' (AL | HL), (AL | HL) '×' (PR | PO), |
| // LB25 Do not break between the following pairs of classes relevant to numbers: |
| CL '×' PO, CP '×' PO, CL '×' PR, CP '×' PR, NU '×' PO, NU '×' PR, PO '×' OP, PO '×' NU, PR '×' OP, PR '×' NU, HY '×' NU, IS '×' NU, NU '×' NU, SY '×' NU, |
| // Korean syllable blocks |
| // LB26 Do not break a Korean syllable. |
| JL '×' (JL | JV | H2 | H3), (JV | H2) '×' (JV | JT), (JT | H3) '×' JT, |
| // LB27 Treat a Korean Syllable Block the same as ID. |
| (JL | JV | JT | H2 | H3) '×' IN, (JL | JV | JT | H2 | H3) '×' PO, PR '×' (JL | JV | JT | H2 | H3), |
| // Finally, join alphabetic letters into words and break everything else. |
| (AL | HL) '×' (AL | HL), // LB28 Do not break between alphabetics (“at”). |
| IS '×' (AL | HL), // LB29 Do not break between numeric punctuation and alphabetics (“e.g.”). |
| // LB30 Do not break between letters, numbers, or ordinary symbols and opening or closing |
| // parentheses. |
| (AL | HL | NU) '×' OP, CP '×' (AL | HL | NU), |
| // LB30a Break between two regional indicator symbols if and only if there are an even |
| // number of regional indicators preceding the position of the break. |
| RI '×' RI, Treat RI RI as if it were RIRI, Treat RIRI as if it were RI, |
| EB '×' EM, // LB30b Do not break between an emoji base and an emoji modifier. |
| '÷' ALL, ALL '÷', // LB31 Break everywhere else. |
| }; |
| |
| // Synthesize all non-"safe" pairs from pair table. There are generally more safe pairs. |
| let unsafe_pairs = (0..NUM_CLASSES).into_iter().flat_map(|j| { |
| (0..NUM_CLASSES).into_iter().filter_map(move |i| { |
| // All states that could have resulted from break class "i" |
| let possible_states = pair_table |
| .iter() |
| .map(|row| (row[i] & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT)) as usize); |
| // Check if all state transitions due to "j" are the same |
| if possible_states.map(|s| pair_table[s][j]).all_equal() { |
| None |
| } else { |
| Some((i, j)) |
| } |
| }) |
| }); |
| |
| let out_dir = env::var("OUT_DIR")?; |
| let dest_path = Path::new(&out_dir).join("tables.rs"); |
| let mut stream = BufWriter::new(File::create(&dest_path)?); |
| |
| stream.write_all(b"static BREAK_PROP_DATA: [[BreakClass; 256]; PAGE_COUNT] = [")?; |
| |
| let re = Regex::new( |
| r"(?x)^ |
| (?P<start>[[:xdigit:]]{4,}) # Unicode code point |
| (?:\.{2}(?P<end>[[:xdigit:]]{4,}))? # End range |
| ; |
| (?P<lb>\w{2,3}) # Line_Break property", |
| )?; |
| |
| let mut values = BufReader::new(File::open("LineBreak.txt")?) |
| .lines() |
| .map(Result::unwrap) |
| .filter(|l| !(l.starts_with('#') || l.is_empty())) |
| .scan(0, |next, l| { |
| let caps = re.captures(&l).unwrap(); |
| let start = u32::from_str_radix(&caps["start"], 16).unwrap(); |
| let end = caps |
| .name("end") |
| .map(|m| u32::from_str_radix(m.as_str(), 16).unwrap()) |
| .unwrap_or(start); |
| let lb = caps["lb"].parse().unwrap(); |
| |
| let iter = (*next..=end).map(move |code| { |
| if code < start { |
| default_value(code) |
| } else { |
| lb |
| } |
| }); |
| *next = end + 1; |
| Some(iter) |
| }) |
| .flatten(); |
| |
| let mut page = Vec::with_capacity(256); |
| let mut page_count = 0; |
| let mut page_indices = Vec::new(); |
| loop { |
| page.clear(); |
| page.extend(values.by_ref().take(256)); |
| |
| if let Some(&first) = page.first() { |
| page_indices.push(if page.iter().all_equal() { |
| first as usize | UNIFORM_PAGE |
| } else { |
| writeln!( |
| stream, |
| "[{}],", |
| page.iter() |
| .copied() |
| .chain(iter::repeat(XX)) |
| .take(256) |
| .map(|v| BREAK_CLASS_TABLE[v as usize]) |
| .collect::<Vec<_>>() |
| .join(",") |
| )?; |
| let page_index = page_count; |
| page_count += 1; |
| page_index |
| }); |
| } else { |
| break; |
| } |
| } |
| |
| writeln!( |
| stream, |
| r"]; |
| |
| const PAGE_COUNT: usize = {}; |
| static PAGE_INDICES: [usize; {}] = [", |
| page_count, |
| page_indices.len() |
| )?; |
| for page_idx in page_indices { |
| write!(stream, "{},", page_idx)?; |
| } |
| write!( |
| stream, |
| r"]; |
| |
| static PAIR_TABLE: [[u8; {}]; {}] = [", |
| NUM_CLASSES_EOT, NUM_STATES |
| )?; |
| for row in &pair_table { |
| write!(stream, "[")?; |
| for x in row { |
| write!(stream, "{},", x)?; |
| } |
| write!(stream, "],")?; |
| } |
| writeln!( |
| stream, |
| r"]; |
| |
| fn is_safe_pair(a: BreakClass, b: BreakClass) -> bool {{ |
| !matches!((a, b), {}) |
| }}", |
| unsafe_pairs |
| .map(|(i, j)| format!("({}, {})", BREAK_CLASS_TABLE[i], BREAK_CLASS_TABLE[j])) |
| .collect::<Vec<_>>() |
| .join("|") |
| )?; |
| |
| Ok(()) |
| } |