Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2015 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 17 | #include "minikin/Hyphenator.h" |
| 18 | |
Seigo Nonaka | 1d46158 | 2017-10-19 17:43:56 -0700 | [diff] [blame] | 19 | #include <algorithm> |
| 20 | #include <memory> |
| 21 | #include <string> |
| 22 | #include <vector> |
| 23 | |
| 24 | #include <unicode/uchar.h> |
| 25 | #include <unicode/uscript.h> |
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 26 | |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 27 | #include "minikin/Characters.h" |
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 28 | |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 29 | namespace minikin { |
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 30 | |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 31 | // The following are structs that correspond to tables inside the hyb file format |
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 32 | |
// Version 0 alphabet table of the hyb file: a dense array of alphabet codes.
// alphabetLookup() indexes `data` with (code unit - min_codepoint) and treats a stored value of 0
// as "character is not in the alphabet".
struct AlphabetTable0 {
    uint32_t version;        // first word of the table; Header::alphabetVersion() reads it
    uint32_t min_codepoint;  // lowest code point covered (inclusive)
    uint32_t max_codepoint;  // end of the covered range (exclusive: the lookup rejects c >= max)
    uint8_t data[1];  // actually flexible array, size is known at runtime
};
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 39 | |
// Version 1 alphabet table of the hyb file: `data` holds n_entries packed 32-bit entries.
// alphabetLookup() binary-searches them by code point, so they are expected to be sorted in
// ascending order. Each entry keeps the alphabet code in its low 11 bits and the code point in
// the bits above.
struct AlphabetTable1 {
    uint32_t version;
    uint32_t n_entries;
    uint32_t data[1];  // actually flexible array, size is known at runtime

    // Width of the alphabet-code field at the bottom of an entry.
    static constexpr uint32_t kValueBits = 11;

    // Returns the code point stored in the upper bits of a packed entry.
    static uint32_t codepoint(uint32_t entry) { return entry >> kValueBits; }

    // Returns the alphabet code stored in the low kValueBits bits of a packed entry.
    static uint32_t value(uint32_t entry) {
        return entry & ((static_cast<uint32_t>(1) << kValueBits) - 1);
    }
};
| 48 | |
// Trie table of the hyb file, walked by hyphenateFromCodes().
// For a node index `node` and an alphabet code `c`, the entry at data[node + c] is a valid
// transition only if (entry & char_mask) == c; the next node is then
// (entry & link_mask) >> link_shift, and data[node] >> pattern_shift yields the index of the
// pattern attached to a node (0 meaning "no pattern").
struct Trie {
    uint32_t version;
    uint32_t char_mask;      // selects the character field of an entry
    uint32_t link_shift;     // right shift applied after masking with link_mask
    uint32_t link_mask;      // selects the link (next node) field of an entry
    uint32_t pattern_shift;  // right shift that exposes the pattern index of an entry
    uint32_t n_entries;      // NOTE(review): presumably the element count of data; not read here
    uint32_t data[1];  // actually flexible array, size is known at runtime
};
| 58 | |
// Pattern table of the hyb file. `data` holds packed 32-bit pattern entries; the hyphenation
// numbers themselves live in a byte pool that starts pattern_offset bytes after the beginning of
// this table.
//
// Packed entry layout (most significant bits first):
//   bits 31..26  length of the pattern
//   bits 25..20  shift (number of trailing zeros)
//   bits 19..0   byte offset of the pattern inside the pool
struct Pattern {
    uint32_t version;
    uint32_t n_entries;
    uint32_t pattern_offset;
    uint32_t pattern_size;
    uint32_t data[1];  // actually flexible array, size is known at runtime

    static constexpr uint32_t kLenShift = 26;
    static constexpr uint32_t kShiftShift = 20;
    static constexpr uint32_t kShiftMask = 0x3f;
    static constexpr uint32_t kPoolOffsetMask = 0xfffff;

    // accessors

    // Length field of a packed entry.
    static uint32_t len(uint32_t entry) { return entry >> kLenShift; }

    // Shift field of a packed entry.
    static uint32_t shift(uint32_t entry) { return (entry >> kShiftShift) & kShiftMask; }

    // Address of the first hyphenation number belonging to a packed entry.
    const uint8_t* buf(uint32_t entry) const {
        const uint8_t* tableStart = reinterpret_cast<const uint8_t*>(this);
        const uint8_t* poolStart = tableStart + pattern_offset;
        return poolStart + (entry & kPoolOffsetMask);
    }
};
| 73 | |
| 74 | struct Header { |
| 75 | uint32_t magic; |
| 76 | uint32_t version; |
| 77 | uint32_t alphabet_offset; |
| 78 | uint32_t trie_offset; |
| 79 | uint32_t pattern_offset; |
| 80 | uint32_t file_size; |
| 81 | |
| 82 | // accessors |
| 83 | const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); } |
| 84 | uint32_t alphabetVersion() const { |
| 85 | return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset); |
| 86 | } |
| 87 | const AlphabetTable0* alphabetTable0() const { |
| 88 | return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset); |
| 89 | } |
| 90 | const AlphabetTable1* alphabetTable1() const { |
| 91 | return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset); |
| 92 | } |
Seigo Nonaka | 6c8722e | 2017-11-29 16:37:49 -0800 | [diff] [blame] | 93 | const Trie* trieTable() const { return reinterpret_cast<const Trie*>(bytes() + trie_offset); } |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 94 | const Pattern* patternTable() const { |
| 95 | return reinterpret_cast<const Pattern*>(bytes() + pattern_offset); |
| 96 | } |
| 97 | }; |
| 98 | |
Seigo Nonaka | 5aa870f | 2017-09-01 11:16:44 -0700 | [diff] [blame] | 99 | // static |
Seigo Nonaka | 5aa870f | 2017-09-01 11:16:44 -0700 | [diff] [blame] | 100 | Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix, |
Seigo Nonaka | 6c8722e | 2017-11-29 16:37:49 -0800 | [diff] [blame] | 101 | const std::string& locale) { |
Seigo Nonaka | 5aa870f | 2017-09-01 11:16:44 -0700 | [diff] [blame] | 102 | HyphenationLocale hyphenLocale = HyphenationLocale::OTHER; |
Seigo Nonaka | b76fd0c | 2017-10-12 11:11:43 -0700 | [diff] [blame] | 103 | if (locale == "pl") { |
| 104 | hyphenLocale = HyphenationLocale::POLISH; |
| 105 | } else if (locale == "ca") { |
| 106 | hyphenLocale = HyphenationLocale::CATALAN; |
Seigo Nonaka | f1c8c29 | 2017-10-19 11:07:24 -0700 | [diff] [blame] | 107 | } else if (locale == "sl") { |
Seigo Nonaka | b76fd0c | 2017-10-12 11:11:43 -0700 | [diff] [blame] | 108 | hyphenLocale = HyphenationLocale::SLOVENIAN; |
Seigo Nonaka | 5aa870f | 2017-09-01 11:16:44 -0700 | [diff] [blame] | 109 | } |
| 110 | return new Hyphenator(patternData, minPrefix, minSuffix, hyphenLocale); |
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 111 | } |
| 112 | |
// Stores the arguments verbatim; the pattern data is neither parsed nor validated here.
// patternData may be nullptr, in which case hyphenate() never consults patterns.
// NOTE(review): only the pointer is kept, so presumably the buffer has to outlive this object --
// confirm against the callers.
Hyphenator::Hyphenator(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
                       HyphenationLocale hyphenLocale)
        : mPatternData(patternData),
          mMinPrefix(minPrefix),
          mMinSuffix(minSuffix),
          mHyphenationLocale(hyphenLocale) {}
Seigo Nonaka | 5aa870f | 2017-09-01 11:16:44 -0700 | [diff] [blame] | 119 | |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 120 | void Hyphenator::hyphenate(const U16StringPiece& word, HyphenationType* out) const { |
| 121 | const size_t len = word.size(); |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 122 | const size_t paddedLen = len + 2; // start and stop code each count for 1 |
Seigo Nonaka | 6c8722e | 2017-11-29 16:37:49 -0800 | [diff] [blame] | 123 | if (mPatternData != nullptr && len >= mMinPrefix + mMinSuffix && |
| 124 | paddedLen <= MAX_HYPHENATED_SIZE) { |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 125 | uint16_t alpha_codes[MAX_HYPHENATED_SIZE]; |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 126 | const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word); |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 127 | if (hyphenValue != HyphenationType::DONT_BREAK) { |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 128 | hyphenateFromCodes(alpha_codes, paddedLen, hyphenValue, out); |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 129 | return; |
| 130 | } |
| 131 | // TODO: try NFC normalization |
| 132 | // TODO: handle non-BMP Unicode (requires remapping of offsets) |
| 133 | } |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 134 | // Note that we will always get here if the word contains a hyphen or a soft hyphen, because the |
| 135 | // alphabet is not expected to contain a hyphen or a soft hyphen character, so alphabetLookup |
| 136 | // would return DONT_BREAK. |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 137 | hyphenateWithNoPatterns(word, out); |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 138 | } |
| 139 | |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 140 | // This function determines whether a character is like U+2010 HYPHEN in |
| 141 | // line breaking and usage: a character immediately after which line breaks |
| 142 | // are allowed, but words containing it should not be automatically |
| 143 | // hyphenated using patterns. This is a curated set, created by manually |
| 144 | // inspecting all the characters that have the Unicode line breaking |
| 145 | // property of BA or HY and seeing which ones are hyphens. |
| 146 | bool Hyphenator::isLineBreakingHyphen(uint32_t c) { |
Seigo Nonaka | 6c8722e | 2017-11-29 16:37:49 -0800 | [diff] [blame] | 147 | return (c == 0x002D || // HYPHEN-MINUS |
| 148 | c == 0x058A || // ARMENIAN HYPHEN |
| 149 | c == 0x05BE || // HEBREW PUNCTUATION MAQAF |
| 150 | c == 0x1400 || // CANADIAN SYLLABICS HYPHEN |
| 151 | c == 0x2010 || // HYPHEN |
| 152 | c == 0x2013 || // EN DASH |
| 153 | c == 0x2027 || // HYPHENATION POINT |
| 154 | c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN |
| 155 | c == 0x2E40); // DOUBLE HYPHEN |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 156 | } |
| 157 | |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 158 | EndHyphenEdit editForThisLine(HyphenationType type) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 159 | switch (type) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 160 | case HyphenationType::BREAK_AND_INSERT_HYPHEN: |
Calvin Pan | d70a891 | 2022-07-27 14:52:12 +0800 | [diff] [blame] | 161 | case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE: |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 162 | return EndHyphenEdit::INSERT_HYPHEN; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 163 | case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN: |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 164 | return EndHyphenEdit::INSERT_ARMENIAN_HYPHEN; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 165 | case HyphenationType::BREAK_AND_INSERT_MAQAF: |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 166 | return EndHyphenEdit::INSERT_MAQAF; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 167 | case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN: |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 168 | return EndHyphenEdit::INSERT_UCAS_HYPHEN; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 169 | case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN: |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 170 | return EndHyphenEdit::REPLACE_WITH_HYPHEN; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 171 | case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 172 | return EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN; |
| 173 | case HyphenationType::DONT_BREAK: // Hyphen edit for non breaking case doesn't make sense. |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 174 | default: |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 175 | return EndHyphenEdit::NO_EDIT; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 176 | } |
| 177 | } |
| 178 | |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 179 | StartHyphenEdit editForNextLine(HyphenationType type) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 180 | switch (type) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 181 | case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE: |
Calvin Pan | d70a891 | 2022-07-27 14:52:12 +0800 | [diff] [blame] | 182 | case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE: |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 183 | return StartHyphenEdit::INSERT_HYPHEN; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 184 | case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 185 | return StartHyphenEdit::INSERT_ZWJ; |
| 186 | case HyphenationType::DONT_BREAK: // Hyphen edit for non breaking case doesn't make sense. |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 187 | default: |
Seigo Nonaka | b1363f2 | 2017-11-01 16:02:49 -0700 | [diff] [blame] | 188 | return StartHyphenEdit::NO_EDIT; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 189 | } |
| 190 | } |
| 191 | |
| 192 | static UScriptCode getScript(uint32_t codePoint) { |
| 193 | UErrorCode errorCode = U_ZERO_ERROR; |
| 194 | const UScriptCode script = uscript_getScript(static_cast<UChar32>(codePoint), &errorCode); |
| 195 | if (U_SUCCESS(errorCode)) { |
| 196 | return script; |
| 197 | } else { |
| 198 | return USCRIPT_INVALID_CODE; |
| 199 | } |
| 200 | } |
| 201 | |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 202 | static inline int32_t getJoiningType(UChar32 codepoint) { |
| 203 | return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE); |
| 204 | } |
| 205 | |
| 206 | // Assumption for caller: location must be >= 2 and word[location] == CHAR_SOFT_HYPHEN. |
| 207 | // This function decides if the letters before and after the hyphen should appear as joining. |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 208 | static inline HyphenationType getHyphTypeForArabic(const U16StringPiece& word, size_t location) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 209 | ssize_t i = location; |
| 210 | int32_t type = U_JT_NON_JOINING; |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 211 | while (static_cast<size_t>(i) < word.size() && |
| 212 | (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 213 | i++; |
| 214 | } |
| 215 | if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || type == U_JT_JOIN_CAUSING) { |
| 216 | // The next character is of the type that may join the last character. See if the last |
| 217 | // character is also of the right type. |
Seigo Nonaka | 6c8722e | 2017-11-29 16:37:49 -0800 | [diff] [blame] | 218 | i = location - 2; // Skip the soft hyphen |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 219 | type = U_JT_NON_JOINING; |
| 220 | while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) { |
| 221 | i--; |
| 222 | } |
| 223 | if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || type == U_JT_JOIN_CAUSING) { |
| 224 | return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ; |
| 225 | } |
| 226 | } |
| 227 | return HyphenationType::BREAK_AND_INSERT_HYPHEN; |
| 228 | } |
| 229 | |
Calvin Pan | d70a891 | 2022-07-27 14:52:12 +0800 | [diff] [blame] | 230 | HyphenationType Hyphenator::hyphenationTypeBasedOnScriptAndLocale(uint32_t codePoint) const { |
| 231 | // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen |
| 232 | // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens |
| 233 | // for now to be safe. BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch |
| 234 | // to maqaf for Hebrew, we can simply add a condition here. |
| 235 | const UScriptCode script = getScript(codePoint); |
| 236 | if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL || |
| 237 | script == USCRIPT_TELUGU) { |
| 238 | // Grantha is not included, since we don't support non-BMP hyphenation yet. |
| 239 | return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; |
| 240 | } else if (script == USCRIPT_ARMENIAN) { |
| 241 | return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN; |
| 242 | } else if (script == USCRIPT_CANADIAN_ABORIGINAL) { |
| 243 | return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN; |
| 244 | } else if (isRepeatHyphen(script, mHyphenationLocale)) { |
| 245 | return HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE; |
| 246 | } else { |
| 247 | return HyphenationType::BREAK_AND_INSERT_HYPHEN; |
| 248 | } |
| 249 | } |
| 250 | |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 251 | // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words |
| 252 | // that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections |
| 253 | // 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen). |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 254 | void Hyphenator::hyphenateWithNoPatterns(const U16StringPiece& word, HyphenationType* out) const { |
| 255 | out[0] = HyphenationType::DONT_BREAK; |
| 256 | for (size_t i = 1; i < word.size(); i++) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 257 | const uint16_t prevChar = word[i - 1]; |
| 258 | if (i > 1 && isLineBreakingHyphen(prevChar)) { |
| 259 | // Break after hyphens, but only if they don't start the word. |
| 260 | |
Seigo Nonaka | 6c8722e | 2017-11-29 16:37:49 -0800 | [diff] [blame] | 261 | if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) && |
Calvin Pan | d70a891 | 2022-07-27 14:52:12 +0800 | [diff] [blame] | 262 | isRepeatHyphen(getScript(word[i]), mHyphenationLocale)) { |
Roozbeh Pournader | 237f066 | 2017-10-13 15:20:23 -0700 | [diff] [blame] | 263 | // In Polish and Slovenian, hyphens get repeated at the next line. To be safe, |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 264 | // we will do this only if the next character is Latin. |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 265 | out[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 266 | } else { |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 267 | out[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 268 | } |
| 269 | } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) { |
| 270 | // Break after soft hyphens, but only if they don't start the word (a soft hyphen |
| 271 | // starting the word doesn't give any useful break opportunities). The type of the break |
| 272 | // is based on the script of the character we break on. |
| 273 | if (getScript(word[i]) == USCRIPT_ARABIC) { |
| 274 | // For Arabic, we need to look and see if the characters around the soft hyphen |
| 275 | // actually join. If they don't, we'll just insert a normal hyphen. |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 276 | out[i] = getHyphTypeForArabic(word, i); |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 277 | } else { |
Calvin Pan | d70a891 | 2022-07-27 14:52:12 +0800 | [diff] [blame] | 278 | out[i] = hyphenationTypeBasedOnScriptAndLocale(word[i]); |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 279 | } |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 280 | } else if (prevChar == CHAR_MIDDLE_DOT && mMinPrefix < i && i <= word.size() - mMinSuffix && |
Seigo Nonaka | 6c8722e | 2017-11-29 16:37:49 -0800 | [diff] [blame] | 281 | ((word[i - 2] == 'l' && word[i] == 'l') || |
| 282 | (word[i - 2] == 'L' && word[i] == 'L')) && |
| 283 | mHyphenationLocale == HyphenationLocale::CATALAN) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 284 | // In Catalan, "l·l" should break as "l-" on the first line |
| 285 | // and "l" on the next line. |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 286 | out[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 287 | } else { |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 288 | out[i] = HyphenationType::DONT_BREAK; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 289 | } |
Seigo Nonaka | 6c8722e | 2017-11-29 16:37:49 -0800 | [diff] [blame] | 290 | } |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 291 | } |
| 292 | |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 293 | HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes, |
| 294 | const U16StringPiece& word) const { |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 295 | const Header* header = getHeader(); |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 296 | HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN; |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 297 | // TODO: check header magic |
| 298 | uint32_t alphabetVersion = header->alphabetVersion(); |
| 299 | if (alphabetVersion == 0) { |
| 300 | const AlphabetTable0* alphabet = header->alphabetTable0(); |
| 301 | uint32_t min_codepoint = alphabet->min_codepoint; |
| 302 | uint32_t max_codepoint = alphabet->max_codepoint; |
| 303 | alpha_codes[0] = 0; // word start |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 304 | for (size_t i = 0; i < word.size(); i++) { |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 305 | uint16_t c = word[i]; |
| 306 | if (c < min_codepoint || c >= max_codepoint) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 307 | return HyphenationType::DONT_BREAK; |
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 308 | } |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 309 | uint8_t code = alphabet->data[c - min_codepoint]; |
| 310 | if (code == 0) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 311 | return HyphenationType::DONT_BREAK; |
| 312 | } |
| 313 | if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { |
Calvin Pan | d70a891 | 2022-07-27 14:52:12 +0800 | [diff] [blame] | 314 | result = hyphenationTypeBasedOnScriptAndLocale(c); |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 315 | } |
| 316 | alpha_codes[i + 1] = code; |
| 317 | } |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 318 | alpha_codes[word.size() + 1] = 0; // word termination |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 319 | return result; |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 320 | } else if (alphabetVersion == 1) { |
| 321 | const AlphabetTable1* alphabet = header->alphabetTable1(); |
| 322 | size_t n_entries = alphabet->n_entries; |
| 323 | const uint32_t* begin = alphabet->data; |
| 324 | const uint32_t* end = begin + n_entries; |
| 325 | alpha_codes[0] = 0; |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 326 | for (size_t i = 0; i < word.size(); i++) { |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 327 | uint16_t c = word[i]; |
| 328 | auto p = std::lower_bound(begin, end, c << 11); |
| 329 | if (p == end) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 330 | return HyphenationType::DONT_BREAK; |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 331 | } |
| 332 | uint32_t entry = *p; |
| 333 | if (AlphabetTable1::codepoint(entry) != c) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 334 | return HyphenationType::DONT_BREAK; |
| 335 | } |
| 336 | if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { |
Calvin Pan | d70a891 | 2022-07-27 14:52:12 +0800 | [diff] [blame] | 337 | result = hyphenationTypeBasedOnScriptAndLocale(c); |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 338 | } |
| 339 | alpha_codes[i + 1] = AlphabetTable1::value(entry); |
| 340 | } |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 341 | alpha_codes[word.size() + 1] = 0; |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 342 | return result; |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 343 | } |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 344 | return HyphenationType::DONT_BREAK; |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 345 | } |
| 346 | |
| 347 | /** |
| 348 | * Internal implementation, after conversion to codes. All case folding and normalization |
| 349 | * has been done by now, and all characters have been found in the alphabet. |
| 350 | * Note: len here is the padded length including 0 codes at start and end. |
| 351 | **/ |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 352 | void Hyphenator::hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue, |
| 353 | HyphenationType* out) const { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 354 | static_assert(sizeof(HyphenationType) == sizeof(uint8_t), "HyphnationType must be uint8_t."); |
| 355 | // Reuse the result array as a buffer for calculating intermediate hyphenation numbers. |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 356 | uint8_t* buffer = reinterpret_cast<uint8_t*>(out); |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 357 | |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 358 | const Header* header = getHeader(); |
| 359 | const Trie* trie = header->trieTable(); |
| 360 | const Pattern* pattern = header->patternTable(); |
| 361 | uint32_t char_mask = trie->char_mask; |
| 362 | uint32_t link_shift = trie->link_shift; |
| 363 | uint32_t link_mask = trie->link_mask; |
| 364 | uint32_t pattern_shift = trie->pattern_shift; |
Seigo Nonaka | 5aa870f | 2017-09-01 11:16:44 -0700 | [diff] [blame] | 365 | size_t maxOffset = len - mMinSuffix - 1; |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 366 | for (size_t i = 0; i < len - 1; i++) { |
| 367 | uint32_t node = 0; // index into Trie table |
| 368 | for (size_t j = i; j < len; j++) { |
| 369 | uint16_t c = codes[j]; |
| 370 | uint32_t entry = trie->data[node + c]; |
| 371 | if ((entry & char_mask) == c) { |
| 372 | node = (entry & link_mask) >> link_shift; |
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 373 | } else { |
| 374 | break; |
| 375 | } |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 376 | uint32_t pat_ix = trie->data[node] >> pattern_shift; |
| 377 | // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset |
| 378 | // into the buf pool. This is the pattern for the substring (i..j) we just matched, |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 379 | // which we combine (via point-wise max) into the buffer vector. |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 380 | if (pat_ix != 0) { |
| 381 | uint32_t pat_entry = pattern->data[pat_ix]; |
| 382 | int pat_len = Pattern::len(pat_entry); |
| 383 | int pat_shift = Pattern::shift(pat_entry); |
| 384 | const uint8_t* pat_buf = pattern->buf(pat_entry); |
| 385 | int offset = j + 1 - (pat_len + pat_shift); |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 386 | // offset is the index within buffer that lines up with the start of pat_buf |
Seigo Nonaka | 5aa870f | 2017-09-01 11:16:44 -0700 | [diff] [blame] | 387 | int start = std::max((int)mMinPrefix - offset, 0); |
Raph Levien | f0be43d | 2015-08-27 13:50:00 -0700 | [diff] [blame] | 388 | int end = std::min(pat_len, (int)maxOffset - offset); |
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 389 | for (int k = start; k < end; k++) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 390 | buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]); |
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 391 | } |
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 392 | } |
| 393 | } |
| 394 | } |
| 395 | // Since the above calculation does not modify values outside |
Seigo Nonaka | 5aa870f | 2017-09-01 11:16:44 -0700 | [diff] [blame] | 396 | // [mMinPrefix, len - mMinSuffix], they are left as 0 = DONT_BREAK. |
| 397 | for (size_t i = mMinPrefix; i < maxOffset; i++) { |
Roozbeh Pournader | c7ef400 | 2017-02-17 18:55:02 -0800 | [diff] [blame] | 398 | // Hyphenation opportunities happen when the hyphenation numbers are odd. |
Seigo Nonaka | 524d294 | 2017-12-11 21:24:19 -0800 | [diff] [blame] | 399 | out[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK; |
Raph Levien | 5cdad92 | 2015-03-30 14:20:18 -0700 | [diff] [blame] | 400 | } |
| 401 | } |
| 402 | |
Calvin Pan | d70a891 | 2022-07-27 14:52:12 +0800 | [diff] [blame] | 403 | bool Hyphenator::isRepeatHyphen(UScriptCode script, |
| 404 | HyphenationLocale locale) const { |
| 405 | return script == USCRIPT_LATIN && |
| 406 | (locale == HyphenationLocale::POLISH ||locale == HyphenationLocale::SLOVENIAN); |
| 407 | } |
| 408 | |
Seigo Nonaka | 14e2d13 | 2016-06-09 19:40:58 +0900 | [diff] [blame] | 409 | } // namespace minikin |