blob: 098e6e81cfb45a99e34670be5e5d15080d84fa24 [file] [log] [blame]
/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
Raph Levien5cdad922015-03-30 14:20:18 -070017#include "minikin/Hyphenator.h"
18
Seigo Nonaka1d461582017-10-19 17:43:56 -070019#include <algorithm>
20#include <memory>
21#include <string>
22#include <vector>
23
24#include <unicode/uchar.h>
25#include <unicode/uscript.h>
Raph Levien5cdad922015-03-30 14:20:18 -070026
Seigo Nonakab1363f22017-11-01 16:02:49 -070027#include "minikin/Characters.h"
Raph Levien5cdad922015-03-30 14:20:18 -070028
Seigo Nonakab1363f22017-11-01 16:02:49 -070029namespace minikin {
Raph Levien5cdad922015-03-30 14:20:18 -070030
// The following structs correspond to the tables stored inside the hyb file format.
Raph Levien5cdad922015-03-30 14:20:18 -070032
// Version 0 alphabet table: a dense array of alphabet codes covering one contiguous range of
// code points. The lookup in this file accepts a character c only when
// min_codepoint <= c < max_codepoint and reads its code from data[c - min_codepoint]; a stored
// code of 0 means the character is not part of the alphabet.
struct AlphabetTable0 {
    uint32_t version;        // table format version; 0 selects this layout
    uint32_t min_codepoint;  // first code point covered by data (inclusive)
    uint32_t max_codepoint;  // end of the covered range (exclusive)
    uint8_t data[1];         // actually flexible array, size is known at runtime
};
Raph Levien5cdad922015-03-30 14:20:18 -070039
// Version 1 alphabet table: a list of packed 32-bit entries. Each entry stores a code point in
// its upper bits and the corresponding alphabet code in its low 11 bits. The lookup in this
// file binary-searches data, so the entries are expected to be sorted in ascending order.
struct AlphabetTable1 {
    uint32_t version;    // table format version; 1 selects this layout
    uint32_t n_entries;  // number of packed entries in data
    uint32_t data[1];    // actually flexible array, size is known at runtime

    // Number of low bits of an entry that hold the alphabet code.
    static constexpr uint32_t kValueBits = 11;

    // Extracts the code point stored in the upper bits of a packed entry.
    static uint32_t codepoint(uint32_t entry) { return entry >> kValueBits; }

    // Extracts the alphabet code stored in the low bits of a packed entry.
    static uint32_t value(uint32_t entry) { return entry & ((1u << kValueBits) - 1u); }
};
48
// Packed trie used to match hyphenation patterns. Each element of data is one 32-bit word that
// is decoded with the masks and shifts below: the pattern matcher in this file compares
// (entry & char_mask) against the current alphabet code, follows
// (entry & link_mask) >> link_shift to the next node, and reads a pattern index from
// data[node] >> pattern_shift.
struct Trie {
    uint32_t version;
    uint32_t char_mask;      // selects the character code bits of an entry
    uint32_t link_shift;     // right shift applied after masking with link_mask
    uint32_t link_mask;      // selects the next-node link bits of an entry
    uint32_t pattern_shift;  // right shift that yields the pattern index of a node
    uint32_t n_entries;
    uint32_t data[1];  // actually flexible array, size is known at runtime
};
58
// Table of hyphenation patterns. Each element of data packs three fields into 32 bits:
//   bits 26..31  length of the pattern, in bytes
//   bits 20..25  shift (number of trailing zeros dropped from the pattern)
//   bits  0..19  byte offset of the pattern inside the pattern pool
// The pool itself starts pattern_offset bytes after the beginning of this struct.
struct Pattern {
    uint32_t version;
    uint32_t n_entries;
    uint32_t pattern_offset;
    uint32_t pattern_size;
    uint32_t data[1];  // actually flexible array, size is known at runtime

    // accessors

    // Length field of a packed entry.
    static uint32_t len(uint32_t entry) {
        constexpr uint32_t kLenShift = 26;
        return entry >> kLenShift;
    }

    // Shift field of a packed entry.
    static uint32_t shift(uint32_t entry) {
        constexpr uint32_t kShiftShift = 20;
        constexpr uint32_t kShiftMask = 0x3f;
        return (entry >> kShiftShift) & kShiftMask;
    }

    // Address of the pattern bytes that a packed entry refers to.
    const uint8_t* buf(uint32_t entry) const {
        constexpr uint32_t kPoolOffsetMask = 0xfffff;
        const uint8_t* const base = reinterpret_cast<const uint8_t*>(this);
        return base + pattern_offset + (entry & kPoolOffsetMask);
    }
};
73
74struct Header {
75 uint32_t magic;
76 uint32_t version;
77 uint32_t alphabet_offset;
78 uint32_t trie_offset;
79 uint32_t pattern_offset;
80 uint32_t file_size;
81
82 // accessors
83 const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
84 uint32_t alphabetVersion() const {
85 return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
86 }
87 const AlphabetTable0* alphabetTable0() const {
88 return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
89 }
90 const AlphabetTable1* alphabetTable1() const {
91 return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
92 }
Seigo Nonaka6c8722e2017-11-29 16:37:49 -080093 const Trie* trieTable() const { return reinterpret_cast<const Trie*>(bytes() + trie_offset); }
Raph Levienf0be43d2015-08-27 13:50:00 -070094 const Pattern* patternTable() const {
95 return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
96 }
97};
98
Seigo Nonaka5aa870f2017-09-01 11:16:44 -070099// static
Seigo Nonaka5aa870f2017-09-01 11:16:44 -0700100Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
Seigo Nonaka6c8722e2017-11-29 16:37:49 -0800101 const std::string& locale) {
Seigo Nonaka5aa870f2017-09-01 11:16:44 -0700102 HyphenationLocale hyphenLocale = HyphenationLocale::OTHER;
Seigo Nonakab76fd0c2017-10-12 11:11:43 -0700103 if (locale == "pl") {
104 hyphenLocale = HyphenationLocale::POLISH;
105 } else if (locale == "ca") {
106 hyphenLocale = HyphenationLocale::CATALAN;
Seigo Nonakaf1c8c292017-10-19 11:07:24 -0700107 } else if (locale == "sl") {
Seigo Nonakab76fd0c2017-10-12 11:11:43 -0700108 hyphenLocale = HyphenationLocale::SLOVENIAN;
Seigo Nonaka5aa870f2017-09-01 11:16:44 -0700109 }
110 return new Hyphenator(patternData, minPrefix, minSuffix, hyphenLocale);
Raph Levien5cdad922015-03-30 14:20:18 -0700111}
112
Seigo Nonaka5aa870f2017-09-01 11:16:44 -0700113Hyphenator::Hyphenator(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
Seigo Nonaka6c8722e2017-11-29 16:37:49 -0800114 HyphenationLocale hyphenLocale)
115 : mPatternData(patternData),
116 mMinPrefix(minPrefix),
117 mMinSuffix(minSuffix),
118 mHyphenationLocale(hyphenLocale) {}
Seigo Nonaka5aa870f2017-09-01 11:16:44 -0700119
Seigo Nonaka524d2942017-12-11 21:24:19 -0800120void Hyphenator::hyphenate(const U16StringPiece& word, HyphenationType* out) const {
121 const size_t len = word.size();
Raph Levienf0be43d2015-08-27 13:50:00 -0700122 const size_t paddedLen = len + 2; // start and stop code each count for 1
Seigo Nonaka6c8722e2017-11-29 16:37:49 -0800123 if (mPatternData != nullptr && len >= mMinPrefix + mMinSuffix &&
124 paddedLen <= MAX_HYPHENATED_SIZE) {
Raph Levienf0be43d2015-08-27 13:50:00 -0700125 uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
Seigo Nonaka524d2942017-12-11 21:24:19 -0800126 const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word);
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800127 if (hyphenValue != HyphenationType::DONT_BREAK) {
Seigo Nonaka524d2942017-12-11 21:24:19 -0800128 hyphenateFromCodes(alpha_codes, paddedLen, hyphenValue, out);
Raph Levienf0be43d2015-08-27 13:50:00 -0700129 return;
130 }
131 // TODO: try NFC normalization
132 // TODO: handle non-BMP Unicode (requires remapping of offsets)
133 }
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800134 // Note that we will always get here if the word contains a hyphen or a soft hyphen, because the
135 // alphabet is not expected to contain a hyphen or a soft hyphen character, so alphabetLookup
136 // would return DONT_BREAK.
Seigo Nonaka524d2942017-12-11 21:24:19 -0800137 hyphenateWithNoPatterns(word, out);
Raph Levienf0be43d2015-08-27 13:50:00 -0700138}
139
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800140// This function determines whether a character is like U+2010 HYPHEN in
141// line breaking and usage: a character immediately after which line breaks
142// are allowed, but words containing it should not be automatically
143// hyphenated using patterns. This is a curated set, created by manually
144// inspecting all the characters that have the Unicode line breaking
145// property of BA or HY and seeing which ones are hyphens.
146bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
Seigo Nonaka6c8722e2017-11-29 16:37:49 -0800147 return (c == 0x002D || // HYPHEN-MINUS
148 c == 0x058A || // ARMENIAN HYPHEN
149 c == 0x05BE || // HEBREW PUNCTUATION MAQAF
150 c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
151 c == 0x2010 || // HYPHEN
152 c == 0x2013 || // EN DASH
153 c == 0x2027 || // HYPHENATION POINT
154 c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
155 c == 0x2E40); // DOUBLE HYPHEN
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800156}
157
Seigo Nonakab1363f22017-11-01 16:02:49 -0700158EndHyphenEdit editForThisLine(HyphenationType type) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800159 switch (type) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800160 case HyphenationType::BREAK_AND_INSERT_HYPHEN:
Calvin Pand70a8912022-07-27 14:52:12 +0800161 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE:
Seigo Nonakab1363f22017-11-01 16:02:49 -0700162 return EndHyphenEdit::INSERT_HYPHEN;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800163 case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
Seigo Nonakab1363f22017-11-01 16:02:49 -0700164 return EndHyphenEdit::INSERT_ARMENIAN_HYPHEN;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800165 case HyphenationType::BREAK_AND_INSERT_MAQAF:
Seigo Nonakab1363f22017-11-01 16:02:49 -0700166 return EndHyphenEdit::INSERT_MAQAF;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800167 case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN:
Seigo Nonakab1363f22017-11-01 16:02:49 -0700168 return EndHyphenEdit::INSERT_UCAS_HYPHEN;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800169 case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN:
Seigo Nonakab1363f22017-11-01 16:02:49 -0700170 return EndHyphenEdit::REPLACE_WITH_HYPHEN;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800171 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
Seigo Nonakab1363f22017-11-01 16:02:49 -0700172 return EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN;
173 case HyphenationType::DONT_BREAK: // Hyphen edit for non breaking case doesn't make sense.
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800174 default:
Seigo Nonakab1363f22017-11-01 16:02:49 -0700175 return EndHyphenEdit::NO_EDIT;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800176 }
177}
178
Seigo Nonakab1363f22017-11-01 16:02:49 -0700179StartHyphenEdit editForNextLine(HyphenationType type) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800180 switch (type) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800181 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
Calvin Pand70a8912022-07-27 14:52:12 +0800182 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE:
Seigo Nonakab1363f22017-11-01 16:02:49 -0700183 return StartHyphenEdit::INSERT_HYPHEN;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800184 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
Seigo Nonakab1363f22017-11-01 16:02:49 -0700185 return StartHyphenEdit::INSERT_ZWJ;
186 case HyphenationType::DONT_BREAK: // Hyphen edit for non breaking case doesn't make sense.
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800187 default:
Seigo Nonakab1363f22017-11-01 16:02:49 -0700188 return StartHyphenEdit::NO_EDIT;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800189 }
190}
191
192static UScriptCode getScript(uint32_t codePoint) {
193 UErrorCode errorCode = U_ZERO_ERROR;
194 const UScriptCode script = uscript_getScript(static_cast<UChar32>(codePoint), &errorCode);
195 if (U_SUCCESS(errorCode)) {
196 return script;
197 } else {
198 return USCRIPT_INVALID_CODE;
199 }
200}
201
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800202static inline int32_t getJoiningType(UChar32 codepoint) {
203 return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
204}
205
206// Assumption for caller: location must be >= 2 and word[location] == CHAR_SOFT_HYPHEN.
207// This function decides if the letters before and after the hyphen should appear as joining.
Seigo Nonaka524d2942017-12-11 21:24:19 -0800208static inline HyphenationType getHyphTypeForArabic(const U16StringPiece& word, size_t location) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800209 ssize_t i = location;
210 int32_t type = U_JT_NON_JOINING;
Seigo Nonaka524d2942017-12-11 21:24:19 -0800211 while (static_cast<size_t>(i) < word.size() &&
212 (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800213 i++;
214 }
215 if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || type == U_JT_JOIN_CAUSING) {
216 // The next character is of the type that may join the last character. See if the last
217 // character is also of the right type.
Seigo Nonaka6c8722e2017-11-29 16:37:49 -0800218 i = location - 2; // Skip the soft hyphen
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800219 type = U_JT_NON_JOINING;
220 while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
221 i--;
222 }
223 if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || type == U_JT_JOIN_CAUSING) {
224 return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ;
225 }
226 }
227 return HyphenationType::BREAK_AND_INSERT_HYPHEN;
228}
229
Calvin Pand70a8912022-07-27 14:52:12 +0800230HyphenationType Hyphenator::hyphenationTypeBasedOnScriptAndLocale(uint32_t codePoint) const {
231 // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
232 // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
233 // for now to be safe. BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
234 // to maqaf for Hebrew, we can simply add a condition here.
235 const UScriptCode script = getScript(codePoint);
236 if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL ||
237 script == USCRIPT_TELUGU) {
238 // Grantha is not included, since we don't support non-BMP hyphenation yet.
239 return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
240 } else if (script == USCRIPT_ARMENIAN) {
241 return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
242 } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
243 return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
244 } else if (isRepeatHyphen(script, mHyphenationLocale)) {
245 return HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE;
246 } else {
247 return HyphenationType::BREAK_AND_INSERT_HYPHEN;
248 }
249}
250
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800251// Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words
252// that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections
253// 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen).
Seigo Nonaka524d2942017-12-11 21:24:19 -0800254void Hyphenator::hyphenateWithNoPatterns(const U16StringPiece& word, HyphenationType* out) const {
255 out[0] = HyphenationType::DONT_BREAK;
256 for (size_t i = 1; i < word.size(); i++) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800257 const uint16_t prevChar = word[i - 1];
258 if (i > 1 && isLineBreakingHyphen(prevChar)) {
259 // Break after hyphens, but only if they don't start the word.
260
Seigo Nonaka6c8722e2017-11-29 16:37:49 -0800261 if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) &&
Calvin Pand70a8912022-07-27 14:52:12 +0800262 isRepeatHyphen(getScript(word[i]), mHyphenationLocale)) {
Roozbeh Pournader237f0662017-10-13 15:20:23 -0700263 // In Polish and Slovenian, hyphens get repeated at the next line. To be safe,
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800264 // we will do this only if the next character is Latin.
Seigo Nonaka524d2942017-12-11 21:24:19 -0800265 out[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800266 } else {
Seigo Nonaka524d2942017-12-11 21:24:19 -0800267 out[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800268 }
269 } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) {
270 // Break after soft hyphens, but only if they don't start the word (a soft hyphen
271 // starting the word doesn't give any useful break opportunities). The type of the break
272 // is based on the script of the character we break on.
273 if (getScript(word[i]) == USCRIPT_ARABIC) {
274 // For Arabic, we need to look and see if the characters around the soft hyphen
275 // actually join. If they don't, we'll just insert a normal hyphen.
Seigo Nonaka524d2942017-12-11 21:24:19 -0800276 out[i] = getHyphTypeForArabic(word, i);
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800277 } else {
Calvin Pand70a8912022-07-27 14:52:12 +0800278 out[i] = hyphenationTypeBasedOnScriptAndLocale(word[i]);
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800279 }
Seigo Nonaka524d2942017-12-11 21:24:19 -0800280 } else if (prevChar == CHAR_MIDDLE_DOT && mMinPrefix < i && i <= word.size() - mMinSuffix &&
Seigo Nonaka6c8722e2017-11-29 16:37:49 -0800281 ((word[i - 2] == 'l' && word[i] == 'l') ||
282 (word[i - 2] == 'L' && word[i] == 'L')) &&
283 mHyphenationLocale == HyphenationLocale::CATALAN) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800284 // In Catalan, "l·l" should break as "l-" on the first line
285 // and "l" on the next line.
Seigo Nonaka524d2942017-12-11 21:24:19 -0800286 out[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800287 } else {
Seigo Nonaka524d2942017-12-11 21:24:19 -0800288 out[i] = HyphenationType::DONT_BREAK;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800289 }
Seigo Nonaka6c8722e2017-11-29 16:37:49 -0800290 }
Raph Levienf0be43d2015-08-27 13:50:00 -0700291}
292
Seigo Nonaka524d2942017-12-11 21:24:19 -0800293HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes,
294 const U16StringPiece& word) const {
Raph Levienf0be43d2015-08-27 13:50:00 -0700295 const Header* header = getHeader();
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800296 HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN;
Raph Levienf0be43d2015-08-27 13:50:00 -0700297 // TODO: check header magic
298 uint32_t alphabetVersion = header->alphabetVersion();
299 if (alphabetVersion == 0) {
300 const AlphabetTable0* alphabet = header->alphabetTable0();
301 uint32_t min_codepoint = alphabet->min_codepoint;
302 uint32_t max_codepoint = alphabet->max_codepoint;
303 alpha_codes[0] = 0; // word start
Seigo Nonaka524d2942017-12-11 21:24:19 -0800304 for (size_t i = 0; i < word.size(); i++) {
Raph Levienf0be43d2015-08-27 13:50:00 -0700305 uint16_t c = word[i];
306 if (c < min_codepoint || c >= max_codepoint) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800307 return HyphenationType::DONT_BREAK;
Raph Levien5cdad922015-03-30 14:20:18 -0700308 }
Raph Levienf0be43d2015-08-27 13:50:00 -0700309 uint8_t code = alphabet->data[c - min_codepoint];
310 if (code == 0) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800311 return HyphenationType::DONT_BREAK;
312 }
313 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
Calvin Pand70a8912022-07-27 14:52:12 +0800314 result = hyphenationTypeBasedOnScriptAndLocale(c);
Raph Levienf0be43d2015-08-27 13:50:00 -0700315 }
316 alpha_codes[i + 1] = code;
317 }
Seigo Nonaka524d2942017-12-11 21:24:19 -0800318 alpha_codes[word.size() + 1] = 0; // word termination
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800319 return result;
Raph Levienf0be43d2015-08-27 13:50:00 -0700320 } else if (alphabetVersion == 1) {
321 const AlphabetTable1* alphabet = header->alphabetTable1();
322 size_t n_entries = alphabet->n_entries;
323 const uint32_t* begin = alphabet->data;
324 const uint32_t* end = begin + n_entries;
325 alpha_codes[0] = 0;
Seigo Nonaka524d2942017-12-11 21:24:19 -0800326 for (size_t i = 0; i < word.size(); i++) {
Raph Levienf0be43d2015-08-27 13:50:00 -0700327 uint16_t c = word[i];
328 auto p = std::lower_bound(begin, end, c << 11);
329 if (p == end) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800330 return HyphenationType::DONT_BREAK;
Raph Levienf0be43d2015-08-27 13:50:00 -0700331 }
332 uint32_t entry = *p;
333 if (AlphabetTable1::codepoint(entry) != c) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800334 return HyphenationType::DONT_BREAK;
335 }
336 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
Calvin Pand70a8912022-07-27 14:52:12 +0800337 result = hyphenationTypeBasedOnScriptAndLocale(c);
Raph Levienf0be43d2015-08-27 13:50:00 -0700338 }
339 alpha_codes[i + 1] = AlphabetTable1::value(entry);
340 }
Seigo Nonaka524d2942017-12-11 21:24:19 -0800341 alpha_codes[word.size() + 1] = 0;
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800342 return result;
Raph Levienf0be43d2015-08-27 13:50:00 -0700343 }
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800344 return HyphenationType::DONT_BREAK;
Raph Levienf0be43d2015-08-27 13:50:00 -0700345}
346
347/**
348 * Internal implementation, after conversion to codes. All case folding and normalization
349 * has been done by now, and all characters have been found in the alphabet.
350 * Note: len here is the padded length including 0 codes at start and end.
351 **/
Seigo Nonaka524d2942017-12-11 21:24:19 -0800352void Hyphenator::hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue,
353 HyphenationType* out) const {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800354 static_assert(sizeof(HyphenationType) == sizeof(uint8_t), "HyphnationType must be uint8_t.");
355 // Reuse the result array as a buffer for calculating intermediate hyphenation numbers.
Seigo Nonaka524d2942017-12-11 21:24:19 -0800356 uint8_t* buffer = reinterpret_cast<uint8_t*>(out);
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800357
Raph Levienf0be43d2015-08-27 13:50:00 -0700358 const Header* header = getHeader();
359 const Trie* trie = header->trieTable();
360 const Pattern* pattern = header->patternTable();
361 uint32_t char_mask = trie->char_mask;
362 uint32_t link_shift = trie->link_shift;
363 uint32_t link_mask = trie->link_mask;
364 uint32_t pattern_shift = trie->pattern_shift;
Seigo Nonaka5aa870f2017-09-01 11:16:44 -0700365 size_t maxOffset = len - mMinSuffix - 1;
Raph Levienf0be43d2015-08-27 13:50:00 -0700366 for (size_t i = 0; i < len - 1; i++) {
367 uint32_t node = 0; // index into Trie table
368 for (size_t j = i; j < len; j++) {
369 uint16_t c = codes[j];
370 uint32_t entry = trie->data[node + c];
371 if ((entry & char_mask) == c) {
372 node = (entry & link_mask) >> link_shift;
Raph Levien5cdad922015-03-30 14:20:18 -0700373 } else {
374 break;
375 }
Raph Levienf0be43d2015-08-27 13:50:00 -0700376 uint32_t pat_ix = trie->data[node] >> pattern_shift;
377 // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
378 // into the buf pool. This is the pattern for the substring (i..j) we just matched,
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800379 // which we combine (via point-wise max) into the buffer vector.
Raph Levienf0be43d2015-08-27 13:50:00 -0700380 if (pat_ix != 0) {
381 uint32_t pat_entry = pattern->data[pat_ix];
382 int pat_len = Pattern::len(pat_entry);
383 int pat_shift = Pattern::shift(pat_entry);
384 const uint8_t* pat_buf = pattern->buf(pat_entry);
385 int offset = j + 1 - (pat_len + pat_shift);
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800386 // offset is the index within buffer that lines up with the start of pat_buf
Seigo Nonaka5aa870f2017-09-01 11:16:44 -0700387 int start = std::max((int)mMinPrefix - offset, 0);
Raph Levienf0be43d2015-08-27 13:50:00 -0700388 int end = std::min(pat_len, (int)maxOffset - offset);
Raph Levien5cdad922015-03-30 14:20:18 -0700389 for (int k = start; k < end; k++) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800390 buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]);
Raph Levien5cdad922015-03-30 14:20:18 -0700391 }
Raph Levien5cdad922015-03-30 14:20:18 -0700392 }
393 }
394 }
395 // Since the above calculation does not modify values outside
Seigo Nonaka5aa870f2017-09-01 11:16:44 -0700396 // [mMinPrefix, len - mMinSuffix], they are left as 0 = DONT_BREAK.
397 for (size_t i = mMinPrefix; i < maxOffset; i++) {
Roozbeh Pournaderc7ef4002017-02-17 18:55:02 -0800398 // Hyphenation opportunities happen when the hyphenation numbers are odd.
Seigo Nonaka524d2942017-12-11 21:24:19 -0800399 out[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK;
Raph Levien5cdad922015-03-30 14:20:18 -0700400 }
401}
402
Calvin Pand70a8912022-07-27 14:52:12 +0800403bool Hyphenator::isRepeatHyphen(UScriptCode script,
404 HyphenationLocale locale) const {
405 return script == USCRIPT_LATIN &&
406 (locale == HyphenationLocale::POLISH ||locale == HyphenationLocale::SLOVENIAN);
407}
408
Seigo Nonaka14e2d132016-06-09 19:40:58 +0900409} // namespace minikin