claireho | 5569331 | 2010-04-26 13:43:16 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2008 Nokia Corporation and/or its subsidiary(-ies) |
| 3 | * |
| 4 | * This is part of HarfBuzz, an OpenType Layout engine library. |
| 5 | * |
| 6 | * Permission is hereby granted, without written agreement and without |
| 7 | * license or royalty fees, to use, copy, modify, and distribute this |
| 8 | * software and its documentation for any purpose, provided that the |
| 9 | * above copyright notice and the following two paragraphs appear in |
| 10 | * all copies of this software. |
| 11 | * |
| 12 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
| 13 | * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
| 14 | * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
| 15 | * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
| 16 | * DAMAGE. |
| 17 | * |
| 18 | * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
| 19 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| 20 | * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
| 21 | * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
| 22 | * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
| 23 | */ |
| 24 | |
| 25 | #include "harfbuzz-shaper.h" |
| 26 | #include "harfbuzz-shaper-private.h" |
| 27 | |
| 28 | #include <assert.h> |
| 29 | #include <stdio.h> |
| 30 | |
/*
// Vocabulary
//     Base ->         A consonant or an independent vowel in its full (not subscript) form. It is the
//                     center of the syllable; it can be surrounded by coeng (subscript) consonants, vowels,
//                     split vowels, signs... but there is only one base in a syllable, and it has to be
//                     coded as the first character of the syllable.
//     split vowel --> A vowel that has two parts placed separately (e.g. before and after the consonant).
//                     The Khmer language has five of them. Khmer split vowels either have one part before
//                     the base and one after the base, or they have a part before the base and a part
//                     above the base. The first part of all Khmer split vowels is the same character,
//                     identical to the glyph of the Khmer dependent vowel SRA EI.
//     coeng -->       Modifier used in Khmer to construct coeng (subscript) consonants.
//                     Unlike in Indian languages, the coeng modifies the consonant that follows it,
//                     not the one preceding it. Each consonant has two forms, the base form and the
//                     subscript form. The base form is the normal one (using the consonant's code point);
//                     the subscript form is displayed when the combination coeng + consonant is encountered.
//     Consonant of type 1 -> A consonant whose subscript form only occupies space under a base consonant.
//     Consonant of type 2 -> Its subscript form occupies space under and before the base (only one, RO).
//     Consonant of type 3 -> Its subscript form occupies space under and after the base
//                     (KHO, CHHO, THHO, BA, YO, SA).
//     Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different
//                     sounds depending on whether it is attached to a consonant of the first series or a
//                     consonant of the second series. Most consonants have an equivalent in the other
//                     series, but some of them exist only in one series (for example SA). If we want to
//                     use the consonant SA with a vowel sound that can only be produced by a vowel
//                     accompanying a consonant of the other series, then we need to use a consonant
//                     shifter: TRIISAP or MUSIKATOAN (x17C9 and x17CA). TRIISAP changes a first series
//                     consonant to a second series sound and MUSIKATOAN changes a second series consonant
//                     to have a first series vowel sound.
//                     Consonant shifters are both normally superscript marks, but when they are followed
//                     by a superscript, they change shape and take the form of the subscript dependent
//                     vowel SRA U. If they are in the same syllable as a coeng consonant, Unicode 3.0
//                     says that they should be typed before the coeng. Unicode 4.0 breaks the standard
//                     and says that they should be placed after the coeng consonant.
//     Dependent vowel -> In Khmer, dependent vowels can be placed above, below, before or after the base.
//                     Each vowel has its own position. Only one vowel per syllable is allowed.
//     Signs ->        Khmer has above signs and post signs. Only one above sign and/or one post sign is
//                     allowed in a syllable.
//
//
//   Order is important here! This order must be the same as the one found in each horizontal
//   line of the state table for Khmer (see khmerStateTable).
*/
/*
// Character class values. Each value is a column index into khmerStateTable,
// so the numbering below must stay in step with the column order of that table.
*/
enum KhmerCharClassValues {
    CC_RESERVED            = 0,
    /* consonant of type 1, or an independent vowel */
    CC_CONSONANT           = 1,
    /* consonant of type 2 */
    CC_CONSONANT2          = 2,
    /* consonant of type 3 */
    CC_CONSONANT3          = 3,
    /* zero width non joiner character (0x200C) */
    CC_ZERO_WIDTH_NJ_MARK  = 4,
    CC_CONSONANT_SHIFTER   = 5,
    /* Khmer special diacritic accent - treated differently in the state table */
    CC_ROBAT               = 6,
    /* subscript consonant combining character */
    CC_COENG               = 7,
    CC_DEPENDENT_VOWEL     = 8,
    CC_SIGN_ABOVE          = 9,
    CC_SIGN_AFTER          = 10,
    /* zero width joiner character */
    CC_ZERO_WIDTH_J_MARK   = 11,
    /* number of character classes (width of a state table row) */
    CC_COUNT               = 12
};
| 87 | |
| 88 | |
/*
// Flags that are ORed onto a KhmerCharClassValues value to form a complete
// character class. The low 16 bits hold the class value, bits 16-19 hold the
// position of the character relative to the base, and bits 24-29 hold
// per-character properties.
*/
enum KhmerCharClassFlags {
    /* extracts the KhmerCharClassValues part of a character class */
    CF_CLASS_MASK    = 0xFFFF,

    /* position flags */
    CF_POS_AFTER     = 1 << 16,
    CF_POS_ABOVE     = 1 << 17,
    CF_POS_BELOW     = 1 << 18,
    CF_POS_BEFORE    = 1 << 19,
    CF_POS_MASK      = 0xF << 16,

    /* property flags */
    CF_CONSONANT     = 1 << 24,  /* flag to speed up comparing */
    CF_SPLIT_VOWEL   = 1 << 25,  /* split vowel: its first part is added in front of the syllable */
    CF_DOTTED_CIRCLE = 1 << 26,  /* add a dotted circle if such a character is first in a syllable */
    CF_COENG         = 1 << 27,  /* flag to speed up comparing */
    CF_SHIFTER       = 1 << 28,  /* flag to speed up comparing */
    CF_ABOVE_VOWEL   = 1 << 29   /* flag to speed up comparing */
};
| 106 | |
| 107 | |
/* Code points that the shaper refers to by name (listed in code point order). */
enum KhmerChar {
    C_RO           = 0x179A,  /* the only consonant of type 2 */
    C_VOWEL_AA     = 0x17B6,
    C_VOWEL_E      = 0x17C1,  /* pre vowel; also written as the pre part of split vowels */
    C_SIGN_NIKAHIT = 0x17C6,
    C_COENG        = 0x17D2,
    C_SIGN_ZWNJ    = 0x200C,  /* zero width non joiner */
    C_SIGN_ZWJ     = 0x200D   /* zero width joiner */
};
| 118 | |
| 119 | |
| 120 | /* |
| 121 | // simple classes, they are used in the statetable (in this file) to control the length of a syllable |
| 122 | // they are also used to know where a character should be placed (location in reference to the base character) |
| 123 | // and also to know if a character, when independently displayed, should be displayed with a dotted-circle to |
| 124 | // indicate error in syllable construction |
| 125 | */ |
| 126 | enum { |
| 127 | _xx = CC_RESERVED, |
| 128 | _sa = CC_SIGN_ABOVE | CF_DOTTED_CIRCLE | CF_POS_ABOVE, |
| 129 | _sp = CC_SIGN_AFTER | CF_DOTTED_CIRCLE| CF_POS_AFTER, |
| 130 | _c1 = CC_CONSONANT | CF_CONSONANT, |
| 131 | _c2 = CC_CONSONANT2 | CF_CONSONANT, |
| 132 | _c3 = CC_CONSONANT3 | CF_CONSONANT, |
| 133 | _rb = CC_ROBAT | CF_POS_ABOVE | CF_DOTTED_CIRCLE, |
| 134 | _cs = CC_CONSONANT_SHIFTER | CF_DOTTED_CIRCLE | CF_SHIFTER, |
| 135 | _dl = CC_DEPENDENT_VOWEL | CF_POS_BEFORE | CF_DOTTED_CIRCLE, |
| 136 | _db = CC_DEPENDENT_VOWEL | CF_POS_BELOW | CF_DOTTED_CIRCLE, |
| 137 | _da = CC_DEPENDENT_VOWEL | CF_POS_ABOVE | CF_DOTTED_CIRCLE | CF_ABOVE_VOWEL, |
| 138 | _dr = CC_DEPENDENT_VOWEL | CF_POS_AFTER | CF_DOTTED_CIRCLE, |
| 139 | _co = CC_COENG | CF_COENG | CF_DOTTED_CIRCLE, |
| 140 | |
| 141 | /* split vowel */ |
| 142 | _va = _da | CF_SPLIT_VOWEL, |
| 143 | _vr = _dr | CF_SPLIT_VOWEL |
| 144 | }; |
| 145 | |
| 146 | |
/*
// A character class: one KhmerCharClassValues value in the low bits, ORed with
// any number of KhmerCharClassFlags.
*/
typedef unsigned long KhmerCharClass;
| 152 | |
| 153 | |
| 154 | /* |
| 155 | // Character class tables |
| 156 | // _xx character does not combine into syllable, such as numbers, puntuation marks, non-Khmer signs... |
| 157 | // _sa Sign placed above the base |
| 158 | // _sp Sign placed after the base |
| 159 | // _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants) |
| 160 | // _c2 Consonant of type 2 (only RO) |
| 161 | // _c3 Consonant of type 3 |
| 162 | // _rb Khmer sign robat u17CC. combining mark for subscript consonants |
| 163 | // _cd Consonant-shifter |
| 164 | // _dl Dependent vowel placed before the base (left of the base) |
| 165 | // _db Dependent vowel placed below the base |
| 166 | // _da Dependent vowel placed above the base |
| 167 | // _dr Dependent vowel placed behind the base (right of the base) |
| 168 | // _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following |
| 169 | // it to create a subscript consonant or independent vowel |
| 170 | // _va Khmer split vowel in which the first part is before the base and the second one above the base |
| 171 | // _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base |
| 172 | */ |
| 173 | static const KhmerCharClass khmerCharClasses[] = { |
| 174 | _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */ |
| 175 | _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */ |
| 176 | _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */ |
| 177 | _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */ |
| 178 | _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */ |
| 179 | _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */ |
| 180 | }; |
| 181 | |
/* First and last code point covered by khmerCharClasses; keep in step with that table. */
enum KhmerCharClassesRange {
    KhmerFirstChar = 0x1780,
    KhmerLastChar  = 0x17DF
};
| 187 | |
| 188 | /* |
| 189 | // Below we define how a character in the input string is either in the khmerCharClasses table |
| 190 | // (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear |
| 191 | // within the syllable, but are not in the table) we also get their type back, or an unknown object |
| 192 | // in which case we get _xx (CC_RESERVED) back |
| 193 | */ |
| 194 | static KhmerCharClass getKhmerCharClass(HB_UChar16 uc) |
| 195 | { |
| 196 | if (uc == C_SIGN_ZWJ) { |
| 197 | return CC_ZERO_WIDTH_J_MARK; |
| 198 | } |
| 199 | |
| 200 | if (uc == C_SIGN_ZWNJ) { |
| 201 | return CC_ZERO_WIDTH_NJ_MARK; |
| 202 | } |
| 203 | |
| 204 | if (uc < KhmerFirstChar || uc > KhmerLastChar) { |
| 205 | return CC_RESERVED; |
| 206 | } |
| 207 | |
| 208 | return khmerCharClasses[uc - KhmerFirstChar]; |
| 209 | } |
| 210 | |
| 211 | |
/*
// The stateTable is used to calculate the end (the length) of a well
// formed Khmer Syllable.
//
// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
// CharClassValues. This coincidence of values allows the follow up of the table.
//
// Each line corresponds to a state, which does not necessarily need to be a type
// of component... for example, state 2 is a base, which is always a first character
// in the syllable, but the state could be produced by a consonant of any type when
// it is the first character that is analysed (in ground state).
//
// Differentiating 3 types of consonants is necessary in order to
// forbid the use of certain combinations, such as having a second
// coeng after a coeng RO.
// The nonexistent possibility of having a type 3 after another type 3 is permitted;
// eliminating it would very much complicate the table, and it does not create typing
// problems, unlike the case above.
//
// The table is quite complex, in order to limit the number of coeng consonants
// to 2 (by means of the table).
//
// There is a peculiarity, as far as Unicode is concerned:
// - The consonant-shifter is considered in two possible different
//   locations, the one considered in Unicode 3.0 and the one considered in
//   Unicode 4.0. (there is a backwards compatibility problem in this standard).
//
//
// xx    independent character, such as a number, punctuation sign or non-khmer char
//
// c1    Khmer consonant of type 1 or an independent vowel
//       that is, a letter in which the subscript form is only under the
//       base, not taking any space to the right or to the left
//
// c2    Khmer consonant of type 2, the coeng form takes space under
//       and to the left of the base (only RO is of this type)
//
// c3    Khmer consonant of type 3. Its subscript form takes space under
//       and to the right of the base.
//
// cs    Khmer consonant shifter
//
// rb    Khmer robat
//
// co    coeng character (u17D2)
//
// dv    dependent vowel (including split vowels, they are treated in the same way).
//       even if dv is not defined above, the component that is really tested for is
//       KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
//
// zwj   Zero Width joiner
//
// zwnj  Zero width non joiner
//
// sa    above sign
//
// sp    post sign
//
// there are lines with equal content but for an easier understanding
// (and maybe change in the future) we did not join them
*/
| 273 | static const signed char khmerStateTable[][CC_COUNT] = |
| 274 | { |
| 275 | /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */ |
| 276 | { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */ |
| 277 | {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */ |
| 278 | {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */ |
| 279 | {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter It can only be followed by a shifter or a vowel */ |
| 280 | {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */ |
| 281 | {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */ |
| 282 | {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */ |
| 283 | {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */ |
| 284 | {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */ |
| 285 | {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant or type 3 after ceong */ |
| 286 | {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */ |
| 287 | {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */ |
| 288 | {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */ |
| 289 | {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */ |
| 290 | {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */ |
| 291 | {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */ |
| 292 | {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */ |
| 293 | {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */ |
| 294 | {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */ |
| 295 | {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */ |
| 296 | {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */ |
| 297 | }; |
| 298 | |
| 299 | |
/*
// Debug tracing. KHDEBUG is used like printf: KHDEBUG("fmt", args...).
//
// With KHMER_DEBUG undefined the call is compiled (so the arguments are still
// type-checked where the compiler does that) but never executed. The disabled
// form is written as "if (1) {} else printf" rather than "if (0) printf" so
// that the macro carries its own else branch: in
//     if (cond) KHDEBUG("..."); else other();
// the caller's else still belongs to the caller's if.
//
// NOTE(review): qDebug is not declared anywhere in the visible part of this
// file - confirm that a declaration is available before enabling KHMER_DEBUG.
*/
/* #define KHMER_DEBUG */
#ifdef KHMER_DEBUG
#define KHDEBUG qDebug
#else
#define KHDEBUG if (1) {} else printf
#endif
| 306 | |
| 307 | /* |
| 308 | // Given an input string of characters and a location in which to start looking |
| 309 | // calculate, using the state table, which one is the last character of the syllable |
| 310 | // that starts in the starting position. |
| 311 | */ |
| 312 | static int khmer_nextSyllableBoundary(const HB_UChar16 *s, int start, int end, HB_Bool *invalid) |
| 313 | { |
| 314 | const HB_UChar16 *uc = s + start; |
| 315 | int state = 0; |
| 316 | int pos = start; |
| 317 | *invalid = FALSE; |
| 318 | |
| 319 | while (pos < end) { |
| 320 | KhmerCharClass charClass = getKhmerCharClass(*uc); |
| 321 | if (pos == start) { |
| 322 | *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT); |
| 323 | } |
| 324 | state = khmerStateTable[state][charClass & CF_CLASS_MASK]; |
| 325 | |
| 326 | KHDEBUG("state[%d]=%d class=%8lx (uc=%4x)", pos - start, state, |
| 327 | charClass, *uc ); |
| 328 | |
| 329 | if (state < 0) { |
| 330 | break; |
| 331 | } |
| 332 | ++uc; |
| 333 | ++pos; |
| 334 | } |
| 335 | return pos; |
| 336 | } |
| 337 | |
| 338 | #ifndef NO_OPENTYPE |
| 339 | static const HB_OpenTypeFeature khmer_features[] = { |
| 340 | { HB_MAKE_TAG( 'p', 'r', 'e', 'f' ), PreFormProperty }, |
| 341 | { HB_MAKE_TAG( 'b', 'l', 'w', 'f' ), BelowFormProperty }, |
| 342 | { HB_MAKE_TAG( 'a', 'b', 'v', 'f' ), AboveFormProperty }, |
| 343 | { HB_MAKE_TAG( 'p', 's', 't', 'f' ), PostFormProperty }, |
| 344 | { HB_MAKE_TAG( 'p', 'r', 'e', 's' ), PreSubstProperty }, |
| 345 | { HB_MAKE_TAG( 'b', 'l', 'w', 's' ), BelowSubstProperty }, |
| 346 | { HB_MAKE_TAG( 'a', 'b', 'v', 's' ), AboveSubstProperty }, |
| 347 | { HB_MAKE_TAG( 'p', 's', 't', 's' ), PostSubstProperty }, |
| 348 | { HB_MAKE_TAG( 'c', 'l', 'i', 'g' ), CligProperty }, |
| 349 | { 0, 0 } |
| 350 | }; |
| 351 | #endif |
| 352 | |
| 353 | |
| 354 | static HB_Bool khmer_shape_syllable(HB_Bool openType, HB_ShaperItem *item) |
| 355 | { |
| 356 | /* KHDEBUG("syllable from %d len %d, str='%s'", item->from, item->length, |
| 357 | item->string->mid(item->from, item->length).toUtf8().data()); */ |
| 358 | |
| 359 | int len = 0; |
| 360 | int syllableEnd = item->item.pos + item->item.length; |
| 361 | unsigned short reordered[16]; |
| 362 | unsigned char properties[16]; |
| 363 | enum { |
| 364 | AboveForm = 0x01, |
| 365 | PreForm = 0x02, |
| 366 | PostForm = 0x04, |
| 367 | BelowForm = 0x08 |
| 368 | }; |
| 369 | #ifndef NO_OPENTYPE |
| 370 | const int availableGlyphs = item->num_glyphs; |
| 371 | #endif |
| 372 | int coengRo; |
| 373 | int i; |
| 374 | |
| 375 | /* according to the specs this is the max length one can get |
| 376 | ### the real value should be smaller */ |
| 377 | assert(item->item.length < 13); |
| 378 | |
| 379 | memset(properties, 0, 16*sizeof(unsigned char)); |
| 380 | |
| 381 | #ifdef KHMER_DEBUG |
| 382 | qDebug("original:"); |
| 383 | for (int i = from; i < syllableEnd; i++) { |
| 384 | qDebug(" %d: %4x", i, string[i]); |
| 385 | } |
| 386 | #endif |
| 387 | |
| 388 | /* |
| 389 | // write a pre vowel or the pre part of a split vowel first |
| 390 | // and look out for coeng + ro. RO is the only vowel of type 2, and |
| 391 | // therefore the only one that requires saving space before the base. |
| 392 | */ |
| 393 | coengRo = -1; /* There is no Coeng Ro, if found this value will change */ |
| 394 | for (i = item->item.pos; i < syllableEnd; i += 1) { |
| 395 | KhmerCharClass charClass = getKhmerCharClass(item->string[i]); |
| 396 | |
| 397 | /* if a split vowel, write the pre part. In Khmer the pre part |
| 398 | is the same for all split vowels, same glyph as pre vowel C_VOWEL_E */ |
| 399 | if (charClass & CF_SPLIT_VOWEL) { |
| 400 | reordered[len] = C_VOWEL_E; |
| 401 | properties[len] = PreForm; |
| 402 | ++len; |
| 403 | break; /* there can be only one vowel */ |
| 404 | } |
| 405 | /* if a vowel with pos before write it out */ |
| 406 | if (charClass & CF_POS_BEFORE) { |
| 407 | reordered[len] = item->string[i]; |
| 408 | properties[len] = PreForm; |
| 409 | ++len; |
| 410 | break; /* there can be only one vowel */ |
| 411 | } |
| 412 | /* look for coeng + ro and remember position |
| 413 | works because coeng + ro is always in front of a vowel (if there is a vowel) |
| 414 | and because CC_CONSONANT2 is enough to identify it, as it is the only consonant |
| 415 | with this flag */ |
| 416 | if ( (charClass & CF_COENG) && (i + 1 < syllableEnd) && |
| 417 | ( (getKhmerCharClass(item->string[i+1]) & CF_CLASS_MASK) == CC_CONSONANT2) ) { |
| 418 | coengRo = i; |
| 419 | } |
| 420 | } |
| 421 | |
| 422 | /* write coeng + ro if found */ |
| 423 | if (coengRo > -1) { |
| 424 | reordered[len] = C_COENG; |
| 425 | properties[len] = PreForm; |
| 426 | ++len; |
| 427 | reordered[len] = C_RO; |
| 428 | properties[len] = PreForm; |
| 429 | ++len; |
| 430 | } |
| 431 | |
| 432 | /* |
| 433 | shall we add a dotted circle? |
| 434 | If in the position in which the base should be (first char in the string) there is |
| 435 | a character that has the Dotted circle flag (a character that cannot be a base) |
| 436 | then write a dotted circle */ |
| 437 | if (getKhmerCharClass(item->string[item->item.pos]) & CF_DOTTED_CIRCLE) { |
| 438 | reordered[len] = C_DOTTED_CIRCLE; |
| 439 | ++len; |
| 440 | } |
| 441 | |
| 442 | /* copy what is left to the output, skipping before vowels and |
| 443 | coeng Ro if they are present */ |
| 444 | for (i = item->item.pos; i < syllableEnd; i += 1) { |
| 445 | HB_UChar16 uc = item->string[i]; |
| 446 | KhmerCharClass charClass = getKhmerCharClass(uc); |
| 447 | |
| 448 | /* skip a before vowel, it was already processed */ |
| 449 | if (charClass & CF_POS_BEFORE) { |
| 450 | continue; |
| 451 | } |
| 452 | |
| 453 | /* skip coeng + ro, it was already processed */ |
| 454 | if (i == coengRo) { |
| 455 | i += 1; |
| 456 | continue; |
| 457 | } |
| 458 | |
| 459 | switch (charClass & CF_POS_MASK) |
| 460 | { |
| 461 | case CF_POS_ABOVE : |
| 462 | reordered[len] = uc; |
| 463 | properties[len] = AboveForm; |
| 464 | ++len; |
| 465 | break; |
| 466 | |
| 467 | case CF_POS_AFTER : |
| 468 | reordered[len] = uc; |
| 469 | properties[len] = PostForm; |
| 470 | ++len; |
| 471 | break; |
| 472 | |
| 473 | case CF_POS_BELOW : |
| 474 | reordered[len] = uc; |
| 475 | properties[len] = BelowForm; |
| 476 | ++len; |
| 477 | break; |
| 478 | |
| 479 | default: |
| 480 | /* assign the correct flags to a coeng consonant |
| 481 | Consonants of type 3 are taged as Post forms and those type 1 as below forms */ |
| 482 | if ( (charClass & CF_COENG) && i + 1 < syllableEnd ) { |
| 483 | unsigned char property = (getKhmerCharClass(item->string[i+1]) & CF_CLASS_MASK) == CC_CONSONANT3 ? |
| 484 | PostForm : BelowForm; |
| 485 | reordered[len] = uc; |
| 486 | properties[len] = property; |
| 487 | ++len; |
| 488 | i += 1; |
| 489 | reordered[len] = item->string[i]; |
| 490 | properties[len] = property; |
| 491 | ++len; |
| 492 | break; |
| 493 | } |
| 494 | |
| 495 | /* if a shifter is followed by an above vowel change the shifter to below form, |
| 496 | an above vowel can have two possible positions i + 1 or i + 3 |
| 497 | (position i+1 corresponds to unicode 3, position i+3 to Unicode 4) |
| 498 | and there is an extra rule for C_VOWEL_AA + C_SIGN_NIKAHIT also for two |
| 499 | different positions, right after the shifter or after a vowel (Unicode 4) */ |
| 500 | if ( (charClass & CF_SHIFTER) && (i + 1 < syllableEnd) ) { |
| 501 | if (getKhmerCharClass(item->string[i+1]) & CF_ABOVE_VOWEL ) { |
| 502 | reordered[len] = uc; |
| 503 | properties[len] = BelowForm; |
| 504 | ++len; |
| 505 | break; |
| 506 | } |
| 507 | if (i + 2 < syllableEnd && |
| 508 | (item->string[i+1] == C_VOWEL_AA) && |
| 509 | (item->string[i+2] == C_SIGN_NIKAHIT) ) |
| 510 | { |
| 511 | reordered[len] = uc; |
| 512 | properties[len] = BelowForm; |
| 513 | ++len; |
| 514 | break; |
| 515 | } |
| 516 | if (i + 3 < syllableEnd && (getKhmerCharClass(item->string[i+3]) & CF_ABOVE_VOWEL) ) { |
| 517 | reordered[len] = uc; |
| 518 | properties[len] = BelowForm; |
| 519 | ++len; |
| 520 | break; |
| 521 | } |
| 522 | if (i + 4 < syllableEnd && |
| 523 | (item->string[i+3] == C_VOWEL_AA) && |
| 524 | (item->string[i+4] == C_SIGN_NIKAHIT) ) |
| 525 | { |
| 526 | reordered[len] = uc; |
| 527 | properties[len] = BelowForm; |
| 528 | ++len; |
| 529 | break; |
| 530 | } |
| 531 | } |
| 532 | |
| 533 | /* default - any other characters */ |
| 534 | reordered[len] = uc; |
| 535 | ++len; |
| 536 | break; |
| 537 | } /* switch */ |
| 538 | } /* for */ |
| 539 | |
| 540 | if (!item->font->klass->convertStringToGlyphIndices(item->font, |
| 541 | reordered, len, |
| 542 | item->glyphs, &item->num_glyphs, |
| 543 | item->item.bidiLevel % 2)) |
| 544 | return FALSE; |
| 545 | |
| 546 | |
| 547 | KHDEBUG("after shaping: len=%d", len); |
| 548 | for (i = 0; i < len; i++) { |
| 549 | item->attributes[i].mark = FALSE; |
| 550 | item->attributes[i].clusterStart = FALSE; |
| 551 | item->attributes[i].justification = 0; |
| 552 | item->attributes[i].zeroWidth = FALSE; |
| 553 | KHDEBUG(" %d: %4x property=%x", i, reordered[i], properties[i]); |
| 554 | } |
| 555 | |
| 556 | /* now we have the syllable in the right order, and can start running it through open type. */ |
| 557 | |
| 558 | #ifndef NO_OPENTYPE |
| 559 | if (openType) { |
| 560 | hb_uint32 where[16]; |
| 561 | for (i = 0; i < len; ++i) { |
| 562 | where[i] = ~(PreSubstProperty |
| 563 | | BelowSubstProperty |
| 564 | | AboveSubstProperty |
| 565 | | PostSubstProperty |
| 566 | | CligProperty |
| 567 | | PositioningProperties); |
| 568 | if (properties[i] == PreForm) |
| 569 | where[i] &= ~PreFormProperty; |
| 570 | else if (properties[i] == BelowForm) |
| 571 | where[i] &= ~BelowFormProperty; |
| 572 | else if (properties[i] == AboveForm) |
| 573 | where[i] &= ~AboveFormProperty; |
| 574 | else if (properties[i] == PostForm) |
| 575 | where[i] &= ~PostFormProperty; |
| 576 | } |
| 577 | |
| 578 | HB_OpenTypeShape(item, where); |
| 579 | if (!HB_OpenTypePosition(item, availableGlyphs, /*doLogClusters*/FALSE)) |
| 580 | return FALSE; |
| 581 | } else |
| 582 | #endif |
| 583 | { |
| 584 | KHDEBUG("Not using openType"); |
| 585 | HB_HeuristicPosition(item); |
| 586 | } |
| 587 | |
| 588 | item->attributes[0].clusterStart = TRUE; |
| 589 | return TRUE; |
| 590 | } |
| 591 | |
| 592 | HB_Bool HB_KhmerShape(HB_ShaperItem *item) |
| 593 | { |
| 594 | HB_Bool openType = FALSE; |
| 595 | unsigned short *logClusters = item->log_clusters; |
| 596 | int i; |
| 597 | |
| 598 | HB_ShaperItem syllable = *item; |
| 599 | int first_glyph = 0; |
| 600 | |
| 601 | int sstart = item->item.pos; |
| 602 | int end = sstart + item->item.length; |
| 603 | |
| 604 | assert(item->item.script == HB_Script_Khmer); |
| 605 | |
| 606 | #ifndef NO_OPENTYPE |
| 607 | openType = HB_SelectScript(item, khmer_features); |
| 608 | #endif |
| 609 | |
| 610 | KHDEBUG("khmer_shape: from %d length %d", item->item.pos, item->item.length); |
| 611 | while (sstart < end) { |
| 612 | HB_Bool invalid; |
| 613 | int send = khmer_nextSyllableBoundary(item->string, sstart, end, &invalid); |
| 614 | KHDEBUG("syllable from %d, length %d, invalid=%s", sstart, send-sstart, |
| 615 | invalid ? "TRUE" : "FALSE"); |
| 616 | syllable.item.pos = sstart; |
| 617 | syllable.item.length = send-sstart; |
| 618 | syllable.glyphs = item->glyphs + first_glyph; |
| 619 | syllable.attributes = item->attributes + first_glyph; |
| 620 | syllable.offsets = item->offsets + first_glyph; |
| 621 | syllable.advances = item->advances + first_glyph; |
| 622 | syllable.num_glyphs = item->num_glyphs - first_glyph; |
| 623 | if (!khmer_shape_syllable(openType, &syllable)) { |
| 624 | KHDEBUG("syllable shaping failed, syllable requests %d glyphs", syllable.num_glyphs); |
| 625 | item->num_glyphs += syllable.num_glyphs; |
| 626 | return FALSE; |
| 627 | } |
| 628 | /* fix logcluster array */ |
| 629 | KHDEBUG("syllable:"); |
| 630 | for (i = first_glyph; i < first_glyph + (int)syllable.num_glyphs; ++i) |
| 631 | KHDEBUG(" %d -> glyph %x", i, item->glyphs[i]); |
| 632 | KHDEBUG(" logclusters:"); |
| 633 | for (i = sstart; i < send; ++i) { |
| 634 | KHDEBUG(" %d -> glyph %d", i, first_glyph); |
| 635 | logClusters[i-item->item.pos] = first_glyph; |
| 636 | } |
| 637 | sstart = send; |
| 638 | first_glyph += syllable.num_glyphs; |
| 639 | } |
| 640 | item->num_glyphs = first_glyph; |
| 641 | return TRUE; |
| 642 | } |
| 643 | |
| 644 | void HB_KhmerAttributes(HB_Script script, const HB_UChar16 *text, hb_uint32 from, hb_uint32 len, HB_CharAttributes *attributes) |
| 645 | { |
| 646 | int end = from + len; |
| 647 | const HB_UChar16 *uc = text + from; |
| 648 | hb_uint32 i = 0; |
| 649 | HB_UNUSED(script); |
| 650 | attributes += from; |
| 651 | while ( i < len ) { |
| 652 | HB_Bool invalid; |
| 653 | hb_uint32 boundary = khmer_nextSyllableBoundary( text, from+i, end, &invalid ) - from; |
| 654 | |
| 655 | attributes[i].charStop = TRUE; |
| 656 | |
| 657 | if ( boundary > len-1 ) boundary = len; |
| 658 | i++; |
| 659 | while ( i < boundary ) { |
| 660 | attributes[i].charStop = FALSE; |
| 661 | ++uc; |
| 662 | ++i; |
| 663 | } |
| 664 | assert( i == boundary ); |
| 665 | } |
| 666 | } |
| 667 | |