| package org.unicode.cldr.util; |
| |
| import com.google.common.base.CharMatcher; |
| import com.google.common.base.Splitter; |
| import com.google.common.collect.ImmutableMap; |
| import com.google.common.collect.ImmutableSet; |
| import com.ibm.icu.dev.util.UnicodeMap; |
| import com.ibm.icu.impl.Utility; |
| import com.ibm.icu.lang.CharSequences; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.util.ICUException; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.LinkedHashMap; |
| import java.util.LinkedHashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.Set; |
| import org.unicode.cldr.draft.FileUtilities; |
| import org.unicode.cldr.util.PathHeader.PageId; |
| |
| public class Emoji { |
| /** U+FE0F, the emoji variation selector; it is removed from a sequence to get its minimal form. */ |
| public static final String EMOJI_VARIANT = "\uFE0F"; |
| /** U+20E3; a sequence that contains it is skipped as a constructed value. */ |
| public static final String COMBINING_ENCLOSING_KEYCAP = "\u20E3"; |
| /** U+200D, the zero width joiner; only selected sequences that contain it are kept. */ |
| public static final String ZWJ = "\u200D"; |
| /** Code points U+1F1E6..U+1F1FF; a sequence that contains any of them is skipped as constructed. */ |
| public static final UnicodeSet REGIONAL_INDICATORS = new UnicodeSet(0x1F1E6, 0x1F1FF).freeze(); |
| /** |
| * The modifier characters 🏻 through 🏿. A multi-code-point sequence that contains one is |
| * skipped as constructed; the modifiers themselves are added to the non-constructed set. |
| */ |
| public static final UnicodeSet MODIFIERS = new UnicodeSet("[🏻-🏿]").freeze(); |
| /** Code points U+E0000..U+E007F; a sequence that contains any of them is skipped as constructed. */ |
| public static final UnicodeSet TAGS = new UnicodeSet(0xE0000, 0xE007F).freeze(); |
| /** |
| * Building blocks of family-style sequences. A multi-code-point sequence that consists only of |
| * these characters is skipped as constructed. |
| */ |
| public static final UnicodeSet FAMILY = new UnicodeSet("[\u200D 👦-👩 💋 ❤]").freeze(); |
| /** |
| * U+2640 and U+2642. A ZWJ sequence that contains either of them is kept as non-constructed, |
| * unless one of the "constructed" rules has already excluded it. |
| */ |
| public static final UnicodeSet GENDER = new UnicodeSet().add(0x2640).add(0x2642).freeze(); |
| public static final UnicodeSet SPECIALS = |
| new UnicodeSet( |
| "[" |
| + "{🐈⬛}{🐻❄}{👨🍼}{👩🍼}{🧑🍼}{🧑🎄}{🧑🤝🧑}{🏳🌈} {👁🗨} {🏴☠} {🐕🦺} {👨🦯} {👨🦼} {👨🦽} {👩🦯} {👩🦼} {👩🦽}" |
| + "{🏳⚧}{🧑⚕}{🧑⚖}{🧑✈}{🧑🌾}{🧑🍳}{🧑🎓}{🧑🎤}{🧑🎨}{🧑🏫}{🧑🏭}{🧑💻}{🧑💼}{🧑🔧}{🧑🔬}{🧑🚀}{🧑🚒}{🧑🦯}{🧑🦼}{🧑🦽}" |
| + "{❤🔥}, {❤🩹}, {😮💨}, {😵💫}" // #E13.1 |
| + "]") |
| .freeze(); |
| // New sequences may have to be added to SPECIALS above if testAnnotationPaths fails. The |
| // failure will look like: |
| // got java.util.TreeSet<[//ldml/annotations/annotation[@cp="🏳⚧"][@type="tts"], |
| // //ldml/annotations/annotation[@cp="🧑⚕"][@type="tts"], ... |
| // To fix it, take each value found inside @cp="..." and wrap it in {...} when adding it to SPECIALS. |
| // Example: //ldml/annotations/annotation[@cp="🧑⚕"] ==> {🧑⚕} |
| /** |
| * Accepted first code points of the ZWJ sequences that are kept as non-constructed because they |
| * start with one of these characters and end with a character from {@link #OBJECT}. |
| */ |
| public static final UnicodeSet MAN_WOMAN = new UnicodeSet("[👨 👩]").freeze(); |
| /** |
| * Accepted last code points of the ZWJ sequences that are kept as non-constructed because they |
| * start with a character from {@link #MAN_WOMAN} and end with one of these characters. |
| */ |
| public static final UnicodeSet OBJECT = |
| new UnicodeSet("[👩 🎓 🌾 🍳 🏫 🏭 🎨 🚒 ✈ 🚀 🎤 💻 🔬 💼 🔧 ⚖ ⚕]").freeze(); |
| |
| /** Maps each sequence listed in emoji-test.txt to the "group:" header it appears under. */ |
| static final UnicodeMap<String> emojiToMajorCategory = new UnicodeMap<>(); |
| /** Maps each sequence listed in emoji-test.txt to the "subgroup:" header it appears under. */ |
| static final UnicodeMap<String> emojiToMinorCategory = new UnicodeMap<>(); |
| /** Maps each sequence listed in emoji-test.txt to the name given in the comment of its line. */ |
| static final UnicodeMap<String> toName = new UnicodeMap<>(); |
| |
| static { |
| // NOTE(review): presumably this makes a second put for an already-mapped key an error, so |
| // that a sequence listed twice is detected — confirm against UnicodeMap.setErrorOnReset. |
| emojiToMajorCategory.setErrorOnReset(true); |
| emojiToMinorCategory.setErrorOnReset(true); |
| toName.setErrorOnReset(true); |
| } |
| /** |
| * A mapping from a majorCategory to a unique ordering number, based on the first time it is |
| * encountered. |
| */ |
| static final Map<String, Long> majorToOrder = new HashMap<>(); |
| /** |
| * A mapping from a minorCategory to a unique ordering number, based on the first time it is |
| * encountered. |
| */ |
| static final Map<String, Long> minorToOrder = new HashMap<>(); |
| |
| /** |
| * A mapping from a sequence (and from its form without variation selectors) to an ordering |
| * number. Numbers are handed out in steps of 100 in the order the sequences are read. |
| */ |
| static final Map<String, Long> emojiToOrder = new LinkedHashMap<>(); |
| /** The sequences that are not treated as constructed, plus the modifier characters. */ |
| static final UnicodeSet nonConstructed = new UnicodeSet(); |
| /** All sequences whose status in emoji-test.txt starts with "fully-qualified". */ |
| static final UnicodeSet allRgi = new UnicodeSet(); |
| /** The same sequences as {@code allRgi}, with the emoji variation selector removed. */ |
| static final UnicodeSet allRgiNoES = new UnicodeSet(); |
| static { |
| /* |
| * Reads emoji-test.txt and fills the category, name, order and RGI tables of this class. |
| * |
| * Example from emoji-test.txt: |
| * # group: Smileys & Emotion |
| * # subgroup: face-smiling |
| * 1F600 ; fully-qualified # 😀 grinning face |
| */ |
| // A data line is "code points ; status # comment". Stop splitting after the third field so |
| // that a ';' or '#' inside the comment (as in the name "keycap: #") stays part of the comment |
| // instead of cutting it short and producing an empty name. |
| Splitter semi = Splitter.on(CharMatcher.anyOf(";#")).trimResults().limit(3); |
| String majorCategory = null; |
| String minorCategory = null; |
| for (String line : FileUtilities.in(Emoji.class, "data/emoji/emoji-test.txt")) { |
| if (line.startsWith("#")) { |
| // Comment line: only the "group:" and "subgroup:" headers carry information. |
| line = line.substring(1).trim(); |
| if (line.startsWith("group:")) { |
| majorCategory = line.substring("group:".length()).trim(); |
| // Groups are numbered in the order in which they first appear. |
| majorToOrder.computeIfAbsent(majorCategory, k -> (long) majorToOrder.size()); |
| } else if (line.startsWith("subgroup:")) { |
| minorCategory = line.substring("subgroup:".length()).trim(); |
| // Subgroups are numbered in the order in which they first appear. |
| minorToOrder.computeIfAbsent(minorCategory, k -> (long) minorToOrder.size()); |
| } |
| continue; |
| } |
| line = line.trim(); |
| if (line.isEmpty()) { |
| continue; |
| } |
| Iterator<String> it = semi.split(line).iterator(); |
| |
| // Field 1: the code points of the sequence, written in hex. |
| String emojiHex = it.next(); |
| String original = Utility.fromHex(emojiHex, 4, " "); |
| // Field 2: the qualification status of the sequence. |
| String type = it.next(); |
| if (type.startsWith("fully-qualified")) { |
| allRgi.add(original); |
| allRgiNoES.add(original.replace(Emoji.EMOJI_VARIANT, "")); |
| } |
| emojiToMajorCategory.put(original, majorCategory); |
| emojiToMinorCategory.put(original, minorCategory); |
| // Field 3: the comment, of the form "😁 beaming face with smiling eyes" (the '#' that |
| // introduces it was consumed by the splitter). |
| String comment = it.next(); |
| // The name is everything after the first space. The format changed in v15.1, so there is |
| // no version number between the emoji and its name, and no second space to skip. |
| int spacePos = comment.indexOf(' '); |
| String name = comment.substring(spacePos + 1).trim(); |
| toName.put(original, name); |
| |
| // add all the non-constructed values to a set for annotations |
| |
| // The minimal form is the sequence without emoji variation selectors. |
| String minimal = original.replace(EMOJI_VARIANT, ""); |
| |
| // Add the order. If it is not minimal, add that also. |
| if (!emojiToOrder.containsKey(original)) { |
| putUnique(emojiToOrder, original, emojiToOrder.size() * 100L); |
| } |
| if (!emojiToOrder.containsKey(minimal)) { |
| putUnique(emojiToOrder, minimal, emojiToOrder.size() * 100L); |
| } |
| |
| boolean singleton = CharSequences.getSingleCodePoint(minimal) != Integer.MAX_VALUE; |
| |
| // skip constructed values |
| // (note that && binds tighter than ||, so each "!singleton && ..." pair is one alternative) |
| if (minimal.contains(COMBINING_ENCLOSING_KEYCAP) |
| || REGIONAL_INDICATORS.containsSome(minimal) |
| || TAGS.containsSome(minimal) |
| || !singleton && MODIFIERS.containsSome(minimal) |
| || !singleton && FAMILY.containsAll(minimal)) { |
| // do nothing |
| } else if (minimal.contains(ZWJ)) { // only do certain ZWJ sequences |
| // Kept: the listed specials, anything with a gender sign, and sequences that start |
| // with a MAN_WOMAN character and end with an OBJECT character. |
| if (SPECIALS.contains(minimal) |
| || GENDER.containsSome(minimal) |
| || MAN_WOMAN.contains(minimal.codePointAt(0)) |
| && OBJECT.contains(minimal.codePointBefore(minimal.length()))) { |
| nonConstructed.add(minimal); |
| } |
| } else if (!minimal.contains("🔟")) { |
| nonConstructed.add(minimal); |
| } |
| } |
| // Everything is read; make the tables immutable. |
| emojiToMajorCategory.freeze(); |
| emojiToMinorCategory.freeze(); |
| nonConstructed.add(MODIFIERS); // needed for names |
| nonConstructed.freeze(); |
| toName.freeze(); |
| allRgi.freeze(); |
| allRgiNoES.freeze(); |
| } |
| |
| private static <K, V> void putUnique(Map<K, V> map, K key, V value) { |
| V oldValue = map.put(key, value); |
| if (oldValue != null) { |
| throw new ICUException( |
| "Attempt to change value of " |
| + map |
| + " for " |
| + key |
| + " from " |
| + oldValue |
| + " to " |
| + value); |
| } |
| } |
| |
| /** |
| * Returns the frozen set of all sequences whose status in emoji-test.txt starts with |
| * "fully-qualified". |
| */ |
| public static UnicodeSet getAllRgi() { |
| return allRgi; |
| } |
| |
| /** |
| * Returns the frozen set of all sequences whose status in emoji-test.txt starts with |
| * "fully-qualified", each with the emoji variation selector (U+FE0F) removed. |
| */ |
| public static UnicodeSet getAllRgiNoES() { |
| return allRgiNoES; |
| } |
| |
| /** Maps each extra (non-emoji-test.txt) symbol to the minor category it is filed under. */ |
| public static final UnicodeMap<String> EXTRA_SYMBOL_MINOR_CATEGORIES = new UnicodeMap<>(); |
| /** Maps each extra symbol to its ordering number; assigned once in the static initializer. */ |
| public static final Map<String, Long> EXTRA_SYMBOL_ORDER; |
| /** When true, the static initializer prints its intermediate tables to System.out. */ |
| private static final boolean DEBUG = false; |
| |
| // Files the extra symbols under existing minor categories and gives each one an ordering |
| // number that follows the emoji already in that category. |
| static { |
| // Each row is {minor category, characters}. Every code point other than a space becomes one |
| // entry; the spaces only separate the characters. |
| String[][] data = { |
| {"arrow", "→ ↓ ↑ ← ↔ ↕ ⇆ ⇅"}, |
| {"alphanum", "© ® ℗ ™ µ"}, |
| {"geometric", "▼ ▶ ▲ ◀ ● ○ ◯ ◊"}, |
| {"math", "× ÷ √ ∞ ∆ ∇ ⁻ ¹ ² ³ ≡ ∈ ⊂ ∩ ∪ ° + ± − = ≈ ≠ > < ≤ ≥ ¬ | ~"}, |
| // NOTE(review): the fourth item of the next row is an escaped backslash followed by the |
| // text "u0020", not a Unicode escape, so the backslash and the characters 'u', '0' and '2' |
| // each become punctuation entries — confirm that this is intended. |
| { |
| "punctuation", |
| "§ † ‡ \\u0020 , 、 ، ; : ؛ ! ¡ ? ¿ ؟ ¶ ※ / \\ & # % ‰ ′ ″ ‴ @ * ♪ ♭ ♯ ` ´ ^ ¨ ‐ ― _ - – — • · . … 。 ‧ ・ ‘ ’ ‚ ' “ ” „ » « ( ) [ ] { } 〔 〕 〈 〉 《 》 「 」 『 』 〖 〗 【 】" |
| }, |
| {"currency", "€ £ ¥ ₹ ₽ $ ¢ ฿ ₪ ₺ ₫ ₱ ₩ ₡ ₦ ₮ ৳ ₴ ₸ ₲ ₵ ៛ ₭ ֏ ₥ ₾ ₼ ₿ ؋ ₧ ¤"}, |
| { |
| "other-symbol", |
| "‾‽‸⁂↚↛↮↙↜↝↞↟↠↡↢↣↤↥↦↧↨↫↬↭↯↰↱↲↳↴↵↶↷↸↹↺↻↼↽↾↿⇀⇁⇂⇃⇄⇇⇈⇉⇊⇋⇌⇐⇍⇑⇒⇏⇓⇔⇎⇖⇗⇘⇙⇚⇛⇜⇝⇞⇟⇠⇡⇢⇣⇤⇥⇦⇧⇨⇩⇪⇵∀∂∃∅∉∋∎∏∑≮≯∓∕⁄∗∘∙∝∟∠∣∥∧∫∬∮∴∵∶∷∼∽∾≃≅≌≒≖≣≦≧≪≫≬≳≺≻⊁⊃⊆⊇⊕⊖⊗⊘⊙⊚⊛⊞⊟⊥⊮⊰⊱⋭⊶⊹⊿⋁⋂⋃⋅⋆⋈⋒⋘⋙⋮⋯⋰⋱■□▢▣▤▥▦▧▨▩▬▭▮▰△▴▵▷▸▹►▻▽▾▿◁◂◃◄◅◆◇◈◉◌◍◎◐◑◒◓◔◕◖◗◘◙◜◝◞◟◠◡◢◣◤◥◦◳◷◻◽◿⨧⨯⨼⩣⩽⪍⪚⪺₢₣₤₰₳₶₷₨﷼" |
| }, |
| }; |
| // get the maximum suborder for each subcategory |
| // (the highest order number of any emoji-test.txt sequence filed under that subcategory) |
| Map<String, Long> subcategoryToMaxSuborder = new HashMap<>(); |
| for (String[] row : data) { |
| final String subcategory = row[0]; |
| for (Entry<String, String> entry : emojiToMinorCategory.entrySet()) { |
| if (entry.getValue().equals(subcategory)) { |
| String emoji = entry.getKey(); |
| Long order = emojiToOrder.get(emoji); |
| Long currentMax = subcategoryToMaxSuborder.get(subcategory); |
| if (currentMax == null || currentMax < order) { |
| subcategoryToMaxSuborder.put(subcategory, order); |
| } |
| } |
| } |
| } |
| if (DEBUG) System.out.println(subcategoryToMaxSuborder); |
| Map<String, Long> _EXTRA_SYMBOL_ORDER = new LinkedHashMap<>(); |
| for (String[] row : data) { |
| final String subcategory = row[0]; |
| final String characters = row[1]; |
| |
| // One string per code point of the row, without the separating spaces. |
| List<String> items = new ArrayList<>(); |
| for (int cp : With.codePointArray(characters)) { |
| if (cp != ' ') { |
| items.add(With.fromCodePoint(cp)); |
| } |
| } |
| final UnicodeSet uset = new UnicodeSet().addAll(items); |
| // A character may be filed under one row only. |
| if (uset.containsSome(EXTRA_SYMBOL_MINOR_CATEGORIES.keySet())) { |
| throw new IllegalArgumentException( |
| "Duplicate values in " + EXTRA_SYMBOL_MINOR_CATEGORIES); |
| } |
| EXTRA_SYMBOL_MINOR_CATEGORIES.putAll(uset, subcategory); |
| // NOTE(review): the map only has entries for subcategories that some emoji-test.txt |
| // sequence uses, so unboxing here would throw a NullPointerException for a row whose |
| // subcategory is not a subgroup of the data file — confirm every row name is one. |
| long count = subcategoryToMaxSuborder.get(subcategory); |
| // Number the extra symbols consecutively, starting right after that maximum. |
| for (String s : items) { |
| ++count; |
| _EXTRA_SYMBOL_ORDER.put(s, count); |
| } |
| subcategoryToMaxSuborder.put(subcategory, count); |
| } |
| if (DEBUG) System.out.println(_EXTRA_SYMBOL_ORDER); |
| EXTRA_SYMBOL_MINOR_CATEGORIES.freeze(); |
| EXTRA_SYMBOL_ORDER = ImmutableMap.copyOf(_EXTRA_SYMBOL_ORDER); |
| } |
| |
| /** |
| * Returns the minor category (subgroup) of the given emoji, looking first at the data read from |
| * emoji-test.txt and then at the extra symbols. |
| * |
| * @param emoji the emoji or extra symbol as a string |
| * @return the minor category, never null |
| * @throws InternalCldrException if neither table knows the string |
| */ |
| public static String getMinorCategory(String emoji) { |
| final String fromEmojiData = emojiToMinorCategory.get(emoji); |
| if (fromEmojiData != null) { |
| return fromEmojiData; |
| } |
| final String fromExtras = EXTRA_SYMBOL_MINOR_CATEGORIES.get(emoji); |
| if (fromExtras != null) { |
| return fromExtras; |
| } |
| throw new InternalCldrException( |
| "No minor category (aka subgroup) found for " |
| + emoji |
| + ". Update emoji-test.txt to latest, and adjust PathHeader.. functionMap.put(\"minor\", ..."); |
| } |
| |
| /** |
| * Returns the name stored for the given sequence, as read from the comment of its line in |
| * emoji-test.txt. |
| * |
| * <p>NOTE(review): presumably null for a string that is not in emoji-test.txt — confirm against |
| * UnicodeMap.get. |
| */ |
| public static String getName(String emoji) { |
| return toName.get(emoji); |
| } |
| |
| /** |
| * Returns the ordering number of the given emoji, looking first at the data read from |
| * emoji-test.txt and then at the extra symbols. |
| * |
| * @param emoji the emoji or extra symbol as a string |
| * @return the ordering number |
| * @throws InternalCldrException if neither table knows the string |
| */ |
| public static long getEmojiToOrder(String emoji) { |
| final Long fromEmojiData = emojiToOrder.get(emoji); |
| if (fromEmojiData != null) { |
| return fromEmojiData; |
| } |
| final Long fromExtras = EXTRA_SYMBOL_ORDER.get(emoji); |
| if (fromExtras != null) { |
| return fromExtras; |
| } |
| throw new InternalCldrException( |
| "No Order found for " |
| + emoji |
| + ". Update emoji-test.txt to latest, and adjust PathHeader.. functionMap.put(\"minor\", ..."); |
| } |
| |
| /** |
| * Returns the ordering number of a minor category (subgroup), i.e. its position by first |
| * appearance in emoji-test.txt. |
| * |
| * @param minor the name of the minor category |
| * @return the ordering number |
| * @throws InternalCldrException if no such minor category was read from emoji-test.txt |
| */ |
| public static long getEmojiMinorOrder(String minor) { |
| final Long order = minorToOrder.get(minor); |
| if (order != null) { |
| return order; |
| } |
| throw new InternalCldrException( |
| "No minor category (aka subgroup) found for " |
| + minor |
| + ". Update emoji-test.txt to latest, and adjust PathHeader.. functionMap.put(\"minor\", ..."); |
| } |
| |
| /** |
| * Returns the major category (the "group" of emoji-test.txt) of the given emoji. The extra |
| * symbols have no group of their own and are all reported as "Symbols". |
| * |
| * @param emoji the emoji or extra symbol as a string |
| * @return the major category, never null |
| * @throws InternalCldrException if the string is neither in emoji-test.txt nor an extra symbol |
| */ |
| public static String getMajorCategory(String emoji) { |
| String majorCat = emojiToMajorCategory.get(emoji); |
| if (majorCat == null) { |
| if (EXTRA_SYMBOL_MINOR_CATEGORIES.containsKey(emoji)) { |
| majorCat = "Symbols"; |
| } else { |
| // The lookup that failed is the one for the group, so the message names the major |
| // category rather than the minor one. |
| throw new InternalCldrException( |
| "No major category (aka group) found for " |
| + emoji |
| + ". Update emoji-test.txt to latest, and adjust PathHeader.. functionMap.put(\"major\", ..."); |
| } |
| } |
| return majorCat; |
| } |
| |
| /** |
| * Returns all minor categories, those of emoji-test.txt first and those of the extra symbols |
| * after them, as an immutable set without duplicates. |
| */ |
| public static Set<String> getMinorCategoriesWithExtras() { |
| final Set<String> categories = new LinkedHashSet<>(); |
| categories.addAll(emojiToMinorCategory.values()); |
| categories.addAll(EXTRA_SYMBOL_MINOR_CATEGORIES.getAvailableValues()); |
| return ImmutableSet.copyOf(categories); |
| } |
| |
| /** |
| * Returns a new frozen set with everything filed under the given minor category: the sequences |
| * from emoji-test.txt together with the extra symbols. |
| * |
| * @param minorCategory the name of the minor category |
| */ |
| public static UnicodeSet getEmojiInMinorCategoriesWithExtras(String minorCategory) { |
| final UnicodeSet combined = new UnicodeSet(emojiToMinorCategory.getSet(minorCategory)); |
| combined.addAll(EXTRA_SYMBOL_MINOR_CATEGORIES.getSet(minorCategory)); |
| return combined.freeze(); |
| } |
| |
| /** |
| * Returns the frozen set of sequences that are not treated as constructed, plus the modifier |
| * characters. |
| */ |
| public static UnicodeSet getNonConstructed() { |
| return nonConstructed; |
| } |
| |
| /** Cache for {@link #getNamePaths()}; stays null until the paths are first requested. */ |
| private static Set<String> NAME_PATHS = null; |
| /** Suffix that is appended to an annotation path to select its "tts" type. */ |
| public static final String TYPE_TTS = "[@type=\"tts\"]"; |
| |
| /** |
| * Returns the annotation paths, with the {@link #TYPE_TTS} suffix, of all non-constructed values. |
| * The set is built on the first call and reused afterwards. |
| */ |
| public static synchronized Set<String> getNamePaths() { |
| if (NAME_PATHS == null) { |
| NAME_PATHS = buildPaths(TYPE_TTS); |
| } |
| return NAME_PATHS; |
| } |
| |
| private static ImmutableSet<String> buildPaths(String suffix) { |
| ImmutableSet.Builder<String> builder = ImmutableSet.builder(); |
| for (String s : Emoji.getNonConstructed()) { |
| String base = "//ldml/annotations/annotation[@cp=\"" + s + "\"]" + suffix; |
| builder.add(base); |
| } |
| return builder.build(); |
| } |
| |
| /** |
| * Return the PageId for the given emoji, making adjustments for pages that are united in |
| * emoji-test.txt but divided in Survey Tool, such as Objects and Objects2, or Symbols and |
| * EmojiSymbols. |
| * |
| * <p>A divided page is split at a fixed boundary subgroup: subgroups that come before the |
| * boundary in emoji-test.txt stay on the first page, all others go to the second one. |
| * |
| * @param emoji the emoji as a string |
| * @return the adjusted PageId |
| * @throws InternalCldrException if the emoji has no category, or if a subgroup needed for the |
| * comparison was not read from emoji-test.txt |
| */ |
| public static PageId getPageId(String emoji) { |
| final String major = getMajorCategory(emoji); |
| final String minor = getMinorCategory(emoji); |
| final PageId pageId = PageId.forString(major); |
| switch (pageId) { |
| case Objects: |
| return isBeforeSubgroup(minor, "money") ? PageId.Objects : PageId.Objects2; |
| case People: |
| return isBeforeSubgroup(minor, "person-fantasy") ? PageId.People : PageId.People2; |
| case Symbols: |
| return isBeforeSubgroup(minor, "transport-sign") ? PageId.Symbols : PageId.EmojiSymbols; |
| case Travel_Places: |
| return isBeforeSubgroup(minor, "transport-ground") |
| ? PageId.Travel_Places |
| : PageId.Travel_Places2; |
| default: |
| return pageId; |
| } |
| } |
| |
| /** |
| * Tells whether subgroup {@code minor} comes before subgroup {@code boundary} in emoji-test.txt. |
| * Both lookups go through getEmojiMinorOrder, so a subgroup that is missing from the data file |
| * is reported with a descriptive exception instead of a NullPointerException from unboxing. |
| */ |
| private static boolean isBeforeSubgroup(String minor, String boundary) { |
| return getEmojiMinorOrder(minor) < getEmojiMinorOrder(boundary); |
| } |
| } |