#!/usr/bin/env python3
#
# Helper script to generate Emoji dictionary file from unicode data.
#
# SPDX-FileCopyrightText: 2022 Weng Xuetian
# SPDX-License-Identifier: LGPL-2.0-or-later
#

import collections
import requests
import io
import sys
import zipfile
import os
import glob
import xml.dom.minidom
from PyQt5.QtCore import QFile, QDataStream, QByteArray, QIODevice, qCompress

EMOJI_VERSION = "15.1"
EMOJI_TEST_FILE = "emoji-test.txt"
EMOJI_TEST_URL = f"https://www.unicode.org/Public/emoji/{EMOJI_VERSION}/{EMOJI_TEST_FILE}"
CLDR_VERSION = "44.0"
CLDR_FILE = f"cldr-common-{CLDR_VERSION}.zip"
CLDR_URL = f"https://unicode.org/Public/cldr/{CLDR_VERSION.split('.')[0]}/{CLDR_FILE}"
CLDR_ANNOTATIONS_DIR = "common/annotations"
CLDR_ANNOTATIONS_DERIVED_DIR = "common/annotationsDerived"

# Seconds to wait on the server before giving up on a download.
DOWNLOAD_TIMEOUT = 60


class EmojiAnnotation(object):
    """Localized text attached to one emoji by the CLDR annotation files."""

    def __init__(self):
        # Text of the annotation whose type is "tts".
        self.description = ""
        # Keywords taken from the annotation without type "tts", split on " | ".
        self.annotations = []


class EmojiParser(object):
    """Collect emojis from emoji-test.txt and match them with CLDR annotations."""

    def __init__(self):
        # Maps a non-fully-qualified emoji string to its fully-qualified form.
        self.variantMapping = dict()
        # Group names (bytes) in file order, with " & " replaced by " and ".
        self.categoryNames = []
        # Maps fully-qualified emoji string -> 1-based index of its group.
        self.emojis = collections.OrderedDict()

    def parseEmojiTest(self, emojiTestData):
        """Parse the content of emoji-test.txt.

        Fills ``self.categoryNames``, ``self.emojis`` and
        ``self.variantMapping``. Lines that do not match the expected layout
        are skipped.

        :param emojiTestData: raw bytes of emoji-test.txt.
        """
        descriptionMapping = dict()
        currentGroup = 0
        GROUP_TAG = b"# group: "
        for line in emojiTestData.split(b"\n"):
            line = line.strip()
            if line.startswith(GROUP_TAG):
                currentGroup += 1
                # "&" has special meaning in Qt, which does not work well in the UI.
                self.categoryNames.append(line[len(GROUP_TAG):].replace(b" & ", b" and "))
                continue
            if line.startswith(b"#"):
                continue
            # line format: code points; status # emoji name
            segments = line.split(b";")
            if len(segments) != 2:
                continue
            # Split on the first "#" only: the emoji itself or its name may
            # contain "#" (e.g. "keycap: #"), which must not drop the line.
            metadata = segments[1].split(b"#", 1)
            if len(metadata) != 2:
                continue
            desc = metadata[1].strip().split(b" E", 1)
            if len(desc) != 2:
                continue
            # What follows " E" (version and name) is used as the key that
            # ties the different qualifications of one emoji together.
            description = desc[1]
            codes = segments[0].split()
            try:
                emoji = "".join(chr(int(code, 16)) for code in codes)
            except (ValueError, OverflowError):
                # Not a valid sequence of hexadecimal code points.
                continue
            status = metadata[0].strip()
            if status == b"fully-qualified":
                self.emojis[emoji] = currentGroup
                descriptionMapping[description] = emoji
            else:
                fullyQualified = descriptionMapping.get(description)
                if fullyQualified:
                    self.variantMapping[emoji] = fullyQualified

    def parseCldr(self, cldrList):
        """Extract annotations for the known emojis from CLDR XML documents.

        :param cldrList: iterable of XML documents (as accepted by
            ``xml.dom.minidom.parseString``), processed in order; later
            documents overwrite values set by earlier ones for the same emoji.
        :return: dict mapping fully-qualified emoji string to EmojiAnnotation.
        """
        annotations = dict()
        for data in cldrList:
            with xml.dom.minidom.parseString(data) as doc:
                for annotationNode in doc.getElementsByTagName("annotation"):
                    if not annotationNode.hasAttribute("cp"):
                        continue
                    emoji = annotationNode.getAttribute("cp")
                    if emoji not in self.emojis:
                        # CLDR may key on a variant form; fall back to the
                        # fully-qualified emoji recorded for it, if any.
                        emoji = self.variantMapping.get(emoji)
                        if not emoji:
                            continue
                    children = annotationNode.childNodes
                    if len(children) != 1 or children[0].nodeType != xml.dom.minidom.Node.TEXT_NODE:
                        continue
                    text = children[0].nodeValue
                    if emoji not in annotations:
                        annotations[emoji] = EmojiAnnotation()
                    annotation = annotations[emoji]
                    if annotationNode.getAttribute("type") == "tts":
                        annotation.description = text
                    else:
                        annotation.annotations = text.split(" | ")
        return annotations

    def writeEmojiCategory(self):
        """Write ../emojicategory.cpp listing ``self.categoryNames``."""
        # kli18nc() is declared by KI18n's KLazyLocalizedString header; a bare
        # "#include" without a header name would not compile.
        template_header = '''/*
    SPDX-FileCopyrightText: 2022 Weng Xuetian

    SPDX-License-Identifier: LGPL-2.0-or-later
*/
// Generated from emoji-test.txt
#include "emojicategory.h"

#include <KLazyLocalizedString>

const QStringList &getCategoryNames()
{
    static const QStringList names = {'''
        template_foot = '''    };
    return names;
}
'''
        entries = [
            f"        QString::fromUtf8(kli18nc(\"Emoji Category\", \"{categoryName.decode('utf-8')}\").untranslatedText()),"
            for categoryName in self.categoryNames
        ]
        content = "\n".join([template_header] + entries + [template_foot])
        with open("../emojicategory.cpp", "w", encoding="utf-8") as f:
            f.write(content)


def _download(url):
    """Fetch *url* and return the body as bytes.

    Raises ``requests.HTTPError`` on a non-success status so that an error
    page is never parsed as if it were the requested data.
    """
    response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
    response.raise_for_status()
    return response.content


def _writeDictFile(dictfilename, filtered_emojis, annotations):
    """Serialize one language's emojis into a compressed dictionary file.

    Layout before compression (QDataStream, Qt 5.15 format, little endian):
    uint32 entry count, then per entry the emoji, its description (both as
    UTF-8 QByteArray), an int32 category and a uint32-prefixed list of UTF-8
    QByteArray keywords. The buffer is passed through qCompress.

    :param dictfilename: path of the file to create.
    :param filtered_emojis: list of (emoji, category) tuples to write.
    :param annotations: dict mapping emoji to EmojiAnnotation.
    :return: True if the file was written, False if it could not be opened.
    """
    dictfile = QFile(dictfilename)
    if not dictfile.open(QIODevice.WriteOnly):
        print(f"Failed to open {dictfilename} for writing, skipping")
        return False
    buf = QByteArray()
    stream = QDataStream(buf, QIODevice.WriteOnly)
    stream.setVersion(QDataStream.Qt_5_15)
    stream.setByteOrder(QDataStream.LittleEndian)
    stream.writeUInt32(len(filtered_emojis))
    for emoji, category in filtered_emojis:
        stream << QByteArray(emoji.encode("utf-8"))
        annotation = annotations[emoji]
        stream << QByteArray(annotation.description.encode("utf-8"))
        stream.writeInt32(category)
        # Write the keywords as a length-prefixed list (QList layout).
        stream.writeUInt32(len(annotation.annotations))
        for item in annotation.annotations:
            stream << QByteArray(item.encode("utf-8"))
    compressed = qCompress(buf)
    dictfile.write(compressed)
    dictfile.close()
    return True


def main():
    """Regenerate ../emojicategory.cpp and the per-language *.dict files."""
    # Naive sanity check for running it under correct directory
    if not os.path.exists("../emojicategory.cpp"):
        print("Please run this script under its own directory")
        sys.exit(1)

    print("Removing old *.dict files")
    for olddict in glob.glob("*.dict"):
        os.remove(olddict)

    parser = EmojiParser()
    print(f"Downloading {EMOJI_TEST_URL}")
    emojiTestData = _download(EMOJI_TEST_URL)
    print(f"Parsing {EMOJI_TEST_FILE}")
    parser.parseEmojiTest(emojiTestData)
    parser.writeEmojiCategory()

    print(f"Downloading {CLDR_URL}")
    cldrData = _download(CLDR_URL)
    with zipfile.ZipFile(io.BytesIO(cldrData)) as thezip:
        annotationsFiles = set()
        annotationsDerivedFiles = set()
        for zipinfo in thezip.infolist():
            # Member names inside a zip archive always use "/" as separator.
            dirname, _, basename = zipinfo.filename.rpartition("/")
            if not basename.endswith(".xml"):
                continue
            if dirname == CLDR_ANNOTATIONS_DIR:
                annotationsFiles.add(basename)
            elif dirname == CLDR_ANNOTATIONS_DERIVED_DIR:
                annotationsDerivedFiles.add(basename)

        # Only languages present in both directories are processed; sorted so
        # that the progress output is stable between runs.
        files = sorted(annotationsFiles.intersection(annotationsDerivedFiles))
        for langfile in files:
            dictfilename = langfile[:-4] + ".dict"
            print(f"Generating {dictfilename}")
            with thezip.open(f"{CLDR_ANNOTATIONS_DIR}/{langfile}") as annotationsFile, \
                    thezip.open(f"{CLDR_ANNOTATIONS_DERIVED_DIR}/{langfile}") as annotationsDerivedFile:
                annotations = parser.parseCldr([annotationsFile.read(), annotationsDerivedFile.read()])

            filtered_emojis = [
                (emoji, category)
                for (emoji, category) in parser.emojis.items()
                if emoji in annotations and annotations[emoji].description
            ]
            # There's indeed some annotations file with 0 entries.
            if not filtered_emojis:
                print(f"Skipping {dictfilename}")
                continue
            _writeDictFile(dictfilename, filtered_emojis, annotations)

    print("Update Finished, please also update CategoryAction.qml with new category name.")


if __name__ == "__main__":
    main()