#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# This script generates a data file containing all Unicode information needed
# by KCharSelect.
#
##############################################################################
# SPDX-FileCopyrightText: 2007 Daniel Laidig
# SPDX-FileCopyrightText: 2016 John Zaitseff
#
# SPDX-License-Identifier: LGPL-2.0-or-later
##############################################################################
#
# The current directory must contain the following files that can be found at
# http://www.unicode.org/Public/UNIDATA/:
# - UnicodeData.txt
# - Unihan_Readings.txt (you need to uncompress it from Unihan.zip)
# - NamesList.txt
# - Blocks.txt
#
# The generated file is named "kcharselect-data" and has to be put in
# kwidgetsaddons/src. Additionally a translation dummy named
# "kcharselect-translation.cpp" is generated and has to be placed in the same
# directory.
#
# FILE STRUCTURE
#
# The generated file is a binary file. The first 40 bytes are the header and
# contain the position of each part of the file. Each entry is uint32.
#
# pos content
# 0   names strings begin
# 4   names offsets begin
# 8   details strings begin
# 12  details offsets begin
# 16  block strings begin
# 20  block offsets begin
# 24  section strings begin
# 28  section offsets begin
# 32  unihan strings begin
# 36  unihan offsets begin
#
# The string parts always contain all strings in a row, followed by a 0x00
# byte. There is one exception: The data for seeAlso in details is only 2
# bytes (as it always is _one_ unicode character) and _not_ followed by a 0x00
# byte.
#
# The offset parts contain entries with a fixed length. Unicode characters
# are always uint16 and offsets uint32. Offsets are positions in the data
# file.
#
# names_offsets:
# each entry 6 bytes
# 16bit: unicode
# 32bit: offset to name in names_strings
#
# names_strings:
# the first byte is the category (same values as QChar::Category),
# directly followed by the character name (terminated by 0x00)
#
# nameslist_offsets:
# char, alias, alias_count, note, note_count, approxEquiv, approxEquiv_count, equiv, equiv_count, seeAlso, seeAlso_count
# 16    32     8            32    8           32           8                  32     8            32       8
# => each entry 27 bytes
#
# blocks_offsets:
# each entry 4 bytes
# 16bit: start unicode
# 16bit: end unicode
# Note that there is no string offset.
#
# section_offsets:
# each entry 4 bytes
# 16bit: section offset
# 16bit: block offset
# Note that these offsets are _not_ positions in the data file but indexes.
# For example 0x0403 means the fourth section includes the third block.
#
# unihan_offsets:
# each entry 30 bytes
# 16bit: unicode
# 32bit: offset to unihan_strings for Definition
# 32bit: offset to unihan_strings for Cantonese
# 32bit: offset to unihan_strings for Mandarin
# 32bit: offset to unihan_strings for Tang
# 32bit: offset to unihan_strings for Korean
# 32bit: offset to unihan_strings for JapaneseKun
# 32bit: offset to unihan_strings for JapaneseOn

from struct import *
import sys
import re
import io

# Section/block table consumed by Parser.parseSections: a "SECTION <name>"
# line starts a section, every following line is one Unicode block name.
# Based on http://www.unicode.org/charts/, updated for Unicode 9.0
sectiondata = '''
SECTION European Scripts
Basic Latin
Latin-1 Supplement
Latin Extended-A
Latin Extended-B
Latin Extended-C
Latin Extended-D
Latin Extended-E
Latin Extended Additional
Armenian
Coptic
Cyrillic
Cyrillic Supplement
Cyrillic Extended-A
Cyrillic Extended-B
Cyrillic Extended-C
Georgian
Georgian Supplement
Georgian Extended
Glagolitic
Greek and Coptic
Greek Extended
Ogham
Runic

SECTION African Scripts
Bamum
Ethiopic
Ethiopic Supplement
Ethiopic Extended
Ethiopic Extended-A
NKo
Tifinagh
Vai

SECTION Middle Eastern Scripts
Arabic
Arabic Supplement
Arabic Extended-A
Arabic Extended-B
Arabic Presentation Forms-A
Arabic Presentation Forms-B
Hebrew
Mandaic
Samaritan
Syriac
Syriac Supplement

SECTION Central Asian Scripts
Mongolian
Phags-pa
Tibetan

SECTION South Asian Scripts
Bengali
Common Indic Number Forms
Devanagari
Devanagari Extended
Gujarati
Gurmukhi
Kannada
Lepcha
Limbu
Malayalam
Meetei Mayek
Meetei Mayek Extensions
Ol Chiki
Oriya
Saurashtra
Sinhala
Syloti Nagri
Tamil
Telugu
Thaana
Vedic Extensions

SECTION Southeast Asian Scripts
Cham
Kayah Li
Khmer
Khmer Symbols
Lao
Myanmar
Myanmar Extended-A
Myanmar Extended-B
New Tai Lue
Tai Le
Tai Tham
Tai Viet
Thai

SECTION Indonesia and Oceania Scripts
Balinese
Batak
Buginese
Buhid
Hanunoo
Javanese
Rejang
Sundanese
Sundanese Supplement
Tagalog
Tagbanwa

SECTION East Asian Scripts
Bopomofo
Bopomofo Extended
CJK Unified Ideographs
CJK Unified Ideographs Extension A
CJK Compatibility
CJK Compatibility Ideographs
CJK Compatibility Forms
CJK Radicals Supplement
CJK Strokes
CJK Symbols and Punctuation
Enclosed CJK Letters and Months
Hangul Jamo
Hangul Jamo Extended-A
Hangul Jamo Extended-B
Hangul Compatibility Jamo
Hangul Syllables
Hiragana
Ideographic Description Characters
Kanbun
Kangxi Radicals
Katakana
Katakana Phonetic Extensions
Lisu
Yi Radicals
Yi Syllables

SECTION American Scripts
Cherokee
Cherokee Supplement
Unified Canadian Aboriginal Syllabics
Unified Canadian Aboriginal Syllabics Extended

SECTION Symbols
General Punctuation
Alchemical Symbols
Braille Patterns
Chess Symbols
Control Pictures
Currency Symbols
Dingbats
Domino Tiles
Emoticons
Enclosed Alphanumerics
Enclosed Alphanumeric Supplement
Enclosed Ideographic Supplement
Mahjong Tiles
Miscellaneous Symbols
Miscellaneous Symbols and Pictographs
Miscellaneous Technical
Optical Character Recognition
Ornamental Dingbats
Playing Cards
Small Form Variants
Supplemental Punctuation
Supplemental Symbols and Pictographs
Symbols and Pictographs Extended-A
Symbols for Legacy Computing
Transport and Map Symbols
Vertical Forms
Yijing Hexagram Symbols

SECTION Mathematical Symbols
Arrows
Block Elements
Box Drawing
Geometric Shapes
Geometric Shapes Extended
Letterlike Symbols
Mathematical Operators
Miscellaneous Mathematical Symbols-A
Miscellaneous Mathematical Symbols-B
Miscellaneous Symbols and Arrows
Number Forms
Superscripts and Subscripts
Supplemental Arrows-A
Supplemental Arrows-B
Supplemental Arrows-C
Supplemental Mathematical Operators

SECTION Phonetic Symbols
IPA Extensions
Modifier Tone Letters
Phonetic Extensions
Phonetic Extensions Supplement
Spacing Modifier Letters

SECTION Combining Diacritics
Combining Diacritical Marks
Combining Diacritical Marks Extended
Combining Diacritical Marks Supplement
Combining Diacritical Marks for Symbols
Combining Half Marks

SECTION Other
Alphabetic Presentation Forms
Halfwidth and Fullwidth Forms
High Private Use Surrogates
High Surrogates
Low Surrogates
Private Use Area
Specials
Variation Selectors
'''

categoryMap = {  # same values as QChar::Category
    "Mn": 1,
    "Mc": 2,
    "Me": 3,
    "Nd": 4,
    "Nl": 5,
    "No": 6,
    "Zs": 7,
    "Zl": 8,
    "Zp": 9,
    "Cc": 10,
    "Cf": 11,
    "Cs": 12,
    "Co": 13,
    "Cn": 14,
    "Lu": 15,
    "Ll": 16,
    "Lt": 17,
    "Lm": 18,
    "Lo": 19,
    "Pc": 20,
    "Pd": 21,
    "Ps": 22,
    "Pe": 23,
    "Pi": 24,
    "Pf": 25,
    "Po": 26,
    "Sm": 27,
    "Sc": 28,
    "Sk": 29,
    "So": 30
}

# Temporary code point remapping
#
# Initial SMP support without needing a new data file format
# - BMP U+Fxxx are remapped to U+Exxx
# - SMP symbols U+1Fxxx are remapped to U+Fxxx
# - Private Use Area is limited to U+F000 ... U+F8FF
def remap(char):
    """Map a hex code-point string into the 16-bit space used by the file
    format (see the remapping note above):
      - 0xE000..0xFFFF  -> "E" + last three digits
      - 0x1F000..0x1FFFF -> leading digit dropped ("1Fxxx" -> "Fxxx")
    Anything else is returned unchanged.
    """
    cp = int(char, 16)
    if 0xE000 <= cp <= 0xFFFF:
        return "E" + char[1:]
    if 0x1F000 <= cp <= 0x1FFFF:
        return char[1:]
    return char


class Names:
    """Collects character names and writes the names_strings and
    names_offsets parts of the data file."""

    def __init__(self):
        # entries: [hex code point, name (replaced by file offset on write), category]
        self.names = []
        # file offset of the single shared "<control>" name string
        self.controlpos = -1

    def addName(self, uni, name, category):
        self.names.append([uni, name, category])

    def calculateStringSize(self):
        """Byte size of names_strings: category byte + UTF-8 name + NUL each.

        UnicodeData.txt names every control character "<control>"; those all
        share one string (see writeStrings), so it is counted only once.
        """
        size = 0
        hadcontrol = False
        for entry in self.names:
            if entry[1] == "<control>":
                if not hadcontrol:
                    size += len(entry[1].encode("utf-8")) + 2
                    hadcontrol = True
            else:
                # use the encoded length so the size matches what writeStrings
                # actually emits, even for non-ASCII names
                size += len(entry[1].encode("utf-8")) + 2
        return size

    def calculateOffsetSize(self):
        """names_offsets: 6 bytes (uint16 char + uint32 offset) per entry."""
        return len(self.names) * 6

    def writeStrings(self, out, pos):
        """Write names_strings; each entry's name is replaced by its offset."""
        hadcontrol = False
        for entry in self.names:
            if entry[1] == "<control>":
                if not hadcontrol:
                    out.write(pack("=b", entry[2]))
                    out.write(entry[1].encode("utf-8") + b"\0")
                    size = len(entry[1].encode("utf-8")) + 2
                    entry[1] = pos
                    self.controlpos = pos
                    pos += size
                    hadcontrol = True
                else:
                    # every later control character reuses the first string
                    entry[1] = self.controlpos
            else:
                out.write(pack("=b", entry[2]))
                out.write(entry[1].encode("utf-8") + b"\0")
                size = len(entry[1].encode("utf-8")) + 2
                entry[1] = pos
                pos += size
        return pos

    def writeOffsets(self, out, pos):
        """Write names_offsets (uint16 code point, uint32 string offset)."""
        for entry in self.names:
            out.write(pack("=HI", int(entry[0], 16), entry[1]))
            pos += 6
        return pos


class Details:
    """Collects NamesList.txt annotations (aliases, notes, equivalents and
    see-also references) and writes the details parts of the data file."""

    def __init__(self):
        # code point -> {category -> [text or uint16 (replaced by offset on write)]}
        self.details = {}

    def addEntry(self, char, category, text):
        if char not in self.details:
            self.details[char] = {}
        if category not in self.details[char]:
            self.details[char][category] = []
        self.details[char][category].append(text)

    def calculateStringSize(self):
        """Texts are NUL-terminated UTF-8; seeAlso entries are bare uint16."""
        size = 0
        for char in self.details.values():
            for cat in char.values():
                for s in cat:
                    if type(s) is str:
                        size += len(s.encode("utf-8")) + 1
                    else:
                        size += 2  # seeAlso: one uint16 code point, no NUL
        return size

    def calculateOffsetSize(self):
        """nameslist_offsets: 27 bytes per character (see header comment)."""
        return len(self.details) * 27

    def writeStrings(self, out, pos):
        """Write the detail data; each entry is replaced by its file offset."""
        for char in self.details.values():
            for cat in char.values():
                for i in range(len(cat)):
                    s = cat[i]
                    if type(s) is str:
                        out.write(s.encode("utf-8") + b"\0")
                        size = len(s.encode("utf-8")) + 1
                    else:
                        out.write(pack("=H", s))
                        size = 2
                    cat[i] = pos
                    pos += size
        return pos

    def writeOffsets(self, out, pos):
        """Write one 27-byte record per character: for each category, the
        offset of its first entry (0 if absent) and the entry count."""
        for char in self.details.keys():
            fields = []
            # order must match the "=HIbIbIbIbIb" layout documented above
            for category in ("alias", "note", "approxEquiv", "equiv", "seeAlso"):
                entries = self.details[char].get(category)
                if entries:
                    fields.append(entries[0])
                    fields.append(len(entries))
                else:
                    fields.append(0)
                    fields.append(0)
            out.write(pack("=HIbIbIbIbIb", char, *fields))
            pos += 27
        return pos


class SectionsBlocks:
    """Holds the block list (from Blocks.txt) and the section->block table
    (from sectiondata) and writes the corresponding file parts."""

    def __init__(self):
        self.sections = []     # [section name, block name] (both become indexes)
        self.blocks = []       # [start hex, end hex, name (becomes index)]
        self.blockList = []    # plain block names, for the translation dummy
        self.sectionList = []  # unique section names, for the translation dummy

    def addBlock(self, begin, end, name):
        self.blocks.append([begin, end, name])
        self.blockList.append(name)

    def addSection(self, section, block):
        self.sections.append([section, block])
        if section not in self.sectionList:
            self.sectionList.append(section)

    def calculateBlockStringSize(self):
        """Byte size of block_strings (UTF-8 + NUL each), matching the writer."""
        size = 0
        for block in self.blocks:
            size += len(block[2].encode("utf-8")) + 1
        return size

    def calculateBlockOffsetSize(self):
        """blocks_offsets: 4 bytes (two uint16) per block."""
        return len(self.blocks) * 4

    def calculateSectionStringSize(self):
        """Byte size of section_strings; each unique name is stored once."""
        size = 0
        lastsection = ""
        for section in self.sections:
            if section[0] != lastsection:
                size += len(section[0].encode("utf-8")) + 1
                lastsection = section[0]
        return size

    def calculateSectionOffsetSize(self):
        """section_offsets: 4 bytes (two uint16 indexes) per mapping."""
        return len(self.sections) * 4

    def writeBlockStrings(self, out, pos):
        """Write block names; resolves every section mapping to a block index
        and aborts if a block belongs to no section."""
        index = 0
        for block in self.blocks:
            out.write(block[2].encode("utf-8") + b"\0")
            size = len(block[2].encode("utf-8")) + 1
            found = False
            for section in self.sections:
                if section[1] == block[2]:
                    print("found", section)
                    section[1] = index
                    found = True
            if not found:
                print("Error: Did not find any category for block \""+block[2]+"\"")
                sys.exit(1)
            block[2] = index
            pos += size
            index += 1
        return pos

    def writeBlockOffsets(self, out, pos):
        """Write blocks_offsets (uint16 start, uint16 end)."""
        for block in self.blocks:
            out.write(pack("=HH", int(block[0], 16), int(block[1], 16)))
            pos += 4
        return pos

    def writeSectionStrings(self, out, pos):
        """Write each unique section name once; names become indexes."""
        lastsection = ""
        index = -1
        for section in self.sections:
            if section[0] != lastsection:
                index += 1
                lastsection = section[0]
                out.write(section[0].encode("utf-8") + b"\0")
                size = len(section[0].encode("utf-8")) + 1
                section[0] = index
                pos += size
            else:
                section[0] = index
        return pos

    def writeSectionOffsets(self, out, pos):
        """Write section_offsets (uint16 section index, uint16 block index)."""
        for section in self.sections:
            out.write(pack("=HH", section[0], section[1]))
            pos += 4
        return pos

    def getBlockList(self):
        return self.blockList

    def getSectionList(self):
        return self.sectionList


class Unihan:
    """Collects the Unihan readings used by KCharSelect and writes the
    unihan_strings and unihan_offsets parts of the data file."""

    # record slot per Unihan category we keep; everything else is ignored
    CATEGORIES = {
        "kDefinition": 0,
        "kCantonese": 1,
        "kMandarin": 2,
        "kTang": 3,
        "kKorean": 4,
        "kJapaneseKun": 5,
        "kJapaneseOn": 6,
    }

    def __init__(self):
        # code point -> list of 7 readings (None if absent, offset after write)
        self.unihan = {}

    def addUnihan(self, uni, category, value):
        uni = int(uni, 16)
        slot = self.CATEGORIES.get(category)
        if slot is None:
            return  # category not used by KCharSelect
        if uni not in self.unihan:
            self.unihan[uni] = [None] * 7
        self.unihan[uni][slot] = value

    def calculateStringSize(self):
        """Byte size of unihan_strings (UTF-8 + NUL for each present reading)."""
        size = 0
        for readings in self.unihan.values():
            for entry in readings:
                if entry is not None:
                    size += len(entry.encode("utf-8")) + 1
        return size

    def calculateOffsetSize(self):
        """unihan_offsets: 30 bytes (uint16 char + 7 uint32 offsets) per char."""
        return len(self.unihan) * 30

    def writeStrings(self, out, pos):
        """Write the reading strings; each is replaced by its file offset."""
        for readings in self.unihan.values():
            for i in range(7):
                if readings[i] is not None:
                    out.write(readings[i].encode("utf-8") + b"\0")
                    size = len(readings[i].encode("utf-8")) + 1
                    readings[i] = pos
                    pos += size
        return pos

    def writeOffsets(self, out, pos):
        """Write unihan_offsets; a missing reading is stored as offset 0."""
        for char, readings in self.unihan.items():
            out.write(pack("=H", char))
            for i in range(7):
                if readings[i] is not None:
                    out.write(pack("=I", readings[i]))
                else:
                    out.write(pack("=I", 0))
            pos += 30
        return pos


class Parser:
    """Parsers for the individual Unicode data files."""

    def parseUnicodeData(self, inUnicodeData, names):
        """Read code point, name and category from UnicodeData.txt."""
        regexp = re.compile(r'^([^;]+);([^;]+);([^;]+)')
        for line in inUnicodeData:
            line = line[:-1]
            m = regexp.match(line)
            if not m:
                continue
            uni = remap(m.group(1))
            name = m.group(2)
            category = m.group(3)
            if len(uni) > 4:
                continue  # outside the (remapped) 16-bit range we store
            names.addName(uni, name, categoryMap[category])

    def parseDetails(self, inNamesList, details):
        """Read aliases, notes, equivalents and see-also references from
        NamesList.txt, attaching them to the current character line."""
        invalidRegexp = re.compile(r'^@')
        unicodeRegexp = re.compile(r'^([0-9A-F]+)')
        aliasRegexp = re.compile(r'^\s+=\s+(.+)$')                      # equal
        seeAlsoRegexp1 = re.compile(r'^\s+x\s+.*\s([0-9A-F]{4,6})\)$')  # ex
        seeAlsoRegexp2 = re.compile(r'^\s+x\s+([0-9A-F]{4,6})$')        # ex
        noteRegexp = re.compile(r'^\s+\*\s+(.+)$')                      # star
        approxEquivalentRegexp = re.compile(r'^\s+#\s+(.+)$')           # pound
        equivalentRegexp = re.compile(r'^\s+:\s+(.+)$')                 # colon
        drop = 0
        currChar = 0
        for line in inNamesList:
            line = line[:-1]
            m1 = unicodeRegexp.match(line)
            m2 = aliasRegexp.match(line)
            m3 = noteRegexp.match(line)
            m4 = approxEquivalentRegexp.match(line)
            m5 = equivalentRegexp.match(line)
            m6 = seeAlsoRegexp1.match(line)
            m7 = seeAlsoRegexp2.match(line)
            if invalidRegexp.match(line):
                continue
            elif m1:
                mg1 = remap(m1.group(1))
                currChar = int(mg1, 16)
                if len(mg1) > 4:
                    drop = 1  # out of range: skip this char's detail lines
                    continue
                # BUGFIX: reset the flag for in-range characters, otherwise
                # every character after the first out-of-range one would
                # silently lose all of its details.
                drop = 0
            elif drop == 1:
                continue
            elif m2:
                details.addEntry(currChar, "alias", m2.group(1))
            elif m3:
                details.addEntry(currChar, "note", m3.group(1))
            elif m4:
                details.addEntry(currChar, "approxEquiv", m4.group(1))
            elif m5:
                details.addEntry(currChar, "equiv", m5.group(1))
            elif m6:
                value = int(remap(m6.group(1)), 16)
                if value < 0x10000:  # seeAlso is stored as a bare uint16
                    details.addEntry(currChar, "seeAlso", value)
            elif m7:
                value = int(remap(m7.group(1)), 16)
                if value < 0x10000:
                    details.addEntry(currChar, "seeAlso", value)

    def parseBlocks(self, inBlocks, sectionsBlocks):
        """Read block ranges and names from Blocks.txt."""
        regexp = re.compile(r'^([0-9A-F]+)\.\.([0-9A-F]+); (.+)$')
        for line in inBlocks:
            line = line[:-1]
            m = regexp.match(line)
            if not m:
                continue
            m1 = remap(m.group(1))
            m2 = remap(m.group(2))
            if len(m1) > 4:
                continue  # block lies entirely outside the stored range
            sectionsBlocks.addBlock(m1, m2, m.group(3))

    def parseSections(self, inSections, sectionsBlocks):
        """Read the SECTION/block table (the sectiondata string above)."""
        currSection = ""
        for line in inSections:
            line = line[:-1]
            if len(line) == 0:
                continue
            temp = line.split(" ")
            if temp[0] == "SECTION":
                currSection = line[8:]
            elif currSection != "":
                sectionsBlocks.addSection(currSection, line)
            else:
                print("error in data file")
                sys.exit(1)

    def parseUnihan(self, inUnihan, unihan):
        """Read the relevant readings from Unihan_Readings.txt, printing a
        progress dot every 100000 lines."""
        regexp = re.compile(r'^U\+([0-9A-F]+)\s+([^\s]+)\s+(.+)$')
        count = 0
        for line in inUnihan:
            if count % 100000 == 0:
                print("\b.")
                sys.stdout.flush()
            count += 1
            line = line[:-1]
            m = regexp.match(line)
            if not m:
                continue
            if len(remap(m.group(1))) <= 4:
                unihan.addUnihan(remap(m.group(1)), m.group(2), m.group(3))


def writeTranslationDummy(out, data):
    """Write kcharselect-translation.cpp, a dummy C++ source that exposes all
    section and block names to the translation system."""
    out.write(b"""/* This file is part of the KDE libraries

   SPDX-FileCopyrightText: 2007 Daniel Laidig
   SPDX-FileCopyrightText: 2016 John Zaitseff

   SPDX-License-Identifier: LGPL-2.0-or-later

   This file is autogenerated by kcharselect/kcharselect-generate-datafile.py
*/\n\n""")
    for group in data:
        for entry in group[1]:
            out.write(b"QT_TRANSLATE_NOOP3(\"KCharSelectData\", \"" + entry.encode("utf-8") + b"\", \"" + group[0].encode("utf-8") + b"\");\n")


def main():
    """Parse the Unicode data files in the current directory and write
    kcharselect-data plus the kcharselect-translation.cpp dummy."""
    out = open("kcharselect-data", "wb")
    outTranslationDummy = open("kcharselect-translation.cpp", "wb")

    inUnicodeData = open("UnicodeData.txt", "r")
    inNamesList = open("NamesList.txt", "r")
    inBlocks = open("Blocks.txt", "r")
    inSections = io.StringIO(sectiondata)
    inUnihan = open("Unihan_Readings.txt", "r")

    # the file format hard-codes 16/32-bit fields
    if calcsize('=H') != 2 or calcsize('=I') != 4:
        print("Error: Sizes of ushort and uint are not 16 and 32 bit as expected")
        sys.exit(1)

    names = Names()
    details = Details()
    sectionsBlocks = SectionsBlocks()
    unihan = Unihan()
    parser = Parser()

    print("========== parsing files ===================")
    parser.parseUnicodeData(inUnicodeData, names)
    print(".")
    sys.stdout.flush()
    parser.parseDetails(inNamesList, details)
    print("\b.")
    sys.stdout.flush()
    parser.parseBlocks(inBlocks, sectionsBlocks)
    print("\b.")
    sys.stdout.flush()
    parser.parseSections(inSections, sectionsBlocks)
    print("\b.")
    sys.stdout.flush()
    parser.parseUnihan(inUnihan, unihan)
    print("\b.")
    sys.stdout.flush()
    print("done.")

    # Header: ten uint32 part offsets (see the format description at the top).
    # Each begin position is the previous begin plus the previous part's size.
    pos = 0
    print("========== writing header ==================")
    out.write(pack("=I", 40))
    print("names strings begin", 40)

    namesOffsetBegin = names.calculateStringSize() + 40
    out.write(pack("=I", namesOffsetBegin))
    print("names offsets begin", namesOffsetBegin)

    detailsStringBegin = namesOffsetBegin + names.calculateOffsetSize()
    out.write(pack("=I", detailsStringBegin))
    print("details strings begin", detailsStringBegin)

    detailsOffsetBegin = detailsStringBegin + details.calculateStringSize()
    out.write(pack("=I", detailsOffsetBegin))
    print("details offsets begin", detailsOffsetBegin)

    blocksStringBegin = detailsOffsetBegin + details.calculateOffsetSize()
    out.write(pack("=I", blocksStringBegin))
    print("block strings begin", blocksStringBegin)

    blocksOffsetBegin = blocksStringBegin + sectionsBlocks.calculateBlockStringSize()
    out.write(pack("=I", blocksOffsetBegin))
    print("block offsets begin", blocksOffsetBegin)

    sectionStringBegin = blocksOffsetBegin + sectionsBlocks.calculateBlockOffsetSize()
    out.write(pack("=I", sectionStringBegin))
    print("section strings begin", sectionStringBegin)

    sectionOffsetBegin = sectionStringBegin + sectionsBlocks.calculateSectionStringSize()
    out.write(pack("=I", sectionOffsetBegin))
    print("section offsets begin", sectionOffsetBegin)

    unihanStringBegin = sectionOffsetBegin + sectionsBlocks.calculateSectionOffsetSize()
    out.write(pack("=I", unihanStringBegin))
    print("unihan strings begin", unihanStringBegin)

    unihanOffsetBegin = unihanStringBegin + unihan.calculateStringSize()
    out.write(pack("=I", unihanOffsetBegin))
    print("unihan offsets begin", unihanOffsetBegin)

    end = unihanOffsetBegin + unihan.calculateOffsetSize()
    print("end should be", end)
    pos += 40

    print("========== writing data ====================")
    pos = names.writeStrings(out, pos)
    print("names strings written, position", pos)
    pos = names.writeOffsets(out, pos)
    print("names offsets written, position", pos)
    pos = details.writeStrings(out, pos)
    print("details strings written, position", pos)
    pos = details.writeOffsets(out, pos)
    print("details offsets written, position", pos)
    pos = sectionsBlocks.writeBlockStrings(out, pos)
    print("block strings written, position", pos)
    pos = sectionsBlocks.writeBlockOffsets(out, pos)
    print("block offsets written, position", pos)
    pos = sectionsBlocks.writeSectionStrings(out, pos)
    print("section strings written, position", pos)
    pos = sectionsBlocks.writeSectionOffsets(out, pos)
    print("section offsets written, position", pos)
    pos = unihan.writeStrings(out, pos)
    print("unihan strings written, position", pos)
    pos = unihan.writeOffsets(out, pos)
    print("unihan offsets written, position", pos)

    print("========== writing translation dummy ======")
    translationData = [["KCharSelect section name", sectionsBlocks.getSectionList()],
                       ["KCharselect unicode block name", sectionsBlocks.getBlockList()]]
    writeTranslationDummy(outTranslationDummy, translationData)
    print("done. make sure to copy both kcharselect-data and kcharselect-translation.cpp.")

    # close everything explicitly so the output files are fully flushed
    for f in (out, outTranslationDummy, inUnicodeData, inNamesList, inBlocks, inUnihan):
        f.close()


if __name__ == "__main__":
    main()