# -*- coding: UTF-8 -*-

# Script that compiles Transcript property maps from text to binary format.
# The binary format greatly speeds up loading of property maps at runtime.
# http://techbase.kde.org/Localization/Concepts/Transcript
#
# Usage:
#   ts-pmap-compile.py file.pmap file.pmapc
#
# Works with Python >= 2.6 and >= 3.0.

import locale
import os
import re
import struct
import sys


# Name under which the script was invoked, used to prefix messages.
cmdname = os.path.basename(sys.argv[0])
# Preferred encoding of the current locale, used for terminal output.
lenc = locale.getpreferredencoding()

def error (msg, code=1):
    """Report a fatal error on standard error and terminate the script.

    msg  -- message text; it is prefixed with the command name
    code -- process exit code handed to sys.exit() (default 1)

    Raises SystemExit; never returns.
    """
    text = "%s: error: %s\n" % (cmdname, msg)
    # Under Python 3 the text stream sys.stderr accepts only str, so the
    # message must not be encoded to bytes there. Under Python 2 the
    # formatted text may be a unicode object, which is encoded with the
    # locale encoding; "replace" keeps the report from failing by itself.
    if not isinstance(text, str):
        text = text.encode(lenc, "replace")
    sys.stderr.write(text)
    sys.exit(code)


def count_lines (text, tolen):
    """Return the 1-based line number of character offset tolen in text.

    Only newlines located before offset tolen are counted, so the result
    is the number of the line on which that offset falls.
    """
    newlines_before = text[:tolen].count("\n")
    return newlines_before + 1


def norm_keystr (text):
    """Return text normalized for use as a lookup key.

    All whitespace characters and ampersands are removed, and the
    remainder is converted to lower case.
    """
    # Must do the same as normKeystr() in kdelibs/kdecore/ktranscript.cpp
    # The pattern is a raw string so that "\s" reaches the regex engine
    # unchanged instead of being an invalid string escape sequence.
    return re.sub(r"[\s&]", "", text).lower()


def trim_smart (text):
    """Strip blank leading and trailing lines from text.

    At the start, whitespace is removed up to and including the last
    newline of the initial whitespace run, so indentation of the first
    content line is kept. At the end, everything from the first newline
    that is followed only by whitespace is removed. Text which has no
    newline in its leading or trailing whitespace is returned unchanged.
    """
    # The pattern is a raw string so that "\s" reaches the regex engine
    # unchanged instead of being an invalid string escape sequence.
    return re.sub(r"^\s*\n|\n\s*$", "", text)


def read_pmap (fname):
    """Parse the text property map file fname into a list of entries.

    The file is read as bytes and decoded as UTF-8. As implemented here,
    the format is:

    - Where an entry is expected, leading whitespace is skipped and a '#'
      starts a comment running to the end of the line.
    - An entry starts with two separator characters, neither of which may
      be a letter: first the key separator, then the property separator.
    - Within an entry, text ending with the property separator is an entry
      key; text ending with the key separator is a property key, whose
      value runs up to the next property separator.
    - An empty key in front of a property separator ends the entry.

    Keys are normalized with norm_keystr() and property values are trimmed
    with trim_smart().

    Returns a list of (ekeys, props) tuples, one per entry in file order,
    where ekeys is a list of entry key strings and props is a list of
    (property key, property value) tuples. A file with no entries
    (empty, or only whitespace and comments) yields an empty list.

    Malformed input is reported through error(), which ends the script.
    """

    # Adapted directly from C++ code.

    # Bytes are read and decoded explicitly so that the result is the same
    # under Python 2 and 3; the context manager closes the file even when
    # reading or decoding fails.
    with open(fname, "rb") as fh:
        s = fh.read().decode("utf8")

    # Parser states: what is expected next in the input.
    s_nextEntry, s_nextKey, s_nextValue = 1, 2, 3

    pmap = []

    # Raised whenever the end of input is reached; whether that is a clean
    # end or a truncated entry is decided in the handler below.
    class END_PROP_PARSE (Exception): pass
    try:
        slen = len(s)
        state = s_nextEntry
        ekeys = [] # holds keys for current entry
        props = [] # holds properties for current entry
        pkey = "" # holds current property key
        i = 0

        # An empty file has no entries; stop before indexing into it.
        if slen == 0: raise END_PROP_PARSE

        # Invariant: i < slen at the top of every iteration.
        while True:
            i_checkpoint = i

            if state == s_nextEntry:
                # Skip whitespace in front of the entry.
                while s[i].isspace():
                    i += 1
                    if i >= slen: raise END_PROP_PARSE

                # Two separator characters are needed to start an entry.
                if i + 1 >= slen:
                    error("unexpected end of file %s" % fname)

                if s[i] != '#':
                    # Separator characters for this entry.
                    key_sep = s[i]
                    prop_sep = s[i + 1]
                    if key_sep.isalpha() or prop_sep.isalpha():
                        error("separator characters must not be letters "
                              "at %s:%d" % (fname, count_lines(s, i)))

                    # Reset all data for current entry.
                    ekeys = []
                    props = []
                    pkey = ""

                    i += 2
                    state = s_nextKey

                else:
                    # This is a comment, skip to EOL, don't change state.
                    while s[i] != '\n':
                        i += 1
                        if i >= slen: raise END_PROP_PARSE

            elif state == s_nextKey:
                ip = i
                # Proceed up to next key or property separator.
                while s[i] != key_sep and s[i] != prop_sep:
                    i += 1
                    if i >= slen: raise END_PROP_PARSE

                if s[i] == key_sep:
                    # This is a property key,
                    # record for when the value gets parsed.
                    pkey = norm_keystr(s[ip:i])

                    i += 1
                    state = s_nextValue

                else: # s[i] == prop_sep
                    # This is an entry key, or end of entry.
                    ekey = norm_keystr(s[ip:i])
                    if ekey:
                        # An entry key.
                        ekeys.append(ekey)

                        i += 1
                        state = s_nextKey

                    else:
                        # End of entry.
                        if len(ekeys) < 1:
                            error("no entry key for entry ending "
                                  "at %s:%d" % (fname, count_lines(s, i)))

                        # Put collected properties into global store.
                        pmap.append((ekeys, props))

                        i += 1
                        state = s_nextEntry
                        # This check covers no newline at end of file.
                        if i >= slen: raise END_PROP_PARSE

            elif state == s_nextValue:
                ip = i
                # Proceed up to next property separator.
                while s[i] != prop_sep:
                    i += 1
                    if i >= slen: raise END_PROP_PARSE
                    # A key separator is not allowed inside a value.
                    if s[i] == key_sep:
                        error("property separator inside property value "
                              "at %s:%d" % (fname, count_lines(s, i)))

                # Extract the property value and store the property.
                pval = trim_smart(s[ip:i])
                props.append((pkey, pval))

                i += 1
                state = s_nextKey

            else:
                error("internal error 10 "
                      "at %s:%d" % (fname, count_lines(s, i)))

            # To avoid infinite looping: every pass must consume input.
            if i == i_checkpoint:
                error("internal error 20 "
                      "at %s:%d" % (fname, count_lines(s, i)))

            # To avoid stepping out: the input ended right after a
            # separator. The handler below reports this as a truncated
            # file, since an entry is still open at this point.
            if i >= slen: raise END_PROP_PARSE

    except END_PROP_PARSE:
        # Running out of input is fine only between entries.
        if state != s_nextEntry:
            error("unexpected end of file in %s" % fname)

    return pmap


# Convert integer to 32-bit big-endian byte sequence.
def int_bin_32 (val):
    """Return val packed as a signed 32-bit big-endian integer."""
    # The ">" prefix selects standard sizes, so "i" always yields exactly
    # 4 bytes and no trimming of the packed result is needed.
    return struct.pack(">i", val)


# Convert integer to 64-bit big-endian byte sequence.
def int_bin_64 (val):
    """Return val packed as a signed 64-bit big-endian integer."""
    # The ">" prefix selects standard sizes, so "q" always yields exactly
    # 8 bytes and no trimming of the packed result is needed.
    return struct.pack(">q", val)


# Convert string to UTF-8 byte sequence,
# preceded by its length in 32-bit big-endian.
def str_bin_32 (val):
    """Return val as length-prefixed UTF-8 bytes.

    The prefix is the number of encoded bytes (not characters), in the
    form produced by int_bin_32().
    """
    payload = val.encode("utf8")
    length_prefix = int_bin_32(len(payload))
    return length_prefix + payload


# Concatenate byte sequences.
def catb (seq):
    """Return all byte strings from the iterable seq joined into one."""
    return b"".join(seq)


# Binary map format 00.
def write_map_bin_00 (fh, pmap):
    """Write pmap to the binary file object fh in format 00.

    pmap is a sequence of (entry keys, properties) pairs, where the
    properties are (key, value) pairs of strings.

    Layout, with integers as written by int_bin_32() and strings as
    written by str_bin_32():

      magic bytes "TSPMAP00"
      number of entries
      for each entry:
        number of entry keys, then each entry key
        number of properties, then key and value of each property
    """
    put = fh.write

    # Magic bytes, followed by the number of entries.
    put("TSPMAP00".encode("ascii"))
    put(int_bin_32(len(pmap)))

    for entry_keys, entry_props in pmap:
        # Count of entry keys, then the keys themselves.
        put(int_bin_32(len(entry_keys)))
        for key in entry_keys:
            put(str_bin_32(key))

        # Count of properties, then each property as key and value.
        put(int_bin_32(len(entry_props)))
        for name, value in entry_props:
            put(str_bin_32(name))
            put(str_bin_32(value))


# Binary map format 01.
def write_map_bin_01 (fh, pmap):
    """Write pmap to the binary file object fh in format 01.

    pmap is a sequence of (entry keys, properties) pairs, where the
    properties are (key, value) pairs of strings.

    Layout, with 32/64-bit integers as written by int_bin_32() and
    int_bin_64(), and strings as written by str_bin_32():

      magic bytes "TSPMAP01"
      int32  total number of entry keys
      int64  byte length of the entry key section
             entry key section: every entry key, each followed by the
             int64 offset of the property blob of its entry
      int32  number of unique property keys
      int64  byte length of the property key section
             property key section: unique property keys, sorted
      int32  number of entries
             one property blob per entry, in pmap order:
               int32 number of properties
               int32 byte length of the data that follows
               key and value strings of each property

    Offsets are counted from the first byte written by this function.
    """
    int32_size = len(int_bin_32(0))
    int64_size = len(int_bin_64(0))

    # Magic bytes.
    magic = "TSPMAP01".encode("ascii")

    # Size of the entry key section must be known up front, because the
    # offsets stored inside that section point past it.
    ekeys_size = 0
    for entry_keys, _unused in pmap:
        for key in entry_keys:
            ekeys_size += len(str_bin_32(key)) + int64_size

    # Binary representation of all unique property keys, sorted.
    unique_pkeys = set()
    for _unused, entry_props in pmap:
        for prop in entry_props:
            unique_pkeys.add(prop[0])
    pkeys_blob = catb([str_bin_32(pkey) for pkey in sorted(unique_pkeys)])

    # Everything in front of the entry count: magic bytes, then the count
    # and length fields of each of the two key sections plus the sections.
    header_size = (len(magic)
                   + int32_size + int64_size + ekeys_size
                   + int32_size + int64_size + len(pkeys_blob))

    # Property blob of each entry, paired with the offset it will have in
    # the output. The first blob comes right after the entry count.
    prop_blobs = []
    next_offset = header_size + int32_size
    for _unused, entry_props in pmap:
        fields = []
        for prop in entry_props:
            for field in prop:
                fields.append(str_bin_32(field))
        payload = catb(fields)
        blob = catb([int_bin_32(len(entry_props)),
                     int_bin_32(len(payload)),
                     payload])
        prop_blobs.append((blob, next_offset))
        next_offset += len(blob)

    # Entry key section: each key is followed by the offset of the
    # property blob of the entry it belongs to.
    ekey_count = 0
    ekey_chunks = []
    for (entry_keys, _unused), (_blob, offset) in zip(pmap, prop_blobs):
        packed_offset = int_bin_64(offset)
        for key in entry_keys:
            ekey_chunks.append(str_bin_32(key) + packed_offset)
            ekey_count += 1
    ekeys_blob = catb(ekey_chunks)
    # The offsets above are valid only if the size predicted at the start
    # matches the section actually built.
    assert(ekeys_size == len(ekeys_blob))

    # Write everything out.
    fh.write(magic)
    fh.write(int_bin_32(ekey_count))
    fh.write(int_bin_64(len(ekeys_blob)))
    fh.write(ekeys_blob)
    fh.write(int_bin_32(len(unique_pkeys)))
    fh.write(int_bin_64(len(pkeys_blob)))
    fh.write(pkeys_blob)
    fh.write(int_bin_32(len(pmap)))
    for blob, _offset in prop_blobs:
        fh.write(blob)


def main ():
    """Compile the property map named on the command line.

    Expects exactly two arguments: the input text property map and the
    output binary file. The input is parsed with read_pmap() and written
    out with write_map_bin_01(). A wrong number of arguments is reported
    through error().
    """

    if len(sys.argv) != 3:
        error("usage: %s INPUT_FILE OUTPUT_FILE" % cmdname)

    # Psyco is optional: it is enabled when installed, and its absence
    # is silently ignored.
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass

    ifile = sys.argv[1]
    ofile = sys.argv[2]

    # Parse first, so that a malformed input does not touch the output.
    pmap = read_pmap(ifile)
    # The context manager closes the output file even if writing fails.
    with open(ofile, "wb") as ofh:
        write_map_bin_01(ofh, pmap)


if __name__ == '__main__':
    main()
