GloomyGhost
/
ecantorix-sinsy-ng
forked from isengaara/ecantorix-sg


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
							#!/usr/bin/python3

import os
import re
import sys
import math
import codecs

import xml.etree.ElementTree as etree

class Emoji:
    """One emoji dictionary entry, populated from a four-group regex match.

    The four captured groups are stored verbatim (no stripping), so that
    ``str(entry)`` can reassemble the line from its parts.
    """

    def __init__(self, m):
        """Store groups 1-4 of the match object *m* on the instance."""
        # emoji:         group 1, the leading emoji text
        # pronunciation: group 2, the text between the emoji and the "//"
        # codepoints:    group 3, the text inside the square brackets
        # comment:       group 4, everything after the closing bracket
        (self.emoji,
         self.pronunciation,
         self.codepoints,
         self.comment) = m.group(1, 2, 3, 4)

    def _fields(self):
        """Return the four stored parts in line order."""
        return (self.emoji, self.pronunciation, self.codepoints, self.comment)

    def __repr__(self):
        """Return a debugging representation showing each field's repr()."""
        template = "Emoji(emoji={0}, pronunciation={1}, codepoints={2}, comment={3})"
        return template.format(*[repr(part) for part in self._fields()])

    def __str__(self):
        """Return the entry formatted as a dictionary line."""
        template = "{0}{1}// [{2}]{3}"
        return template.format(*self._fields())

def read_annotations(filename):
    """Yield ``(cp, text)`` for each text-to-speech annotation in an XML file.

    Only ``<annotation>`` elements that are direct children of an
    ``<annotations>`` element directly under the document root are
    considered, and only those whose ``type`` attribute equals ``"tts"``.

    Raises KeyError when a matching annotation has no ``cp`` attribute.
    """
    root = etree.parse(filename).getroot()
    tts_nodes = (
        node
        for group in root.findall("annotations")
        for node in group.findall("annotation")
        if node.attrib.get("type", "") == "tts"
    )
    for node in tts_nodes:
        yield node.attrib["cp"], node.text

def read_emoji(filename, encoding="utf-8"):
    """Yield the entries of an emoji dictionary file, one per line.

    Each line has its "\\n" removed.  Blank lines, lines starting with "//"
    (comments) and lines starting with "$" (flags only) are yielded
    unchanged as strings.  Any other line is yielded as an ``Emoji`` when it
    matches the entry pattern, and as the plain string otherwise.
    """
    entry_pattern = re.compile(r"^([^ \t]*)([^/]*)// \[([^\]]*)\](.*)$")
    with codecs.open(filename, "r", encoding) as source:
        for raw in source:
            text = raw.replace("\n", "")
            # Comment and flag lines can themselves match the entry pattern,
            # so they have to be filtered out before the regex is tried.
            passthrough = not text.strip() or text.startswith(("//", "$"))
            matched = None if passthrough else entry_pattern.match(text)
            if matched:
                yield Emoji(matched)
            else:
                yield text

def find_langname(lang):
    """Return the display name declared in the espeak-ng language file *lang*.

    The ``espeak-ng-data`` directory under the current working directory is
    walked for files whose name equals *lang*.  The first line starting with
    ``"name "`` in such a file supplies the result: everything after that
    prefix.

    Returns None when no matching file contains a ``name`` line.
    """
    prefix = "name "
    espeak_data_path = os.path.join(os.getcwd(), "espeak-ng-data")
    for root, _dirnames, filenames in os.walk(espeak_data_path):
        if lang not in filenames:
            continue
        filename = os.path.join(root, lang)
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f:
                line = line.replace("\n", "")
                if line.startswith(prefix):
                    # Drop only the leading keyword; str.replace() would also
                    # delete any later "name " inside the language name.
                    return line[len(prefix):]
    return None

def normalize(text):
    """Return *text* with quotation marks and separators cleaned up.

    The substitutions are applied one after another, in the order listed.
    """
    substitutions = (
        ("„", ""),
        ("“", ""),
        ("\"", ""),
        ("‘", "'"),
        ("·", " "),
        ("| ", ""),  # alternatives, e.g. af "vonkstok | skitterstokkie"
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

# Positional command-line arguments: the emoji dictionary file to read, the
# language code, and the root directory of the CLDR data.
emoji_dict = sys.argv[1]
lang = sys.argv[2]
cldr_path = sys.argv[3]

# The language's annotation file is looked up in both CLDR annotation folders.
xml_name = "{0}.xml".format(lang)
filenames = [
    os.path.join(cldr_path, "common", folder, xml_name)
    for folder in ("annotations", "annotationsDerived")
]

# Map each annotated character sequence to its text-to-speech name.  Files are
# merged in order, so an entry in a later file replaces an earlier one.
annotations = {}
for xml_path in filenames:
    annotations.update(read_annotations(xml_path))

# Re-emit the dictionary on stdout, line by line, with translated entries.
for entry in read_emoji(emoji_dict):
    if isinstance(entry, Emoji):
        # U+FE0F is removed from the emoji before it is looked up.
        lookup_key = entry.emoji.replace("\uFE0F", "")
        translation = annotations.get(lookup_key, None)
        if not translation:
            entry.comment += " (no translation)"
        else:
            translation = normalize(translation)

            # Work out how wide the original pronunciation column was,
            # assuming tab stops every 8 characters.
            old_width = len(entry.pronunciation.strip())
            trailing_tabs = entry.pronunciation.count('\t') - 1
            to_next_stop = 8 - (old_width % 8)
            column_width = old_width + to_next_stop + ((trailing_tabs - 1) * 8)

            # Number of tabs needed after the translation to fill that width.
            # NOTE(review): when the translation is longer than the column the
            # count is <= 0 and no tab at all is emitted before the "//".
            padding_tabs = math.ceil((column_width - len(translation)) / 8)

            entry.pronunciation = "\t{0}{1}".format(translation, "\t" * int(padding_tabs))
    elif entry == "// Emoji and Other Symbol pronunciations for English":
        # Header line: swap the language name for the target language's.
        entry = "// Emoji and Other Symbol pronunciations for {0}".format(find_langname(lang))
    elif entry == "//   2.  common/annotations/en.xml (CLDR)":
        # Source reference line: point at the target language's CLDR file.
        entry = "//   2.  common/annotations/{0}.xml (CLDR)".format(lang)
    print(entry)