emoji 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. #!/usr/bin/python3
  2. import os
  3. import re
  4. import sys
  5. import math
  6. import codecs
  7. import xml.etree.ElementTree as etree
  8. class Emoji:
  9. def __init__(self, m):
  10. self.emoji = m.group(1)
  11. self.pronunciation = m.group(2)
  12. self.codepoints = m.group(3)
  13. self.comment = m.group(4)
  14. def __repr__(self):
  15. return "Emoji(emoji={0}, pronunciation={1}, codepoints={2}, comment={3})".format(
  16. repr(self.emoji),
  17. repr(self.pronunciation),
  18. repr(self.codepoints),
  19. repr(self.comment))
  20. def __str__(self):
  21. return "{0}{1}// [{2}]{3}".format(self.emoji, self.pronunciation, self.codepoints, self.comment)
  22. def read_annotations(filename):
  23. ldml = etree.parse(filename).getroot()
  24. for annotations in ldml.findall("annotations"):
  25. for annotation in annotations.findall("annotation"):
  26. if annotation.attrib.get("type", "") == "tts":
  27. yield annotation.attrib["cp"], annotation.text
  28. def read_emoji(filename, encoding="utf-8"):
  29. re_emoji = re.compile(r"^([^ \t]*)([^/]*)// \[([^\]]*)\](.*)$")
  30. with codecs.open(filename, "r", encoding) as f:
  31. for line in f:
  32. line = line.replace("\n", "")
  33. if line.strip() == "":
  34. yield line # blank line
  35. elif line.startswith("//"):
  36. yield line # line comment
  37. elif line.startswith("$"):
  38. yield line # flags only
  39. else:
  40. m = re_emoji.match(line)
  41. if m:
  42. yield Emoji(m)
  43. else:
  44. yield line
  45. def find_langname(lang):
  46. espeak_data_path = os.path.join(os.getcwd(), "espeak-ng-data")
  47. for root, dirnames, filenames in os.walk(espeak_data_path):
  48. if lang in filenames:
  49. filename = os.path.join(root, lang)
  50. with codecs.open(filename, "r", "utf-8") as f:
  51. for line in f:
  52. line = line.replace("\n", "")
  53. if line.startswith("name "):
  54. return line.replace("name ", "")
  55. def normalize(text):
  56. text = text.replace("„", "")
  57. text = text.replace("“", "")
  58. text = text.replace("\"", "")
  59. text = text.replace("‘", "'")
  60. text = text.replace("·", " ")
  61. text = text.replace("| ", "") # alternatives, e.g. af "vonkstok | skitterstokkie"
  62. return text
  63. emoji_dict = sys.argv[1]
  64. lang = sys.argv[2]
  65. cldr_path = sys.argv[3]
  66. filenames = [
  67. os.path.join(cldr_path, "common", "annotations", "{0}.xml".format(lang)),
  68. os.path.join(cldr_path, "common", "annotationsDerived", "{0}.xml".format(lang))
  69. ]
  70. annotations = {}
  71. for filename in filenames:
  72. for cp, name in read_annotations(filename):
  73. annotations[cp] = name
  74. for entry in read_emoji(emoji_dict):
  75. if isinstance(entry, Emoji):
  76. translation = annotations.get(entry.emoji.replace("\uFE0F", ""), None)
  77. if translation:
  78. translation = normalize(translation)
  79. length = len(entry.pronunciation.strip())
  80. tabs = entry.pronunciation.count('\t') - 1
  81. first_tab = 8 - (length % 8)
  82. tab_length = length + first_tab + ((tabs - 1) * 8)
  83. new_length = len(translation)
  84. new_tabs = math.ceil((tab_length - new_length)/8)
  85. entry.pronunciation = "\t{0}{1}".format(translation, "\t"*int(new_tabs))
  86. else:
  87. entry.comment += " (no translation)"
  88. elif entry == "// Emoji and Other Symbol pronunciations for English":
  89. langname = find_langname(lang)
  90. entry = "// Emoji and Other Symbol pronunciations for {0}".format(langname)
  91. elif entry == "// 2. common/annotations/en.xml (CLDR)":
  92. entry = "// 2. common/annotations/{0}.xml (CLDR)".format(lang)
  93. print(entry)