update_engine_traits.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. #!/usr/bin/env python
  2. # SPDX-License-Identifier: AGPL-3.0-or-later
"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/sxng_locales.py`

:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
  Persistence of the engines' traits, fetched from the engines.

:origin:`searx/sxng_locales.py`
  Is generated from the language and region tags that are supported by a
  minimum number of engines (see ``filter_locales``).

The script :origin:`searxng_extra/update/update_engine_traits.py` is called in
the :origin:`CI Update data ... <.github/workflows/data-update.yml>`

"""
  11. # pylint: disable=invalid-name
from collections import Counter
from pathlib import Path
from pprint import pformat
from unicodedata import lookup

import babel

from searx import settings, searx_dir
from searx import network
from searx.engines import load_engines
from searx.enginelib.traits import EngineTraitsMap
# Output files.
#
# ``languages_file`` is the module this script generates.  The header and footer
# below are the literal text written before and after the pretty-printed tuple
# of locale entries (see ``write_languages_file``).
# NOTE(review): the exact indentation / blank lines inside these two templates
# could not be confirmed from this view -- verify against the generated file.
languages_file = Path(searx_dir) / 'sxng_locales.py'
languages_file_header = """\
# SPDX-License-Identifier: AGPL-3.0-or-later
'''List of SearXNG's locale codes.
.. hint::
Don't modify this file, this file is generated by::
./manage data.traits
'''
sxng_locales = (
"""
# The footer starts with a comma: it terminates the last tuple entry before the
# closing parenthesis of ``sxng_locales``.
languages_file_footer = """,
)
'''
A list of five-digit tuples:
0. SearXNG's internal locale tag (a language or region tag)
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
Empty string for language tags.
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
are represented by a globe (\U0001F310)
.. code:: python
('en', 'English', '', 'English', '\U0001f310'),
('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'),
('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
..
('fr', 'Français', '', 'French', '\U0001f310'),
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
:meta hide-value:
'''
"""
# Flag overrides used by ``get_unicode_flag``: the table is consulted first with
# the locale's language code and then with the lower-cased territory code.
# NOTE(review): 'jp' and 'ua' look like territory codes (JP, UA) rather than
# language codes ('ja', 'uk') -- presumably they only ever match through the
# territory lookup; confirm this is intended.
lang2emoji = {
    'ha': '\U0001F1F3\U0001F1EA',  # Hausa / Niger
    'bs': '\U0001F1E7\U0001F1E6',  # Bosnian / Bosnia & Herzegovina
    'jp': '\U0001F1EF\U0001F1F5',  # Japanese
    'ua': '\U0001F1FA\U0001F1E6',  # Ukrainian
    'he': '\U0001F1EE\U0001F1F1',  # Hebrew
}
  60. def main():
  61. load_engines(settings['engines'])
  62. # traits_map = EngineTraitsMap.from_data()
  63. traits_map = fetch_traits_map()
  64. sxng_tag_list = filter_locales(traits_map)
  65. write_languages_file(sxng_tag_list)
  66. def fetch_traits_map():
  67. """Fetches supported languages for each engine and writes json file with those."""
  68. network.set_timeout_for_thread(10.0)
  69. def log(msg):
  70. print(msg)
  71. traits_map = EngineTraitsMap.fetch_traits(log=log)
  72. print("fetched properties from %s engines" % len(traits_map))
  73. print("write json file: %s" % traits_map.ENGINE_TRAITS_FILE)
  74. traits_map.save_data()
  75. return traits_map
  76. def filter_locales(traits_map: EngineTraitsMap):
  77. """Filter language & region tags by a threshold."""
  78. min_eng_per_region = 18
  79. min_eng_per_lang = 20
  80. _ = {}
  81. for eng in traits_map.values():
  82. for reg in eng.regions.keys():
  83. _[reg] = _.get(reg, 0) + 1
  84. regions = set(k for k, v in _.items() if v >= min_eng_per_region)
  85. lang_from_region = set(k.split('-')[0] for k in regions)
  86. _ = {}
  87. for eng in traits_map.values():
  88. for lang in eng.languages.keys():
  89. # ignore script types like zh_Hant, zh_Hans or sr_Latin, pa_Arab (they
  90. # already counted by existence of 'zh' or 'sr', 'pa')
  91. if '_' in lang:
  92. # print("ignore %s" % lang)
  93. continue
  94. _[lang] = _.get(lang, 0) + 1
  95. languages = set(k for k, v in _.items() if v >= min_eng_per_lang)
  96. sxng_tag_list = set()
  97. sxng_tag_list.update(regions)
  98. sxng_tag_list.update(lang_from_region)
  99. sxng_tag_list.update(languages)
  100. return sxng_tag_list
  101. def write_languages_file(sxng_tag_list):
  102. language_codes = []
  103. for sxng_tag in sorted(sxng_tag_list):
  104. sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-')
  105. flag = get_unicode_flag(sxng_locale) or ''
  106. item = (
  107. sxng_tag,
  108. sxng_locale.get_language_name().title(), # type: ignore
  109. sxng_locale.get_territory_name() or '',
  110. sxng_locale.english_name.split(' (')[0] if sxng_locale.english_name else '',
  111. UnicodeEscape(flag),
  112. )
  113. language_codes.append(item)
  114. language_codes = tuple(language_codes)
  115. with languages_file.open('w', encoding='utf-8') as new_file:
  116. file_content = "{header} {language_codes}{footer}".format(
  117. header=languages_file_header,
  118. language_codes=pformat(language_codes, width=120, indent=4)[1:-1],
  119. footer=languages_file_footer,
  120. )
  121. new_file.write(file_content)
  122. new_file.close()
  123. class UnicodeEscape(str):
  124. """Escape unicode string in :py:obj:`pprint.pformat`"""
  125. def __repr__(self):
  126. return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'"
  127. def get_unicode_flag(locale: babel.Locale):
  128. """Determine a unicode flag (emoji) that fits to the ``locale``"""
  129. emoji = lang2emoji.get(locale.language)
  130. if emoji:
  131. return emoji
  132. if not locale.territory:
  133. return '\U0001F310'
  134. emoji = lang2emoji.get(locale.territory.lower())
  135. if emoji:
  136. return emoji
  137. try:
  138. c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[0])
  139. c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[1])
  140. # print("OK : %s --> %s%s" % (locale, c1, c2))
  141. except KeyError as exc:
  142. print("ERROR: %s --> %s" % (locale, exc))
  143. return None
  144. return c1 + c2
# Run the update only when this file is executed as a script.
if __name__ == "__main__":
    main()