fetch_languages.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. # -*- coding: utf-8 -*-
  2. # This script generates languages.py from intersecting each engine's supported languages.
  3. #
  4. # Output files (engines_languages.json and languages.py)
  5. # are written under searx_dir, replacing any existing copies of those files.
  6. import json
  7. from pathlib import Path
  8. from pprint import pformat
  9. from sys import path
  10. from babel import Locale, UnknownLocaleError
  11. from babel.languages import get_global
  12. path.append('../searx') # noqa
  13. from searx import settings, searx_dir
  14. from searx.engines import initialize_engines, engines
  15. # Output files.
  16. engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json'
  17. languages_file = Path(searx_dir) / 'languages.py'
  18. # Fetches the supported languages for each engine and writes them to a JSON file.
  19. def fetch_supported_languages():
  20. engines_languages = dict()
  21. names = list(engines)
  22. names.sort()
  23. for engine_name in names:
  24. if hasattr(engines[engine_name], 'fetch_supported_languages'):
  25. engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
  26. print("fetched %s languages from engine %s" % (
  27. len(engines_languages[engine_name]), engine_name))
  28. if type(engines_languages[engine_name]) == list:
  29. engines_languages[engine_name] = sorted(engines_languages[engine_name])
  30. # write json file
  31. with open(engines_languages_file, 'w', encoding='utf-8') as f:
  32. json.dump(engines_languages, f, indent=2, sort_keys=True)
  33. return engines_languages
  34. # Get babel Locale object from lang_code if possible.
  35. def get_locale(lang_code):
  36. try:
  37. locale = Locale.parse(lang_code, sep='-')
  38. return locale
  39. except (UnknownLocaleError, ValueError):
  40. return None
  41. # Join all language lists.
  42. def join_language_lists(engines_languages):
  43. language_list = dict()
  44. for engine_name in engines_languages:
  45. for lang_code in engines_languages[engine_name]:
  46. # apply custom fixes if necessary
  47. if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
  48. lang_code = next(lc for lc, alias in engines[engine_name].language_aliases.items()
  49. if lang_code == alias)
  50. locale = get_locale(lang_code)
  51. # ensure that lang_code uses standard language and country codes
  52. if locale and locale.territory:
  53. lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory)
  54. short_code = lang_code.split('-')[0]
  55. # add language without country if not in list
  56. if short_code not in language_list:
  57. if locale:
  58. # get language's data from babel's Locale object
  59. language_name = locale.get_language_name().title()
  60. english_name = locale.english_name.split(' (')[0]
  61. elif short_code in engines_languages['wikipedia']:
  62. # get language's data from wikipedia if not known by babel
  63. language_name = engines_languages['wikipedia'][short_code]['name']
  64. english_name = engines_languages['wikipedia'][short_code]['english_name']
  65. else:
  66. language_name = None
  67. english_name = None
  68. # add language to list
  69. language_list[short_code] = {'name': language_name,
  70. 'english_name': english_name,
  71. 'counter': set(),
  72. 'countries': dict()}
  73. # add language with country if not in list
  74. if lang_code != short_code and lang_code not in language_list[short_code]['countries']:
  75. country_name = ''
  76. if locale:
  77. # get country name from babel's Locale object
  78. country_name = locale.get_territory_name()
  79. language_list[short_code]['countries'][lang_code] = {'country_name': country_name,
  80. 'counter': set()}
  81. # count engine for both language_country combination and language alone
  82. language_list[short_code]['counter'].add(engine_name)
  83. if lang_code != short_code:
  84. language_list[short_code]['countries'][lang_code]['counter'].add(engine_name)
  85. return language_list
  86. # Filter language list so it only includes the most supported languages and countries
  87. def filter_language_list(all_languages):
  88. min_engines_per_lang = 15
  89. min_engines_per_country = 10
  90. main_engines = [engine_name for engine_name in engines.keys()
  91. if 'general' in engines[engine_name].categories and
  92. engines[engine_name].supported_languages and
  93. not engines[engine_name].disabled]
  94. # filter list to include only languages supported by most engines or all default general engines
  95. filtered_languages = {code: lang for code, lang
  96. in all_languages.items()
  97. if (len(lang['counter']) >= min_engines_per_lang or
  98. all(main_engine in lang['counter']
  99. for main_engine in main_engines))}
  100. def _copy_lang_data(lang, country_name=None):
  101. new_dict = dict()
  102. new_dict['name'] = all_languages[lang]['name']
  103. new_dict['english_name'] = all_languages[lang]['english_name']
  104. if country_name:
  105. new_dict['country_name'] = country_name
  106. return new_dict
  107. def _country_count(i):
  108. return len(countries[sorted_countries[i]]['counter'])
  109. # for each language get country codes supported by most engines or at least one country code
  110. filtered_languages_with_countries = dict()
  111. for lang, lang_data in filtered_languages.items():
  112. countries = lang_data['countries']
  113. filtered_countries = dict()
  114. # get language's country codes with enough supported engines
  115. for lang_country, country_data in countries.items():
  116. if len(country_data['counter']) >= min_engines_per_country:
  117. filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name'])
  118. # add language without countries too if there's more than one country to choose from
  119. if len(filtered_countries) > 1:
  120. filtered_countries[lang] = _copy_lang_data(lang)
  121. elif len(filtered_countries) == 1:
  122. # if there's only one country per language, it's not necessary to show country name
  123. lang_country = next(iter(filtered_countries))
  124. filtered_countries[lang_country]['country_name'] = None
  125. # if no country has enough engines try to get most likely country code from babel
  126. if not filtered_countries:
  127. lang_country = None
  128. subtags = get_global('likely_subtags').get(lang)
  129. if subtags:
  130. country_code = subtags.split('_')[-1]
  131. if len(country_code) == 2:
  132. lang_country = "{lang}-{country}".format(lang=lang, country=country_code)
  133. if lang_country:
  134. filtered_countries[lang_country] = _copy_lang_data(lang)
  135. else:
  136. filtered_countries[lang] = _copy_lang_data(lang)
  137. filtered_languages_with_countries.update(filtered_countries)
  138. return filtered_languages_with_countries
  139. # Write languages.py.
  140. def write_languages_file(languages):
  141. file_headers = (
  142. "# -*- coding: utf-8 -*-",
  143. "# list of language codes",
  144. "# this file is generated automatically by utils/fetch_languages.py",
  145. "language_codes ="
  146. )
  147. language_codes = tuple([
  148. (
  149. code,
  150. languages[code]['name'].split(' (')[0],
  151. languages[code].get('country_name') or '',
  152. languages[code].get('english_name') or ''
  153. ) for code in sorted(languages)
  154. ])
  155. with open(languages_file, 'w') as new_file:
  156. file_content = "{file_headers} \\\n{language_codes}".format(
  157. file_headers='\n'.join(file_headers),
  158. language_codes=pformat(language_codes, indent=4)
  159. )
  160. new_file.write(file_content)
  161. new_file.close()
  162. if __name__ == "__main__":
  163. initialize_engines(settings['engines'])
  164. engines_languages = fetch_supported_languages()
  165. all_languages = join_language_lists(engines_languages)
  166. filtered_languages = filter_language_list(all_languages)
  167. write_languages_file(filtered_languages)