utils.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468
  1. # -*- coding: utf-8 -*-
  2. import csv
  3. import hashlib
  4. import hmac
  5. import os
  6. import re
  7. from babel.core import get_global
  8. from babel.dates import format_date
  9. from codecs import getincrementalencoder
  10. from imp import load_source
  11. from numbers import Number
  12. from os.path import splitext, join
  13. from io import open
  14. from random import choice
  15. from lxml.etree import XPath
  16. import sys
  17. import json
  18. from searx import settings
  19. from searx.version import VERSION_STRING
  20. from searx.languages import language_codes
  21. from searx import settings
  22. from searx import logger
  23. try:
  24. from cStringIO import StringIO
  25. except:
  26. from io import StringIO
  27. try:
  28. from HTMLParser import HTMLParser
  29. except:
  30. from html.parser import HTMLParser
  31. if sys.version_info[0] == 3:
  32. unichr = chr
  33. unicode = str
  34. IS_PY2 = False
  35. basestring = str
  36. else:
  37. IS_PY2 = True
  38. logger = logger.getChild('utils')
  39. blocked_tags = ('script',
  40. 'style')
  41. ecma_unescape4_re = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
  42. ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
  43. useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__))
  44. + "/data/useragents.json", 'r', encoding='utf-8').read())
  45. xpath_cache = dict()
  46. lang_to_lc_cache = dict()
  47. def searx_useragent():
  48. return 'searx/{searx_version} {suffix}'.format(
  49. searx_version=VERSION_STRING,
  50. suffix=settings['outgoing'].get('useragent_suffix', ''))
  51. def gen_useragent(os=None):
  52. return str(useragents['ua'].format(os=os or choice(useragents['os']), version=choice(useragents['versions'])))
  53. def highlight_content(content, query):
  54. if not content:
  55. return None
  56. # ignoring html contents
  57. # TODO better html content detection
  58. if content.find('<') != -1:
  59. return content
  60. query = query.decode('utf-8')
  61. if content.lower().find(query.lower()) > -1:
  62. query_regex = u'({0})'.format(re.escape(query))
  63. content = re.sub(query_regex, '<span class="highlight">\\1</span>',
  64. content, flags=re.I | re.U)
  65. else:
  66. regex_parts = []
  67. for chunk in query.split():
  68. if len(chunk) == 1:
  69. regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))
  70. else:
  71. regex_parts.append(u'{0}'.format(re.escape(chunk)))
  72. query_regex = u'({0})'.format('|'.join(regex_parts))
  73. content = re.sub(query_regex, '<span class="highlight">\\1</span>',
  74. content, flags=re.I | re.U)
  75. return content
  76. class HTMLTextExtractor(HTMLParser):
  77. def __init__(self):
  78. HTMLParser.__init__(self)
  79. self.result = []
  80. self.tags = []
  81. def handle_starttag(self, tag, attrs):
  82. self.tags.append(tag)
  83. def handle_endtag(self, tag):
  84. if not self.tags:
  85. return
  86. if tag != self.tags[-1]:
  87. raise Exception("invalid html")
  88. self.tags.pop()
  89. def is_valid_tag(self):
  90. return not self.tags or self.tags[-1] not in blocked_tags
  91. def handle_data(self, d):
  92. if not self.is_valid_tag():
  93. return
  94. self.result.append(d)
  95. def handle_charref(self, number):
  96. if not self.is_valid_tag():
  97. return
  98. if number[0] in (u'x', u'X'):
  99. codepoint = int(number[1:], 16)
  100. else:
  101. codepoint = int(number)
  102. self.result.append(unichr(codepoint))
  103. def handle_entityref(self, name):
  104. if not self.is_valid_tag():
  105. return
  106. # codepoint = htmlentitydefs.name2codepoint[name]
  107. # self.result.append(unichr(codepoint))
  108. self.result.append(name)
  109. def get_text(self):
  110. return u''.join(self.result).strip()
  111. def html_to_text(html):
  112. html = html.replace('\n', ' ')
  113. html = ' '.join(html.split())
  114. s = HTMLTextExtractor()
  115. s.feed(html)
  116. return s.get_text()
  117. class UnicodeWriter:
  118. """
  119. A CSV writer which will write rows to CSV file "f",
  120. which is encoded in the given encoding.
  121. """
  122. def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
  123. # Redirect output to a queue
  124. self.queue = StringIO()
  125. self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
  126. self.stream = f
  127. self.encoder = getincrementalencoder(encoding)()
  128. def writerow(self, row):
  129. if IS_PY2:
  130. row = [s.encode("utf-8") if hasattr(s, 'encode') else s for s in row]
  131. self.writer.writerow(row)
  132. # Fetch UTF-8 output from the queue ...
  133. data = self.queue.getvalue()
  134. if IS_PY2:
  135. data = data.decode("utf-8")
  136. else:
  137. data = data.strip('\x00')
  138. # ... and reencode it into the target encoding
  139. data = self.encoder.encode(data)
  140. # write to the target stream
  141. if IS_PY2:
  142. self.stream.write(data)
  143. else:
  144. self.stream.write(data.decode("utf-8"))
  145. # empty queue
  146. self.queue.truncate(0)
  147. def writerows(self, rows):
  148. for row in rows:
  149. self.writerow(row)
  150. def get_resources_directory(searx_directory, subdirectory, resources_directory):
  151. if not resources_directory:
  152. resources_directory = os.path.join(searx_directory, subdirectory)
  153. if not os.path.isdir(resources_directory):
  154. raise Exception(resources_directory + " is not a directory")
  155. return resources_directory
  156. def get_themes(templates_path):
  157. """Returns available themes list."""
  158. themes = os.listdir(templates_path)
  159. if '__common__' in themes:
  160. themes.remove('__common__')
  161. return themes
  162. def get_static_files(static_path):
  163. static_files = set()
  164. static_path_length = len(static_path) + 1
  165. for directory, _, files in os.walk(static_path):
  166. for filename in files:
  167. f = os.path.join(directory[static_path_length:], filename)
  168. static_files.add(f)
  169. return static_files
  170. def get_result_templates(templates_path):
  171. result_templates = set()
  172. templates_path_length = len(templates_path) + 1
  173. for directory, _, files in os.walk(templates_path):
  174. if directory.endswith('result_templates'):
  175. for filename in files:
  176. f = os.path.join(directory[templates_path_length:], filename)
  177. result_templates.add(f)
  178. return result_templates
  179. def format_date_by_locale(date, locale_string):
  180. # strftime works only on dates after 1900
  181. if date.year <= 1900:
  182. return date.isoformat().split('T')[0]
  183. if locale_string == 'all':
  184. locale_string = settings['ui']['default_locale'] or 'en_US'
  185. # to avoid crashing if locale is not supported by babel
  186. try:
  187. formatted_date = format_date(date, locale=locale_string)
  188. except:
  189. formatted_date = format_date(date, "YYYY-MM-dd")
  190. return formatted_date
  191. def dict_subset(d, properties):
  192. result = {}
  193. for k in properties:
  194. if k in d:
  195. result[k] = d[k]
  196. return result
  197. def prettify_url(url, max_length=74):
  198. if len(url) > max_length:
  199. chunk_len = int(max_length / 2 + 1)
  200. return u'{0}[...]{1}'.format(url[:chunk_len], url[-chunk_len:])
  201. else:
  202. return url
  203. # get element in list or default value
  204. def list_get(a_list, index, default=None):
  205. if len(a_list) > index:
  206. return a_list[index]
  207. else:
  208. return default
  209. def get_torrent_size(filesize, filesize_multiplier):
  210. try:
  211. filesize = float(filesize)
  212. if filesize_multiplier == 'TB':
  213. filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
  214. elif filesize_multiplier == 'GB':
  215. filesize = int(filesize * 1024 * 1024 * 1024)
  216. elif filesize_multiplier == 'MB':
  217. filesize = int(filesize * 1024 * 1024)
  218. elif filesize_multiplier == 'KB':
  219. filesize = int(filesize * 1024)
  220. elif filesize_multiplier == 'TiB':
  221. filesize = int(filesize * 1000 * 1000 * 1000 * 1000)
  222. elif filesize_multiplier == 'GiB':
  223. filesize = int(filesize * 1000 * 1000 * 1000)
  224. elif filesize_multiplier == 'MiB':
  225. filesize = int(filesize * 1000 * 1000)
  226. elif filesize_multiplier == 'KiB':
  227. filesize = int(filesize * 1000)
  228. except:
  229. filesize = None
  230. return filesize
  231. def convert_str_to_int(number_str):
  232. if number_str.isdigit():
  233. return int(number_str)
  234. else:
  235. return 0
  236. # convert a variable to integer or return 0 if it's not a number
  237. def int_or_zero(num):
  238. if isinstance(num, list):
  239. if len(num) < 1:
  240. return 0
  241. num = num[0]
  242. return convert_str_to_int(num)
  243. def is_valid_lang(lang):
  244. is_abbr = (len(lang) == 2)
  245. lang = lang.lower().decode('utf-8')
  246. if is_abbr:
  247. for l in language_codes:
  248. if l[0][:2] == lang:
  249. return (True, l[0][:2], l[3].lower())
  250. return False
  251. else:
  252. for l in language_codes:
  253. if l[1].lower() == lang or l[3].lower() == lang:
  254. return (True, l[0][:2], l[3].lower())
  255. return False
  256. def _get_lang_to_lc_dict(lang_list):
  257. key = str(lang_list)
  258. value = lang_to_lc_cache.get(key, None)
  259. if value is None:
  260. value = dict()
  261. for lc in lang_list:
  262. value.setdefault(lc.split('-')[0], lc)
  263. lang_to_lc_cache[key] = value
  264. return value
  265. # auxiliary function to match lang_code in lang_list
  266. def _match_language(lang_code, lang_list=[], custom_aliases={}):
  267. # replace language code with a custom alias if necessary
  268. if lang_code in custom_aliases:
  269. lang_code = custom_aliases[lang_code]
  270. if lang_code in lang_list:
  271. return lang_code
  272. # try to get the most likely country for this language
  273. subtags = get_global('likely_subtags').get(lang_code)
  274. if subtags:
  275. subtag_parts = subtags.split('_')
  276. new_code = subtag_parts[0] + '-' + subtag_parts[-1]
  277. if new_code in custom_aliases:
  278. new_code = custom_aliases[new_code]
  279. if new_code in lang_list:
  280. return new_code
  281. # try to get the any supported country for this language
  282. return _get_lang_to_lc_dict(lang_list).get(lang_code, None)
  283. # get the language code from lang_list that best matches locale_code
  284. def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
  285. # try to get language from given locale_code
  286. language = _match_language(locale_code, lang_list, custom_aliases)
  287. if language:
  288. return language
  289. locale_parts = locale_code.split('-')
  290. lang_code = locale_parts[0]
  291. # try to get language using an equivalent country code
  292. if len(locale_parts) > 1:
  293. country_alias = get_global('territory_aliases').get(locale_parts[-1])
  294. if country_alias:
  295. language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
  296. if language:
  297. return language
  298. # try to get language using an equivalent language code
  299. alias = get_global('language_aliases').get(lang_code)
  300. if alias:
  301. language = _match_language(alias, lang_list, custom_aliases)
  302. if language:
  303. return language
  304. if lang_code != locale_code:
  305. # try to get language from given language without giving the country
  306. language = _match_language(lang_code, lang_list, custom_aliases)
  307. return language or fallback
  308. def load_module(filename, module_dir):
  309. modname = splitext(filename)[0]
  310. if modname in sys.modules:
  311. del sys.modules[modname]
  312. filepath = join(module_dir, filename)
  313. module = load_source(modname, filepath)
  314. module.name = modname
  315. return module
  316. def new_hmac(secret_key, url):
  317. try:
  318. secret_key_bytes = bytes(secret_key, 'utf-8')
  319. except TypeError as err:
  320. if isinstance(secret_key, bytes):
  321. secret_key_bytes = secret_key
  322. else:
  323. raise err
  324. if sys.version_info[0] == 2:
  325. return hmac.new(bytes(secret_key), url, hashlib.sha256).hexdigest()
  326. else:
  327. return hmac.new(secret_key_bytes, url, hashlib.sha256).hexdigest()
  328. def to_string(obj):
  329. if isinstance(obj, basestring):
  330. return obj
  331. if isinstance(obj, Number):
  332. return unicode(obj)
  333. if hasattr(obj, '__str__'):
  334. return obj.__str__()
  335. if hasattr(obj, '__repr__'):
  336. return obj.__repr__()
  337. def ecma_unescape(s):
  338. """
  339. python implementation of the unescape javascript function
  340. https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
  341. https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
  342. """
  343. # s = unicode(s)
  344. # "%u5409" becomes "吉"
  345. s = ecma_unescape4_re.sub(lambda e: unichr(int(e.group(1), 16)), s)
  346. # "%20" becomes " ", "%F3" becomes "ó"
  347. s = ecma_unescape2_re.sub(lambda e: unichr(int(e.group(1), 16)), s)
  348. return s
  349. def get_engine_from_settings(name):
  350. """Return engine configuration from settings.yml of a given engine name"""
  351. if 'engines' not in settings:
  352. return {}
  353. for engine in settings['engines']:
  354. if 'name' not in engine:
  355. continue
  356. if name == engine['name']:
  357. return engine
  358. return {}
  359. def get_xpath(xpath_str):
  360. result = xpath_cache.get(xpath_str, None)
  361. if result is None:
  362. result = XPath(xpath_str)
  363. xpath_cache[xpath_str] = result
  364. return result
  365. def eval_xpath(element, xpath_str):
  366. xpath = get_xpath(xpath_str)
  367. return xpath(element)