# https_rewrite.py
  1. '''
  2. searx is free software: you can redistribute it and/or modify
  3. it under the terms of the GNU Affero General Public License as published by
  4. the Free Software Foundation, either version 3 of the License, or
  5. (at your option) any later version.
  6. searx is distributed in the hope that it will be useful,
  7. but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9. GNU Affero General Public License for more details.
  10. You should have received a copy of the GNU Affero General Public License
  11. along with searx. If not, see < http://www.gnu.org/licenses/ >.
  12. (C) 2013- by Adam Tauber, <asciimoo@gmail.com>
  13. '''
  14. import re
  15. from urllib.parse import urlparse
  16. from lxml import etree
  17. from os import listdir, environ
  18. from os.path import isfile, isdir, join
  19. from searx.plugins import logger
  20. from flask_babel import gettext
  21. from searx import searx_dir
  22. name = "HTTPS rewrite"
  23. description = gettext('Rewrite HTTP links to HTTPS if possible')
  24. default_on = True
  25. preference_section = 'privacy'
  26. if 'SEARX_HTTPS_REWRITE_PATH' in environ:
  27. rules_path = environ['SEARX_rules_path']
  28. else:
  29. rules_path = join(searx_dir, 'plugins/https_rules')
  30. logger = logger.getChild("https_rewrite")
  31. # https://gitweb.torproject.org/\
  32. # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
  33. # HTTPS rewrite rules
  34. https_rules = []
  35. # load single ruleset from a xml file
  36. def load_single_https_ruleset(rules_path):
  37. ruleset = ()
  38. # init parser
  39. parser = etree.XMLParser()
  40. # load and parse xml-file
  41. try:
  42. tree = etree.parse(rules_path, parser)
  43. except:
  44. # TODO, error message
  45. return ()
  46. # get root node
  47. root = tree.getroot()
  48. # check if root is a node with the name ruleset
  49. # TODO improve parsing
  50. if root.tag != 'ruleset':
  51. return ()
  52. # check if rule is deactivated by default
  53. if root.attrib.get('default_off'):
  54. return ()
  55. # check if rule does only work for specific platforms
  56. if root.attrib.get('platform'):
  57. return ()
  58. hosts = []
  59. rules = []
  60. exclusions = []
  61. # parse childs from ruleset
  62. for ruleset in root:
  63. # this child define a target
  64. if ruleset.tag == 'target':
  65. # check if required tags available
  66. if not ruleset.attrib.get('host'):
  67. continue
  68. # convert host-rule to valid regex
  69. host = ruleset.attrib.get('host')\
  70. .replace('.', r'\.').replace('*', '.*')
  71. # append to host list
  72. hosts.append(host)
  73. # this child define a rule
  74. elif ruleset.tag == 'rule':
  75. # check if required tags available
  76. if not ruleset.attrib.get('from')\
  77. or not ruleset.attrib.get('to'):
  78. continue
  79. # TODO hack, which convert a javascript regex group
  80. # into a valid python regex group
  81. rule_from = ruleset.attrib['from'].replace('$', '\\')
  82. if rule_from.endswith('\\'):
  83. rule_from = rule_from[:-1] + '$'
  84. rule_to = ruleset.attrib['to'].replace('$', '\\')
  85. if rule_to.endswith('\\'):
  86. rule_to = rule_to[:-1] + '$'
  87. # TODO, not working yet because of the hack above,
  88. # currently doing that in webapp.py
  89. # rule_from_rgx = re.compile(rule_from, re.I)
  90. # append rule
  91. try:
  92. rules.append((re.compile(rule_from, re.I | re.U), rule_to))
  93. except:
  94. # TODO log regex error
  95. continue
  96. # this child define an exclusion
  97. elif ruleset.tag == 'exclusion':
  98. # check if required tags available
  99. if not ruleset.attrib.get('pattern'):
  100. continue
  101. exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
  102. # append exclusion
  103. exclusions.append(exclusion_rgx)
  104. # convert list of possible hosts to a simple regex
  105. # TODO compress regex to improve performance
  106. try:
  107. target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
  108. except:
  109. return ()
  110. # return ruleset
  111. return (target_hosts, rules, exclusions)
  112. # load all https rewrite rules
  113. def load_https_rules(rules_path):
  114. # check if directory exists
  115. if not isdir(rules_path):
  116. logger.error("directory not found: '" + rules_path + "'")
  117. return
  118. # search all xml files which are stored in the https rule directory
  119. xml_files = [join(rules_path, f)
  120. for f in listdir(rules_path)
  121. if isfile(join(rules_path, f)) and f[-4:] == '.xml']
  122. # load xml-files
  123. for ruleset_file in xml_files:
  124. # calculate rewrite-rules
  125. ruleset = load_single_https_ruleset(ruleset_file)
  126. # skip if no ruleset returned
  127. if not ruleset:
  128. continue
  129. # append ruleset
  130. https_rules.append(ruleset)
  131. logger.info('{n} rules loaded'.format(n=len(https_rules)))
  132. def https_url_rewrite(result):
  133. skip_https_rewrite = False
  134. # check if HTTPS rewrite is possible
  135. for target, rules, exclusions in https_rules:
  136. # check if target regex match with url
  137. if target.match(result['parsed_url'].netloc):
  138. # process exclusions
  139. for exclusion in exclusions:
  140. # check if exclusion match with url
  141. if exclusion.match(result['url']):
  142. skip_https_rewrite = True
  143. break
  144. # skip https rewrite if required
  145. if skip_https_rewrite:
  146. break
  147. # process rules
  148. for rule in rules:
  149. try:
  150. new_result_url = rule[0].sub(rule[1], result['url'])
  151. except:
  152. break
  153. # parse new url
  154. new_parsed_url = urlparse(new_result_url)
  155. # continiue if nothing was rewritten
  156. if result['url'] == new_result_url:
  157. continue
  158. # get domainname from result
  159. # TODO, does only work correct with TLD's like
  160. # asdf.com, not for asdf.com.de
  161. # TODO, using publicsuffix instead of this rewrite rule
  162. old_result_domainname = '.'.join(
  163. result['parsed_url'].hostname.split('.')[-2:])
  164. new_result_domainname = '.'.join(
  165. new_parsed_url.hostname.split('.')[-2:])
  166. # check if rewritten hostname is the same,
  167. # to protect against wrong or malicious rewrite rules
  168. if old_result_domainname == new_result_domainname:
  169. # set new url
  170. result['url'] = new_result_url
  171. # target has matched, do not search over the other rules
  172. break
  173. return result
  174. def on_result(request, search, result):
  175. if 'parsed_url' not in result:
  176. return True
  177. if result['parsed_url'].scheme == 'http':
  178. https_url_rewrite(result)
  179. return True
# populate the module-level https_rules list from the on-disk rulesets
# as a side effect of importing this plugin
load_https_rules(rules_path)