########################################################################
# Searx-Qt - Lightweight desktop application for Searx.
# Copyright (C) 2020-2022 CYBERDEViL
#
# This file is part of Searx-Qt.
#
# Searx-Qt is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Searx-Qt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
########################################################################

import time
import urllib.parse

from bs4 import BeautifulSoup

from searxqt.core.requests import JsonResult, ErrorType, Schemas, Result
from searxqt.core.handler import HandlerProto, NetworkTypes
from searxqt.utils.string import parseFilesize
from searxqt.translations import _


class LinkTokenResult(Result):
    def __init__(self, url, response, err="", errType=ErrorType.Success):
        Result.__init__(self, url, response, err=err, errType=errType)


## API result (format=json)
class SearchResult(JsonResult):
    Schema = Schemas['searxng_query']

    def __init__(self, url, response, err="", errType=ErrorType.Success):
        JsonResult.__init__(self, url, response, err=err, errType=errType)

    def verifyFurther(self):
        JsonResult.verifyFurther(self)

        # One of the following keys has to be not empty, else we count it as
        # no (usable) result.
        validKeys = [
            'results',
            'answers',
            'corrections',
            'infoboxes',
            'suggestions'
        ]

        if self._errType == ErrorType.Success:
            data = self.json()
            valid = False

            for key in validKeys:
                if len(data.get(key, [])):
                    valid = True
                    break

            if not valid:
                self._errType = ErrorType.NoResults
                self._err = f"NoResults: got: `{self.json()}`"


## HTML result that will be parsed into JSON
class SearchResult2(SearchResult):
    Schema = Schemas['searxng_query']

    def __init__(self, url, response, err="", errType=ErrorType.Success):
        ## @see https://github.com/searxng/searxng/blob/master/searx/botdetection/link_token.py
        self._linktoken = None

        SearchResult.__init__(self, url, response, err=err, errType=errType)

    @property
    def linktoken(self):
        return self._linktoken

    def makeUrlAbsolute(self, url):
        """! Returns an absolute URL. It will prepend the SearXNG instance's
        scheme and location when they are missing."""
        parsedUrl = urllib.parse.urlparse(url)
        instanceUrl = urllib.parse.urlparse(self.url())

        if not parsedUrl.netloc:
            url = f"{instanceUrl.netloc}{url}"

        if not parsedUrl.scheme:
            url = f"{instanceUrl.scheme}://{url}"

        return url
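
    # Illustrative sketch: assuming the result was fetched from a hypothetical
    # instance at "https://searx.example.org", a relative stylesheet href would
    # be completed like this:
    #
    #   self.makeUrlAbsolute("/client8uw9qw2jc3yhiq2c.css")
    #   # -> "https://searx.example.org/client8uw9qw2jc3yhiq2c.css"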

    def json(self):
        if self.errorType() != ErrorType.Success:
            return {}

        jsonResult = {
            'results': [],
            'answers': [],
            'corrections': [],
            'infoboxes': [],
            'suggestions': [],
            'unresponsive_engines': []
        }

        soup = BeautifulSoup(self.content(), "html.parser")

        # Find css bot detection file
        # <link rel="stylesheet" href="/client8uw9qw2jc3yhiq2c.css" type="text/css">
        for link in soup.find_all("link", {"rel": "stylesheet"}, href=True):
            href = link.get("href")
            if href.startswith("/client"):
                self._linktoken = self.makeUrlAbsolute(href)
                break

        ##########################################################################
        ## 'results' key
        ##########################################################################
        for result in soup.find_all("article", {"class": "result"}):
            """
            <article class="result result-default category-general qwant duckduckgo google">
              <a href="https://linuxize.com/post/curl-post-request/" class="url_wrapper" rel="noreferrer">
                <span class="url_o1">
                  <span class="url_i1">https://linuxize.com</span>
                </span>
                <span class="url_o2">
                  <span class="url_i2"> › post › curl-post-request</span>
                </span>
              </a>
              <h3>
                <a href="https://linuxize.com/post/curl-post-request/" rel="noreferrer">
                  How to make a <span class="highlight">POST</span>
                  <span class="highlight">request</span>
                  with <span class="highlight">curl</span>
                </a>
              </h3>
              <p class="content">
                Learn how to use <span class="highlight">curl</span>, a command-line utility for transferring data from or to a remote server, to make <span class="highlight">POST</span> requests. See examples of sending data, files, and JSON data with <span class="highlight">curl</span> options and options.
              </p>
              <div class="engines">
                <span>qwant</span>
                <span>duckduckgo</span>
                <span>google</span>
                <a href="https://web.archive.org/web/https://linuxize.com/post/curl-post-request/" class="cache_link" rel="noreferrer">
                  <svg SVG_STUFF .../></svg>
                  cached
                </a>
                &lrm;
              </div>
              <div class="break"></div>
            </article>
            """
            """
            <article class="result result-torrent category-files solidtorrents">
              <a href="https://solidtorrents.to/torrents/STUFF .../" class="url_wrapper" rel="noreferrer">
                <span class="url_o1">
                  <span class="url_i1">https://solidtorrents.to</span>
                </span>
                <span class="url_o2">
                  <span class="url_i2"> › torrents › SOME_NAME › SOME_HASH</span>
                </span>
              </a>
              <h3>
                <a href="https://solidtorrents.to/torrents/SOME_NAME/SOME_HASH/" rel="noreferrer">
                  <span class="highlight">SOME</span>-<span class="highlight">NAME</span>
                </a>
              </h3>
              <time class="published_date" datetime="2018-10-20 00:00:00" >Oct 20, 2018</time>
              <div class="highlight">Other/Archive</div>
              <p class="altlink">
                &bull;
                <a href="magnet:MAGNET_LINK ..." class="magnetlink" rel="noreferrer"><svg SVG_STUFF .../></svg>magnet link</a>
              </p>
              <p class="altlink">
                &bull;
                <a href="https://itorrents.org/torrent/TORRENT_LINK ..." class="torrentfile" rel="noreferrer">torrent file</a>
              </p>
              <p class="stat">
                &bull; Seeder
                <span class="badge">407</span>
                &bull; Leecher
                <span class="badge">748</span>
              </p>
              <p class="stat"> Filesize
                <span class="badge">2.88 GiB</span>
              </p>
              <div class="engines">
                <span>solidtorrents</span>
                <a href="https://web.archive.org/web/https://solidtorrents.to/torrents/TORRENT_STUFF ..." class="cache_link" rel="noreferrer"><svg SVG_STUFF .../></svg>cached</a>
                &lrm;
              </div>
              <div class="break"></div>
            </article>
            """
            title = ''
            url = ''
            content = ''
            engines = []
            publishedDate = ''
            magnetlink = ''
            torrentfile = ''
            filesize = 0
            files = 0  # TODO unused for now
            seed = None
            leech = None

            # !! GET Title
            try:
                title = result.h3.a.get_text().lstrip().rstrip()
            except AttributeError:
                print("Failed to get title")

            # !! GET URL
            try:
                url = result.h3.a.get("href")
            except AttributeError:
                print("Failed to get url")

            # !! GET Content
            felem = result.find("p", {"class": "content"})
            if felem:
                content = felem.get_text().lstrip().rstrip()

            # !! GET Engines
            felem = result.find("div", {"class": "engines"})
            if felem:
                for engine in felem.find_all("span"):
                    engines.append(engine.get_text().rstrip().lstrip())

            ## !! Get publishDate
            felem = result.find("time", {"class": "published_date"})
            if felem:
                publishedDate = felem.get("datetime", "")

            ## !! Get magnetlink
            felem = result.find("a", {"class": "magnetlink"})
            if felem:
                magnetlink = felem.get('href')

            ## !! Get torrentfile
            felem = result.find("a", {"class": "torrentfile"})
            if felem:
                torrentfile = felem.get('href')

            ## !! Get filesize
            for felem in result.find_all("span", {"class": "badge"}):
                if felem.previousSibling:
                    precedingText = felem.previousSibling
                    if "Filesize" in precedingText:
                        filesize = parseFilesize(felem.get_text().rstrip().lstrip())
                    elif "Seeder" in precedingText:
                        seed = felem.get_text()
                    elif "Leecher" in precedingText:
                        leech = felem.get_text()

            # !! Add result
            resultData = {
                'title': title,
                'url': url,
                'content': content,
                'engines': [engine for engine in engines],

                # Optional
                'publishedDate': publishedDate,

                # File attributes
                'magnetlink': magnetlink,
                'torrentfile': torrentfile,
                'filesize': filesize,
                'files': files,
                'img_format': ''  # TODO
            }

            if seed is not None:
                resultData.update({'seed': seed})

            if leech is not None:
                resultData.update({'leech': leech})

            jsonResult['results'].append(resultData)

        ##########################################################################
        ## 'suggestions' key
        ##########################################################################
        """
        <div id="sidebar">
          <div id="suggestions" role="complementary" aria-labelledby="suggestions-title">
            <details class="sidebar-collapsable">
              <summary class="title" id="suggestions-title">Suggestions</summary>
              <div class="wrapper">
                <form method="POST" action="/search">
                  <input type="hidden" name="q" value="curl post request json">
                  <input type="hidden" name="category_general" value="1">
                  <input type="hidden" name="language" value="auto">
                  <input type="hidden" name="time_range" value="">
                  <input type="hidden" name="safesearch" value="0">
                  <input type="hidden" name="theme" value="simple">
                  <input type="submit" class="suggestion" role="link" value="&bull; curl post request json">
        """
        felem = soup.find("div", {"id": "suggestions"})
        if felem:
            for suggestion in felem.find_all("input", {"name": "q"}):
                jsonResult['suggestions'].append(suggestion.get("value"))

        ##########################################################################
        ## 'answers' key
        ##########################################################################
        """
        <h4 class="title" id="answers-title">Answers : </h4>
        <div class="answer">
          <span>LONG TEXT ...</span>
          <a href="some url ..." class="answer-url">url text ...</a>
        </div>
        """
        for answer in soup.find_all("div", {"class": "answer"}):
            felem = answer.find("span")
            if felem:
                jsonResult['answers'].append(felem.get_text())

        ##########################################################################
        ## 'corrections' key
        ##########################################################################
        """ TODO """

        ##########################################################################
        ## 'infoboxes' key
        ##########################################################################
        """
        <details open="" class="sidebar-collapsable">
          <summary class="title">Info</summary>
          <aside class="infobox" aria-label="Banana">
            <h2 class="title"><bdi>Banana</bdi></h2>
            <img src="/image_proxy?url=long_image_url" title="Banana" alt="Banana">
            <p><bdi>LONG TEXT HERE ...</bdi></p>
            <div class="urls">
              <ul>
                <li class="url"><bdi><a href="https://en.wikipedia.org/wiki/Banana" rel="noreferrer">Wikipedia</a></bdi></li>
                <li class="url"><bdi><a href="http://www.wikidata.org/entity/Q503" rel="noreferrer">Wikidata</a></bdi></li>
              </ul>
            </div>
          </aside>
        </details>
        """
        """
        <details open="" class="sidebar-collapsable">
          <summary class="title">Info</summary>
          <aside class="infobox" aria-label="Water">
            <h2 class="title"><bdi>Water</bdi></h2>
            <img src="/image_proxy?url=long url .." title="Water" alt="Water">
            <p><bdi>LONG TEXT ...</bdi></p>
            <div class="attributes">
              <dl>
                <dt><bdi>Chemical formula :</bdi></dt>
                <dd><bdi>H₂O</bdi></dd>
              </dl>
            </div>
            <div class="urls">
              <ul>
                <li class="url"><bdi><a href="https://en.wikipedia.org/wiki/Water" rel="noreferrer">Wikipedia</a></bdi></li>
                <li class="url"><bdi><a href="http://www.wikidata.org/entity/Q283" rel="noreferrer">Wikidata</a></bdi></li>
              </ul>
            </div>
          </aside>
        </details>
        """
        """
        infoboxes = []
        ibox = {
            'infobox': 'str',
            'id': 'uri',
            'content': 'str',
            'img_src': 'uri' | null,
            'urls': [
                {
                    'title': 'str',
                    'url': 'uri',
                    'entity': 'str',
                    'official': true
                }
            ],
            'attributes': [
                {
                    'label': 'str',
                    'value': 'str',
                    'entity': 'str'
                }
            ],
            'engines': ['str'],
            'engine': 'str'
        }
        """
        for infobox in soup.find_all("aside", {"class": "infobox"}):
            title = ""
            id = ""
            content = ""
            img_src = ""
            urls = []
            attributes = []
            engines = []

            # Title
            felem = infobox.find("h2", {"class": "title"})
            if felem:
                title = felem.get_text().rstrip().lstrip()

            # ID
            # TODO

            # Content
            felem = infobox.find("p")
            if felem:
                felem = felem.find("bdi")
                if felem:
                    content = felem.get_text().rstrip().lstrip()

            # Image
            felem = infobox.find("img")
            if felem:
                img_src = felem.get("src")

            # URLs
            for felem in infobox.find_all("li", {"class": "url"}):
                felem = felem.find("a")
                if felem:
                    urls.append({
                        'title': felem.get_text().lstrip().rstrip(),
                        'url': felem.get("href", ""),
                        'entity': '',  # TODO
                        'official': False  # TODO
                    })

            # Attributes
            """
            <div class="attributes">
              <dl>
                <dt><bdi>Chemical formula :</bdi></dt>
                <dd><bdi>H₂O</bdi></dd>
              </dl>
            </div>
            """
            felem = infobox.find("div", {"class": "attributes"})
            if felem:
                for item in felem.find_all("dl"):
                    label = ""
                    value = ""
                    entity = ""  # TODO

                    try:
                        label = item.dt.bdi.get_text().rstrip().lstrip()
                        value = item.dd.bdi.get_text().rstrip().lstrip()
                    except AttributeError:
                        continue

                    attributes.append({
                        "label": label,
                        "value": value,
                        "entity": entity
                    })

            # Engines
            for url in urls:
                engines.append(url['title'].lower())

            jsonResult['infoboxes'].append({
                "infobox": title,
                "id": id,
                "content": content,
                "img_src": img_src,
                "urls": urls,
                "attributes": attributes,
                "engines": engines
            })

        ##########################################################################
        ## 'unresponsive_engines' key
        ##########################################################################
        """
        <div id="engines_msg">
          <details class="sidebar-collapsable" open="">
            <summary class="title" id="engines_msg-title">Messages from the search engines</summary>
            <div class="dialog-error" role="alert">
              <svg class="ion-icon-big" etc..></svg>
              <div>
                <p>
                  <strong>Error!</strong>
                  Engines cannot retrieve results:
                </p>
                <p>
                  brave (<a href="/stats?engine=brave" title="View error logs and submit a bug report">Suspended: too many requests</a>)
                </p>
                <p>
                  qwant (<a href="/stats?engine=qwant" title="View error logs and submit a bug report">Suspended: too many requests</a>)
                </p>
              </div>
            </div>
          </details>
        </div>
        """
        felem = soup.find("div", {"id": "engines_msg"})
        if felem:
            for errDialog in felem.find_all("div", {"class": "dialog-error"}):
                for p in errDialog.find_all("p"):
                    a = p.find("a")
                    if not a:
                        continue

                    engine, msg = p.get_text().split(" ", 1)
                    jsonResult['unresponsive_engines'].append([engine, msg])

        return jsonResult
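

# Note: the dict built by SearchResult2.json() above mirrors the shape of the
# JSON API response ('results', 'answers', 'corrections', 'infoboxes',
# 'suggestions', 'unresponsive_engines'), so the Schema and the
# verifyFurther() check inherited from SearchResult apply to parsed-HTML
# results as well.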


class SearxConfigResult(JsonResult):
    Schema = Schemas['searxng_config']

    def __init__(self, url, response, err="", errType=ErrorType.Success):
        JsonResult.__init__(self, url, response, err=err, errType=errType)


class Categories:
    types = {
        'general': (_('General'), 'category_general'),
        'files': (_('Files'), 'category_files'),
        'images': (_('Images'), 'category_images'),
        'videos': (_('Videos'), 'category_videos'),
        'it': (_('IT'), 'category_it'),
        'map': (_('Location'), 'category_map'),
        'music': (_('Music'), 'category_music'),
        'news': (_('News'), 'category_news'),
        'science': (_('Science'), 'category_science'),
        'social media': (_('Social'), 'category_social media'),
        'onions': (_('Onions'), 'category_onions'),
        'shopping': (_('Shopping'), 'category_shopping')
    }

    def __init__(self):
        self._options = {}
        self.__makeOptions()

    def __makeOptions(self):
        self._options.clear()
        for key, t in self.types.items():
            self._options.update({key: False})

    def reset(self):
        self.__makeOptions()

    def get(self, key):
        return self._options[key]

    def set(self, key, state):
        """
        @param key: One of the keys in Categories.types
        @type key: str

        @param state: Enabled / disabled state
        @type state: bool
        """
        self._options[key] = state

    def dict(self):
        newDict = {}
        for key, state in self._options.items():
            if state:
                newDict.update({self.types[key][1]: 'on'})

        return newDict

    def enabledKeys(self):
        """ Returns a list with enabled category keys (keys from
        Categories.types)
        """
        return [key for key, state in self._options.items() if state]
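

# Illustrative sketch of how Categories is meant to be driven (hypothetical
# usage; nothing in this module calls it exactly like this):
#
#   categories = Categories()
#   categories.set('general', True)
#   categories.set('it', True)
#   categories.dict()         # -> {'category_general': 'on', 'category_it': 'on'}
#   categories.enabledKeys()  # -> ['general', 'it']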


class Engines(list):
    def __init__(self):
        list.__init__(self)

    def dict(self):
        if not self:
            return {}

        return {
            'engines': ",".join(self)
        }


class SearX:
    Periods = {
        '': _('Anytime'),
        'day': _('Last day'),
        'week': _('Last week'),
        'month': _('Last month'),
        'year': _('Last year')
    }

    # https://github.com/asciimoo/searx/blob/master/searx/languages.py
    Languages = {
        '': _('No language'),
        'all': _('Default language'),
        'af-NA': 'Afrikaans - af-NA',
        'ca-AD': 'Català - ca-AD',
        'da-DK': 'Dansk - da-DK',
        'de': 'Deutsch - de',
        'de-AT': 'Deutsch (Österreich) - de-AT',
        'de-CH': 'Deutsch (Schweiz) - de-CH',
        'de-DE': 'Deutsch (Deutschland) - de-DE',
        'et-EE': 'Eesti - et-EE',
        'en': 'English - en',
        'en-AU': 'English (Australia) - en-AU',
        'en-CA': 'English (Canada) - en-CA',
        'en-GB': 'English (United Kingdom) - en-GB',
        'en-IE': 'English (Ireland) - en-IE',
        'en-IN': 'English (India) - en-IN',
        'en-NZ': 'English (New Zealand) - en-NZ',
        'en-PH': 'English (Philippines) - en-PH',
        'en-SG': 'English (Singapore) - en-SG',
        'en-US': 'English (United States) - en-US',
        'es': 'Español - es',
        'es-AR': 'Español (Argentina) - es-AR',
        'es-CL': 'Español (Chile) - es-CL',
        'es-ES': 'Español (España) - es-ES',
        'es-MX': 'Español (México) - es-MX',
        'fr': 'Français - fr',
        'fr-BE': 'Français (Belgique) - fr-BE',
        'fr-CA': 'Français (Canada) - fr-CA',
        'fr-CH': 'Français (Suisse) - fr-CH',
        'fr-FR': 'Français (France) - fr-FR',
        'hr-HR': 'Hrvatski - hr-HR',
        'id-ID': 'Indonesia - id-ID',
        'it-IT': 'Italiano - it-IT',
        'sw-KE': 'Kiswahili - sw-KE',
        'lv-LV': 'Latviešu - lv-LV',
        'lt-LT': 'Lietuvių - lt-LT',
        'hu-HU': 'Magyar - hu-HU',
        'ms-MY': 'Melayu - ms-MY',
        'nl': 'Nederlands - nl',
        'nl-BE': 'Nederlands (België) - nl-BE',
        'nl-NL': 'Nederlands (Nederland) - nl-NL',
        'nb-NO': 'Norsk Bokmål - nb-NO',
        'pl-PL': 'Polski - pl-PL',
        'pt': 'Português - pt',
        'pt-BR': 'Português (Brasil) - pt-BR',
        'pt-PT': 'Português (Portugal) - pt-PT',
        'ro-RO': 'Română - ro-RO',
        'sk-SK': 'Slovenčina - sk-SK',
        'sl-SI': 'Slovenščina - sl-SI',
        'sr-RS': 'Srpski - sr-RS',
        'fi-FI': 'Suomi - fi-FI',
        'sv-SE': 'Svenska - sv-SE',
        'vi-VN': 'Tiếng Việt - vi-VN',
        'tr-TR': 'Türkçe - tr-TR',
        'is-IS': 'Íslenska - is-IS',
        'cs-CZ': 'Čeština - cs-CZ',
        'el-GR': 'Ελληνικά - el-GR',
        'be-BY': 'Беларуская - be-BY',
        'bg-BG': 'Български - bg-BG',
        'ru-RU': 'Русский - ru-RU',
        'uk-UA': 'Українська - uk-UA',
        'hy-AM': 'Հայերեն - hy-AM',
        'he-IL': 'עברית - he-IL',
        'ar-SA': 'العربية - ar-SA',
        'fa-IR': 'فارسی - fa-IR',
        'th-TH': 'ไทย - th-TH',
        'zh': '中文 - zh',
        'zh-CN': '中文 (中国) - zh-CN',
        'zh-TW': '中文 (台灣) - zh-TW',
        'ja-JP': '日本語 - ja-JP',
        'ko-KR': '한국어 - ko-KR'
    }

    def __init__(self, requestHandler):
        self._requestHandler = requestHandler
        self._url = ""
        self._categories = Categories()
        self._engines = Engines()
        self._query = ""
        self._lang = ""
        self._pageno = ""      # int formatted as string
        self._timeRange = ""   # '', 'day', 'week', 'month' or 'year'
        self._safesearch = False
        self._parseHtml = True

    @property
    def categories(self): return self._categories

    @property
    def engines(self): return self._engines

    @property
    def url(self):
        """
        @return: Instance url
        @rtype: str
        """
        return self._url

    @url.setter
    def url(self, url):
        """
        @param url: Instance url
        @type url: str
        """
        self._url = url

    @property
    def query(self):
        """
        @return: Search query
        @rtype: str
        """
        return self._query

    @query.setter
    def query(self, q):
        """
        @param q: Search query
        @type q: str
        """
        self._query = q

    @property
    def lang(self):
        """
        @return: Language code
        @rtype: str
        """
        return self._lang

    @lang.setter
    def lang(self, lang):
        """
        @param lang: Language code
        @type lang: str
        """
        self._lang = lang

    @property
    def pageno(self):
        """
        @return: Page number
        @rtype: int
        """
        return int(self._pageno)

    @pageno.setter
    def pageno(self, i):
        """
        @param i: Page number
        @type i: int
        """
        self._pageno = str(i)

    @property
    def timeRange(self):
        """
        @return: Search time range ('', 'day', 'week', 'month' or 'year')
        @rtype: str
        """
        return self._timeRange

    @timeRange.setter
    def timeRange(self, value):
        """
        @param value: Key from SearX.Periods
        @type value: str
        """
        self._timeRange = value

    @property
    def safeSearch(self):
        """
        @return: Whether safe search is enabled or not.
        @rtype: bool
        """
        return self._safesearch

    @safeSearch.setter
    def safeSearch(self, state):
        """
        @param state: Enable/disable safe search.
        @type state: bool
        """
        self._safesearch = state

    @property
    def parseHtml(self):
        """
        @return: Whether parsing HTML is enabled; the JSON API will not be
                 used when this returns True.
        @rtype: bool
        """
        return self._parseHtml

    @parseHtml.setter
    def parseHtml(self, state):
        """
        @param state: Enable/disable parsing HTML instead of using the JSON API
        @type state: bool
        """
        self._parseHtml = state

    @property
    def requestKwargs(self):
        """ Returns the current data that will be sent with the POST
        request used for the search operation: the search query,
        language, page-number and enabled categories/engines.

        @rtype: dict
        """
        data = {
            "q": self.query,
            "safesearch": "1" if self.safeSearch else "0"
        }

        # Choose what resource to use (JSON API or HTML parser)
        if self.parseHtml:
            data.update({"theme": "simple"})
        else:
            data.update({"format": "json"})

        # Testing found that searx honors only the engines when both
        # engines and categories are set.
        if self.engines:
            data.update(self.engines.dict())
        elif self.categories:
            data.update(self.categories.dict())

        if self.lang:
            data.update({"language": self.lang})

        if self.pageno:
            data.update({"pageno": self.pageno})

        if self.timeRange:
            data.update({"time_range": self.timeRange})

        return data
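
    # For illustration (hypothetical values): with query "curl post request",
    # parseHtml left enabled, only the 'general' category enabled and pageno
    # set to 2, requestKwargs would be roughly:
    #
    #   {
    #       "q": "curl post request",
    #       "safesearch": "0",
    #       "theme": "simple",
    #       "category_general": "on",
    #       "pageno": 2
    #   }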

    def reset(self):
        self.url = ""
        self.timeRange = ""
        self.lang = ""
        self.pageno = 1
        self.categories.reset()
        self.engines.clear()
        self.query = ""

    def search(self):
        """ Perform the search operation with the currently set values.

        @returns: The result of this search.
        @rtype: SearchResult
        """
        rtype = SearchResult
        if self.parseHtml:
            rtype = SearchResult2

        result = self._requestHandler.post(
            urllib.parse.urljoin(self.url, "/search"),
            data=self.requestKwargs,
            ResultType=rtype
        )

        result = self.handleLinkToken(result)

        return result

    def handleLinkToken(self, result):
        """! Searx-Qt is not a bot
        @see https://github.com/searxng/searxng/blob/master/searx/botdetection/link_token.py

        @note variables in https://searx.instance/config:
            - bool ["limiter"]["botdetection.ip_limit.link_token"]
            - bool ["limiter"]["botdetection.ip_limit.pass_searxng_org"]
        """
        # Not relevant
        if result.errorType() != ErrorType.NoResults or not self.parseHtml:
            return result

        # No linktoken found
        if result.linktoken is None:
            return result

        # Request the dummy css
        linkResult = self._requestHandler.get(
            result.linktoken,
            ResultType=LinkTokenResult
        )

        # Failed to get dummy css
        if linkResult.errorType() != ErrorType.Success:
            return result

        # Redo the original request
        return self._requestHandler.post(
            result.url(),
            data=self.requestKwargs,
            ResultType=SearchResult2
        )
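

# Rough usage sketch (hypothetical instance URL; the request handler object is
# assumed to be created elsewhere in Searx-Qt and passed in by the caller):
#
#   searx = SearX(requestHandler)
#   searx.url = "https://searx.example.org"
#   searx.pageno = 1
#   searx.query = "curl post request"
#   searx.categories.set('general', True)
#
#   result = searx.search()
#   if result.errorType() == ErrorType.Success:
#       for item in result.json()['results']:
#           print(item['title'], item['url'])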


class SearxConfigHandler(HandlerProto):
    def __init__(self, requestsHandler):
        HandlerProto.__init__(self, requestsHandler)

    def updateInstance(self, url):
        newUrl = urllib.parse.urljoin(url, "/config")
        result = self.requestsHandler.get(newUrl, ResultType=SearxConfigResult)

        if result:
            instance = self.instances[url]
            j = result.json()

            """ Update instance version
            """
            instance.update({
                "version": j.get("version", "")
            })

            """ Update instance network_type to use our own network type
            definitions as class NetworkTypes (core/handler.py)
            """
            instance.update({"network_type": NetworkTypes.netTypeFromUrl(url)})

            """ Update Engines

            What we get:
                "engines": [
                    categories (list, str)
                    enabled (bool)
                    language_support (bool)
                    name (str)
                    paging (bool)
                    safesearch (bool)
                    shortcut (str)
                    supported_languages (list, str)
                    time_range_support (bool)
                    timeout (float)
                ]

            What instanceModel wants:
                "engines" : {
                    "not evil": {
                        "error_rate": 15,
                        "errors": [
                            0
                        ]
                    }
                }

            What enginesModel wants:
                "engines": {
                    "1337x": {
                        "categories": [
                            "videos"
                        ],
                        "language_support": true,
                        "paging": true,
                        "safesearch": false,
                        "shortcut": "1337x",
                        "time_range_support": false
                    },
            """
            newInstanceEngines = {}
            newEnginesEngines = {}

            for engine in j.get('engines', []):
                name = engine.get('name', "")
                if not name:
                    continue

                newInstanceEngines.update({
                    name: {}
                })

                if name not in self.engines:
                    newEnginesEngines.update({
                        name: {
                            "categories": list(engine.get('categories', [])),
                            "language_support": engine.get(
                                'language_support',
                                False
                            ),
                            "paging": engine.get('paging', False),
                            "safesearch": engine.get('safesearch', False),
                            "shortcut": engine.get('shortcut', ""),
                            "time_range_support": engine.get(
                                'time_range_support',
                                False
                            )
                        }
                    })

            instance.update({
                "engines": dict(newInstanceEngines)
            })
            self.engines.update(newEnginesEngines)

            """ Update instance lastUpdated
            """
            instance.update({
                "lastUpdated": time.time()
            })

            return True

        return False

    def addInstance(self, url):
        if url not in self.instances:
            self._instances[url] = {}
            return True

        return False

    def removeInstance(self, url):
        """
        @param url: url of the instance to remove.
        @type url: str
        """
        del self._instances[url]

    def removeMultiInstances(self, urls):
        """ Remove instance(s) by url without emitting changed for every
        instance that got removed.

        @param urls: list with urls of instances to remove.
        @type urls: list
        """
        for url in urls:
            del self._instances[url]