  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. INA (Videos)
  4. """
  5. from json import loads
  6. from html import unescape
  7. from urllib.parse import urlencode
  8. from lxml import html
  9. from dateutil import parser
  10. from searx.utils import extract_text
  11. # about
  12. about = {
  13. "website": 'https://www.ina.fr/',
  14. "wikidata_id": 'Q1665109',
  15. "official_api_documentation": None,
  16. "use_official_api": False,
  17. "require_api_key": False,
  18. "results": 'HTML',
  19. }
  20. # engine dependent config
  21. categories = ['videos']
  22. paging = True
  23. page_size = 48
  24. # search-url
  25. base_url = 'https://www.ina.fr'
  26. search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}'
  27. # specific xpath variables
  28. results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]'
  29. url_xpath = './/a/@href'
  30. title_xpath = './/h3[@class="h3--title media-heading"]'
  31. thumbnail_xpath = './/img/@src'
  32. publishedDate_xpath = './/span[@class="broadcast"]'
  33. content_xpath = './/p[@class="media-body__summary"]'
  34. # do search-request
  35. def request(query, params):
  36. params['url'] = search_url.format(ps=page_size,
  37. start=params['pageno'] * page_size,
  38. query=urlencode({'q': query}))
  39. return params
  40. # get response from search-request
  41. def response(resp):
  42. results = []
  43. # we get html in a JSON container...
  44. response = loads(resp.text)
  45. if "content" not in response:
  46. return []
  47. dom = html.fromstring(response["content"])
  48. # parse results
  49. for result in dom.xpath(results_xpath):
  50. videoid = result.xpath(url_xpath)[0]
  51. url = base_url + videoid
  52. title = unescape(extract_text(result.xpath(title_xpath)))
  53. try:
  54. thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
  55. except:
  56. thumbnail = ''
  57. if thumbnail and thumbnail[0] == '/':
  58. thumbnail = base_url + thumbnail
  59. d = extract_text(result.xpath(publishedDate_xpath)[0])
  60. d = d.split('/')
  61. # force ISO date to avoid wrong parsing
  62. d = "%s-%s-%s" % (d[2], d[1], d[0])
  63. publishedDate = parser.parse(d)
  64. content = extract_text(result.xpath(content_xpath))
  65. # append result
  66. results.append({'url': url,
  67. 'title': title,
  68. 'content': content,
  69. 'template': 'videos.html',
  70. 'publishedDate': publishedDate,
  71. 'thumbnail': thumbnail})
  72. # return results
  73. return results