# yahoo_news.py
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Yahoo (News)
"""

import re
from datetime import datetime, timedelta
from urllib.parse import urlencode
from lxml import html
from searx.engines.yahoo import parse_url, language_aliases
from searx.engines.yahoo import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
from dateutil import parser
from searx.utils import extract_text, extract_url, match_language

# about
about = {
    "website": 'https://news.yahoo.com',
    "wikidata_id": 'Q3044717',
    "official_api_documentation": 'https://developer.yahoo.com/api/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['news']
paging = True  # engine supports the pageno request parameter

# search-url
# {query} is a pre-encoded "p=..." pair, {offset} the 1-based result offset,
# {lang} the two-letter language code.
# NOTE(review): "{lang}=uh3_news_web_gs_1" makes the language code a query
# *key*, which looks odd — verify against a live Yahoo News request.
search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}'  # noqa

# specific xpath variables
results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li'  # one <li> per result
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="compText"]'  # result summary text
publishedDate_xpath = './/span[contains(@class,"tri")]'  # relative or absolute date
suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
  34. # do search-request
  35. def request(query, params):
  36. offset = (params['pageno'] - 1) * 10 + 1
  37. if params['language'] == 'all':
  38. language = 'en'
  39. else:
  40. language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
  41. params['url'] = search_url.format(offset=offset,
  42. query=urlencode({'p': query}),
  43. lang=language)
  44. # TODO required?
  45. params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\
  46. .format(lang=language)
  47. return params
  48. def sanitize_url(url):
  49. if ".yahoo.com/" in url:
  50. return re.sub("\\;\\_ylt\\=.+$", "", url)
  51. else:
  52. return url
  53. # get response from search-request
  54. def response(resp):
  55. results = []
  56. dom = html.fromstring(resp.text)
  57. # parse results
  58. for result in dom.xpath(results_xpath):
  59. urls = result.xpath(url_xpath)
  60. if len(urls) != 1:
  61. continue
  62. url = sanitize_url(parse_url(extract_url(urls, search_url)))
  63. title = extract_text(result.xpath(title_xpath)[0])
  64. content = extract_text(result.xpath(content_xpath)[0])
  65. # parse publishedDate
  66. publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
  67. # still useful ?
  68. if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
  69. publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))
  70. elif re.match("^[0-9]+ days? ago$", publishedDate):
  71. publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group()))
  72. elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
  73. timeNumbers = re.findall(r'\d+', publishedDate)
  74. publishedDate = datetime.now()\
  75. - timedelta(hours=int(timeNumbers[0]))\
  76. - timedelta(minutes=int(timeNumbers[1]))
  77. else:
  78. try:
  79. publishedDate = parser.parse(publishedDate)
  80. except:
  81. publishedDate = datetime.now()
  82. if publishedDate.year == 1900:
  83. publishedDate = publishedDate.replace(year=datetime.now().year)
  84. # append result
  85. results.append({'url': url,
  86. 'title': title,
  87. 'content': content,
  88. 'publishedDate': publishedDate})
  89. # return results
  90. return results