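"""internet.py — web search helpers.

Queries DuckDuckGo for a prompt, optionally scrapes the result pages with
BeautifulSoup, and formats everything into a search-augmented prompt for a
language model.
"""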
from __future__ import annotations

import asyncio

from aiohttp import ClientSession, ClientTimeout

from ...errors import MissingRequirementsError

try:
    # Optional dependencies: search and scraping are unavailable without them.
    from duckduckgo_search import DDGS
    from bs4 import BeautifulSoup
    has_requirements = True
except ImportError:
    has_requirements = False
class SearchResults:
    def __init__(self, results: list):
        self.results = results

    def __iter__(self):
        yield from self.results

    def __str__(self):
        search = ""
        for idx, result in enumerate(self.results):
            if search:
                search += "\n\n\n"
            search += f"Title: {result.title}\n\n"
            if result.text:
                search += result.text
            else:
                search += result.snippet
            search += f"\n\nSource: [[{idx}]]({result.url})"
        return search
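# Rendering sketch (illustrative, not part of the original file): for a single
# entry, str(SearchResults([entry])) produces
#
#   Title: <title>
#
#   <scraped text, or the snippet if no text was fetched>
#
#   Source: [[0]](<url>)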
class SearchResultEntry:
    def __init__(self, title: str, url: str, snippet: str, text: str = None):
        self.title = title
        self.url = url
        self.snippet = snippet
        self.text = text

    def set_text(self, text: str):
        self.text = text
def scrape_text(html: str, max_words: int = None) -> str:
    """Extract readable paragraph text from an HTML page, up to max_words."""
    soup = BeautifulSoup(html, "html.parser")
    for exclude in soup(["script", "style"]):
        exclude.extract()
    # Narrow the document to its main content area if a known container exists.
    for selector in [
        "main",
        ".main-content-wrapper",
        ".main-content",
        ".emt-container-inner",
        ".content-wrapper",
        "#content",
        "#mainContent",
    ]:
        select = soup.select_one(selector)
        if select:
            soup = select
            break
    # Site-specific cleanup (e.g. ZDNet's disclosure banner).
    for remove in [".c-globalDisclosure"]:
        select = soup.select_one(remove)
        if select:
            select.extract()
    clean_text = ""
    for paragraph in soup.select("p"):
        text = paragraph.get_text()
        for line in text.splitlines():
            words = [word for word in line.replace("\t", " ").split(" ") if word]
            if not words:
                continue
            if max_words is not None:
                max_words -= len(words)
                if max_words <= 0:
                    # Word budget exhausted: stop scraping entirely.
                    return clean_text
            if clean_text:
                clean_text += "\n"
            clean_text += " ".join(words)
    return clean_text
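# Illustrative usage (not part of the original file): only <p> text inside the
# main container survives, and whitespace is collapsed, e.g.
#   scrape_text("<main><p>Hello\t \tworld</p><script>x()</script></main>")
# returns "Hello world".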
async def fetch_and_scrape(session: ClientSession, url: str, max_words: int = None) -> str | None:
    try:
        async with session.get(url) as response:
            if response.status == 200:
                html = await response.text()
                return scrape_text(html, max_words)
    except Exception:
        # Network and decoding errors are swallowed; the caller treats a
        # missing text as "use the snippet instead".
        return None
async def search(query: str, n_results: int = 5, max_words: int = 2500, add_text: bool = True) -> SearchResults:
    if not has_requirements:
        raise MissingRequirementsError('Install "duckduckgo-search" and "beautifulsoup4" packages')
    with DDGS() as ddgs:
        results = []
        for result in ddgs.text(
            query,
            region="wt-wt",
            safesearch="moderate",
            timelimit="y",
        ):
            results.append(SearchResultEntry(
                result["title"],
                result["href"],
                result["body"]
            ))
            if len(results) >= n_results:
                break
        if add_text:
            requests = []
            async with ClientSession(timeout=ClientTimeout(5)) as session:
                for entry in results:
                    # Split the word budget across the scraped pages
                    # (guarded so a single result cannot divide by zero).
                    requests.append(fetch_and_scrape(
                        session, entry.url, max_words // max(n_results - 1, 1)
                    ))
                texts = await asyncio.gather(*requests)
        formatted_results = []
        left_words = max_words
        for i, entry in enumerate(results):
            if add_text:
                entry.text = texts[i]
            if left_words:
                left_words -= entry.title.count(" ") + 5
                if entry.text:
                    left_words -= entry.text.count(" ")
                else:
                    left_words -= entry.snippet.count(" ")
                if left_words < 0:
                    break
            formatted_results.append(entry)
        return SearchResults(formatted_results)
def get_search_message(prompt: str) -> str:
    try:
        search_results = asyncio.run(search(prompt))
        message = f"""
{search_results}

Instruction: Using the provided web search results, write a comprehensive reply to the user request.
Make sure to cite sources using [[Number]](Url) notation after each reference. Example: [[0]](http://google.com)
If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.
User request:
{prompt}
"""
        return message
    except Exception as e:
        print("Couldn't do web search:", e)
        return prompt
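# Illustrative usage (assumes this module is imported as part of its package,
# since it uses relative imports, and that network access and the optional
# dependencies are available; the query string is only an example):
#
#     from .internet import get_search_message, search
#     print(get_search_message("current weather in Berlin"))
#
#     # Or fetch structured results with a custom word budget:
#     results = asyncio.run(search("current weather in Berlin", max_words=1000))
#     print(str(results))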