scrape_good_scummvm_games.py 4.3 KB

  1. """
  2. Scrape ScummVM supported games at good or excellent level, and create YAML clone templates
  3. Uses libraries:
  4. - aiohttp
  5. - asyncio
  6. - beautifulsoup4
  7. - tenacity
  8. """
import asyncio
from pathlib import Path
from typing import Container, Optional

import aiohttp
import yaml
from bs4 import BeautifulSoup
from tenacity import stop_after_attempt, retry, wait_exponential

SCUMMVM_LIST = "https://www.scummvm.org/compatibility/"
SCUMMVM_BASE_URL = "https://www.scummvm.org"
SUPPORT_LEVELS = {"Good", "Excellent"}
PLATFORM_ALIASES = {
    "Apple IIgs": "Apple II",
    "Atari ST": "Atari",
    "Macintosh": "Classic Mac OS",
    "Steam": "Windows",
    "Tandy Color Computer 3": "CoCo",
}
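
# The alias table above folds ScummVM's platform labels into the platform names
# defined in schema/originals.yaml; platforms still unknown after aliasing are
# reported by scrape_game_info() rather than added to the generated entry.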


async def main():
    # Get list of OSGC originals
    osgc_originals = set()
    for p in Path("originals").iterdir():
        if p.is_file() and p.suffix == ".yaml":
            originals = yaml.safe_load(open(p, encoding="utf-8"))
            for original in originals:
                osgc_originals.add(original["name"])

    # Get platforms
    platforms = yaml.safe_load(open(Path("schema") / "originals.yaml", encoding="utf-8"))["schema;platforms"]["enum"]

    # Get list of games
    async with aiohttp.ClientSession() as session:
        async with session.get(SCUMMVM_LIST) as resp:
            content = await resp.text()
        soup = BeautifulSoup(content, "html.parser")
        game_links = {}
        for td_name, td_support_level in zip(
            soup.find_all("td", class_="gameFullName"), soup.find_all("td", class_="gameSupportLevel")
        ):
            # Filter out those that aren't good enough
            if td_support_level.text not in SUPPORT_LEVELS:
                continue
            game_links[td_name.text] = SCUMMVM_BASE_URL + td_name.a.attrs["href"]

        # Generate originals list
        originals = list(game_links)
        print("ScummVM originals:")
        for original in sorted(originals):
            print(f"- {original}")

        # Filter out those we already have
        missing_originals = {original for original in originals if original not in osgc_originals}
        for original in missing_originals:
            if game_info := await scrape_game_info(session, game_links[original], platforms):
                print(yaml.dump(game_info))
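

# Retry policy: up to 10 attempts per game page with exponential backoff (2-10 s),
# so a transient network error or rate limiting does not abort the whole run.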
@retry(stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=10))
async def scrape_game_info(session: aiohttp.ClientSession, link: str, platforms: Container[str]) -> Optional[dict]:
    # Go to game subpage
    async with session.get(link) as resp:
        content = await resp.text()
    soup = BeautifulSoup(content, "html.parser")

    # Don't add games that aren't clones
    if soup.find("a", string="ScummVM Freeware Games"):
        return None

    # Generate game entry, with name
    game = {
        "name": soup.find("td", class_="gameFullName").text,
        "external": {},
        "platforms": [],
        "meta": {
            "genres": [],
            "themes": [],
        }
    }

    # Add Supported Platforms
    supported_platforms_title = soup.find("h3", string="Supported Platforms")
    if supported_platforms_lis := supported_platforms_title.find_next_sibling("ul"):
        for li in supported_platforms_lis.find_all("li"):
            platform = li.text.strip()
            platform = PLATFORM_ALIASES.get(platform, platform)
            if platform not in platforms:
                print(f"{platform=} unknown")
            elif platform not in game["platforms"]:
                game["platforms"].append(platform)

    # Find links
    if wikipedia_link := soup.find("a", string="Wikipedia"):
        game["external"]["wikipedia"] = wikipedia_name(wikipedia_link.attrs["href"])
    if mobygames_link := soup.find("a", string="MobyGames"):
        game["external"]["website"] = mobygames_link.attrs["href"]
    if not wikipedia_link and not mobygames_link:
        # Use ScummVM wiki as fallback
        if scummvm_link := soup.find("a", string="ScummVM Wiki"):
            game["external"]["website"] = scummvm_link.attrs["href"]
        else:
            print(f"Cannot find link for {game['name']}")
    return game


def wikipedia_name(link: str) -> str:
    """
    >>> wikipedia_name("https://en.wikipedia.org/wiki/Operation_Stealth")
    'Operation Stealth'
    """
    return link.split("/")[-1].replace("_", " ")


if __name__ == "__main__":
    asyncio.run(main())
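
# Usage sketch (assumes the script is run from the repository root, alongside the
# "originals" and "schema" directories it reads; output goes to stdout):
#     python scrape_good_scummvm_games.py
# The embedded doctest can be checked with:
#     python -m doctest scrape_good_scummvm_games.py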