scrape_good_scummvm_games.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. """
Scrape the ScummVM compatibility list for games supported at "Good" or "Excellent" level,
and print YAML clone templates for those that are not yet among the OSGC originals.

Uses libraries:
- beautifulsoup4
- httpx
- pyyaml
- tenacity
  7. """
  8. import re
  9. from pathlib import Path
  10. from typing import Container, Optional
  11. import httpx
  12. import yaml
  13. from bs4 import BeautifulSoup
  14. from tenacity import stop_after_attempt, retry, wait_exponential
  15. from utils import originals
  16. SCUMMVM_LIST = "https://www.scummvm.org/compatibility/"
  17. SCUMMVM_BASE_URL = "https://www.scummvm.org"
  18. SUPPORT_LEVELS = {"Good", "Excellent"}
  19. PLATFORM_ALIASES = {
  20. "Apple IIgs": "Apple II",
  21. "Atari ST": "Atari",
  22. "Macintosh": "Classic Mac OS",
  23. "Sega CD": "Genesis",
  24. "Steam": "Windows",
  25. "Tandy Color Computer 3": "CoCo",
  26. }
  27. # These games are not games or compilations/demos that shouldn't have their own game entries
  28. BLACKLIST = {
  29. "Inside the Chest",
  30. "King's Questions",
  31. "Passport to Adventure (Indiana Jones and the Last Crusade, The Secret of Monkey Island, Loom)",
  32. "Mission Supernova 1",
  33. "Mission Supernova 2"
  34. }
  35. def main():
  36. # Get list of OSGC originals
  37. osgc_originals = set()
  38. for original in originals():
  39. osgc_originals.add(original["name"])
  40. for name in original.get("names", []):
  41. osgc_originals.add(name)
  42. # Get platforms
  43. platforms = yaml.safe_load(open(Path("schema") / "originals.yaml", encoding="utf-8"))["schema;platforms"]["enum"]
  44. # Get list of games
  45. resp = httpx.get(SCUMMVM_LIST)
  46. content = resp.text
  47. soup = BeautifulSoup(content, "html.parser")
  48. game_links = {}
  49. for td_name, td_support_level in zip(soup.find_all("td", class_="gameFullName"), soup.find_all("td", class_="gameSupportLevel")):
  50. # Filter out those that aren't good enough
  51. if td_support_level.text not in SUPPORT_LEVELS:
  52. continue
  53. name = td_name.text.strip()
  54. # Filter out engines
  55. if name.endswith(" games"):
  56. continue
  57. # Filter out blacklist
  58. if name in BLACKLIST:
  59. continue
  60. # Use name in parens if present
  61. if match := re.match(r".+ \((.+)\)", name):
  62. name = match.group(1)
  63. game_links[name] = SCUMMVM_BASE_URL + td_name.a.attrs["href"]
  64. # Generate originals list
  65. origs = list(game_links)
  66. # Filter out those we already have (match case-insensitive)
  67. def game_is_in_original(game: str) -> bool:
  68. if game in osgc_originals:
  69. return True
  70. # Try case-insensitive
  71. if game.lower() in {o.lower() for o in osgc_originals}:
  72. return True
  73. # Try using the name before or after the colon
  74. if (match := re.match(r"(.+):(.+)", game)) and (match.group(1).strip() in osgc_originals or match.group(2).strip() in osgc_originals):
  75. return True
  76. # Try matching without certain punctuation
  77. if game.replace("!", "") in {o.replace("!", "") for o in osgc_originals}:
  78. return True
  79. return False
  80. missing_originals = {original for original in origs if not game_is_in_original(original)}
  81. print("ScummVM originals:")
  82. for original in sorted(missing_originals):
  83. print(f"- {original}")
  84. for original in missing_originals:
  85. if game_info := scrape_game_info(game_links[original], platforms):
  86. print(yaml.dump(game_info))
  87. @retry(stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=10))
  88. def scrape_game_info(link: str, platforms: Container[str]) -> Optional[dict]:
  89. # Go to game subpage
  90. resp = httpx.get(link)
  91. content = resp.text
  92. soup = BeautifulSoup(content, "html.parser")
  93. # Don't add games that aren't clones
  94. if soup.find("a", string="ScummVM Freeware Games"):
  95. return None
  96. # Generate game entry, with name
  97. game = {
  98. "name": soup.find("td", class_="gameFullName").text,
  99. "external": {},
  100. "platforms": [],
  101. "meta": {
  102. "genres": [],
  103. "themes": [],
  104. }
  105. }
  106. # Add Supported Platforms
  107. supported_platforms_title = soup.find("h3", string="Supported Platforms")
  108. if supported_platforms_lis := supported_platforms_title.find_next_sibling("ul"):
  109. for li in supported_platforms_lis.find_all("li"):
  110. platform = li.text.strip()
  111. platform = PLATFORM_ALIASES.get(platform, platform)
  112. if platform not in platforms:
  113. print(f"{platform=} unknown")
  114. elif platform not in game["platforms"]:
  115. game["platforms"].append(platform)
  116. # Find links
  117. if wikipedia_link := soup.find("a", string="Wikipedia"):
  118. game["external"]["wikipedia"] = wikipedia_name(wikipedia_link.attrs["href"])
  119. if mobygames_link := soup.find("a", string="MobyGames"):
  120. game["external"]["website"] = mobygames_link.attrs["href"]
  121. if not wikipedia_link and not mobygames_link:
  122. # Use ScummVM wiki as fallback
  123. if scummvm_link := soup.find("a", string="ScummVM Wiki"):
  124. game["external"]["website"] = scummvm_link.attrs["href"]
  125. else:
  126. print(f"Cannot find link for {game['name']}")
  127. return game
  128. def wikipedia_name(link: str) -> str:
  129. """
  130. >>> wikipedia_name("https://en.wikipedia.org/wiki/Operation_Stealth")
  131. 'Operation Stealth'
  132. """
  133. return link.split("/")[-1].replace("_", " ")
# Run the scraper only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()