direct-ingestion.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. #!/usr/bin/env python3
  2. # vim: tabstop=4 shiftwidth=4 expandtab
  3. import antennaDB
  4. from os import getenv
  5. import datetime
  6. import time
  7. import configparser
  8. import argparse
  9. import gemcall
  10. import URLHelper
  11. from multiFeedParsing import parsetwtxt,parsegemsub,parsexml,FeedEntry,TwtxtEntry
  12. from pageGeneration import updateStatFile,generateFeedPage,generateAtomFeed,generateTwtxtPage,generateTwtxtFile
  13. # Uniform log messages and output
  14. def log(msg: str = "", logfile: str = "", response:int = None) -> None:
  15. log = open(logfile, "a")
  16. log.write(datetime.datetime.utcnow().isoformat() + " " + msg + "\n")
  17. log.close()
  18. if response:
  19. print(f"{str(response)} {msg}\r\n")
  20. def getFeed(uh: URLHelper = None,feedurl: str = None, redirect: int = 0, redirectLimit: int = 1) -> { "response": gemcall.Response, "url": str } :
  21. response = None
  22. url = None
  23. try:
  24. url = uh.resolve(feedurl)
  25. except:
  26. log(msg = f"ERROR: '{url}' is not a valid URL",logfile = logfile, response = 40)
  27. # Have we already redirected too many times?
  28. if url and redirect <= redirectLimit:
  29. if not uh.mightBeAURL(url):
  30. log(msg = f"ERROR: pretty sure '{url}' is not a real URL...",logfile = logfile, response = 40)
  31. elif uh.isBlocked(url):
  32. log(msg = f"ERROR: feed URL '{url}' is blocked by rules.",logfile = logfile, response = 40)
  33. db.deleteFeeds([url])
  34. else:
  35. try:
  36. response = gemcall.request(url)
  37. except:
  38. log(msg = f"ERROR: failed to fetch feed from '{url}'",logfile = logfile, response = 40)
  39. response = None
  40. if response and (response.responsecode == 30 or response.responsecode == 31):
  41. log(msg = f"INFO: following redirect from '{url}' to '{response.meta}'.",logfile = logfile)
  42. return getFeed(uh = uh, feedurl = response.meta, redirect = redirect + 1, redirectLimit = redirectLimit)
  43. # If it's neither a redirect nor a successful response
  44. elif response and response.responsecode != 20:
  45. log(msg = f"ERROR: bad response for feed '{url}': '{str(response.responsecode)} {response.meta}'",logfile = logfile, response = 40)
  46. response = None
  47. elif url:
  48. log(msg = f"ERROR: Will not follow a redirect from '{url}' to '{response.meta}'",logfile = logfile, response = 40)
  49. # If we received a proper response then return it
  50. return { "response":response, "url":url }
  51. feedurl = getenv('QUERY_STRING')
  52. if not feedurl:
  53. print("10 Feed URL:\r\n")
  54. exit()
  55. argparser = argparse.ArgumentParser(description="Ingest feed $QUERY_STRING to Antenna, according to settings in --config file.")
  56. argparser.add_argument("--config", help="Path to config file.")
  57. args = argparser.parse_args()
  58. if not args.config:
  59. argparser.print_help()
  60. exit()
  61. config = configparser.ConfigParser()
  62. config.read(args.config)
  63. nonpublic = config["nonpublic"]
  64. public = config["public"]
  65. rules = config["rules"]
  66. rootdir = nonpublic.get("rootdir","")
  67. outputdir = public.get("outputdir","")
  68. if not rootdir:
  69. print("ERROR: The value 'rootdir' is missing in the [nonpublic] section of config file.")
  70. exit(1)
  71. if not outputdir:
  72. print("ERROR: The value 'outputdir' is missing in the [public] section of config file.")
  73. exit(1)
  74. db = antennaDB.AntennaDB(f"{rootdir}/{nonpublic.get('db','antenna.sqlite')}")
  75. blocklist = f"{rootdir}/{nonpublic.get('blocklist','blocklist.txt')}"
  76. logfile = f"{rootdir}/{nonpublic.get('logfile','antenna.log')}"
  77. uh = URLHelper.URLHelper(blocklist = blocklist)
  78. agelimit = int(time.mktime(datetime.datetime.utcnow().utctimetuple())) - (3600*24*int(rules.get('agelimit','7')))
  79. feedResponse = getFeed(uh, feedurl)
  80. response = feedResponse["response"]
  81. correctedfeedurl = feedResponse["url"]
  82. if not response:
  83. exit()
  84. try: # 300kb should be enough for most feeds
  85. feed = response.read(300*1024).decode('UTF-8')
  86. except:
  87. log(msg = f"ERROR: failed to properly read content from '{correctedfeedurl}'",logfile = logfile, response = 40)
  88. exit()
  89. # Since we received an updated feed we'll start by removing existing entries
  90. log(msg = f"INFO: fetched feed from '{correctedfeedurl}', removing from DB",logfile = logfile)
  91. db.deleteFeeds([correctedfeedurl])
  92. log(msg = f"INFO: attempting to parse feed '{correctedfeedurl}' as gemlog feed",logfile = logfile)
  93. preliminaryEntries = parsegemsub(feed, correctedfeedurl) or parsexml(feed, correctedfeedurl)
  94. entries = []
  95. while len(preliminaryEntries) > 0:
  96. entry = preliminaryEntries.pop()
  97. if not uh.isBlocked(entry.link):
  98. entries.append(entry)
  99. else:
  100. log(msg = f"ERROR: entry URL '{entry.link}' is blocked by rules.",logfile = logfile)
  101. if entries:
  102. db.insertFeedEntries(entries, agelimit)
  103. else:
  104. log(msg = f"INFO: attempting to parse feed '{correctedfeedurl}' as twtxt",logfile = logfile)
  105. db.insertTwtxtEntries(parsetwtxt(feed, correctedfeedurl), agelimit)
  106. db.pruneDB(agelimit)
  107. # Short of getting all entries and comparing them fully to each other there's really no way to tell whether anything needs regeneration.
  108. twts = db.getTwts()
  109. feedEntries = db.getEntries()
  110. feedURLs = set()
  111. for entry in feedEntries:
  112. feedURLs.add(entry.feedurl)
  113. # twtgmi = twtxt.gmi
  114. # twtxt = twtxt.txt
  115. # atom = atom.xml
  116. # stats = stats.tsv
  117. # index = index.gmi
  118. updateStatFile(len(feedURLs), len(feedEntries), f"{outputdir}/{public.get('stats','stats.tsv')}")
  119. generateFeedPage(feedEntries, f"{outputdir}/{public.get('index', 'index.gmi')}")
  120. generateAtomFeed(feedEntries, f"{outputdir}/{public.get('atom', 'atom.xml')}")
  121. generateTwtxtPage(twts, f"{outputdir}/{public.get('twtgmi', 'twtxt.gmi')}")
  122. generateTwtxtPage(twts, f"{outputdir}/{public.get('twtxt', 'twtxt.txt')}")
  123. print("20 text/gemini\r\nThank you for your submission! Antenna has now been updated.\n")