ingestfeeds.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. #!/usr/bin/env python3
  2. # vim: tabstop=4 shiftwidth=4 expandtab
  3. import html
  4. import time
  5. import urllib.parse
  6. from argparse import Namespace
  7. from collections.abc import Iterable
  8. from datetime import datetime
  9. from typing import Optional
  10. import gemcall
  11. from gemini_antenna.URLHelper import URLHelper
  12. from gemini_antenna.db import AntennaDB
  13. from gemini_antenna.signoffs import getsig
  14. from gemini_antenna.multiFeedParsing import (parsetwtxt, parsegemsub, parsexml,
  15. FeedEntry, TwtxtEntry)
# Register the "gemini" scheme with urllib.parse. Without these entries
# urllib.parse.urljoin() (used below to resolve redirect targets) does not
# resolve relative references against gemini:// base URLs.
urllib.parse.uses_relative.append("gemini")
urllib.parse.uses_netloc.append("gemini")
  18. def formatTime(timestamp: Optional[float], dateformat: str) -> str:
  19. """
  20. Convert a timestamp to a string according to a format specification.
  21. :param timestamp: Seconds since the Epoch. If `None`, current time is used.
  22. :param dateformat: strftime(3) format specification.
  23. """
  24. if timestamp is None:
  25. timestamp = datetime.utcnow()
  26. utc = datetime.utcfromtimestamp(timestamp)
  27. return utc.strftime(dateformat)
  28. def printPage(pagetext: str, output=None) -> None:
  29. """ Generate a text/gemini page. """
  30. if output:
  31. with open(output, "w") as outputfile:
  32. outputfile.write(pagetext)
  33. else:
  34. print(pagetext)
  35. def updateStatFile(urlNr: int, entryNr: int, output=None) -> None:
  36. # avoid ZeroDivisionError
  37. if urlNr == 0:
  38. return
  39. timestr = formatTime(None, '%Y-%m-%dT%H:%M:%SZ')
  40. line = "{0}\t{1}\t{2}\t{3:.2}".format(timestr, urlNr, entryNr, entryNr / urlNr)
  41. if output:
  42. with open(output, "a") as outputfile:
  43. print(line, file=outputfile)
  44. else:
  45. print(line)
  46. def generateFeedPage(entries: Iterable[FeedEntry], output=None) -> None:
  47. pagetext='''# Antenna
  48. ## Receiving Transmissions From Geminispace
  49. => about.gmi What is this?
  50. => cgi-bin/submit Send transmission!
  51. '''
  52. datestamp = "0000-00-00"
  53. for entry in entries:
  54. timestamp = formatTime(entry.updated, '%Y-%m-%d')
  55. if not datestamp == timestamp:
  56. datestamp = timestamp
  57. pagetext += "\n"
  58. pagetext += f"=> {entry.link} {timestamp} {entry.author}: {entry.title}\n"
  59. pagetext += f'''
  60. > {getsig()}
  61. => cgi-bin/log Tail the log
  62. => twtxt.gmi Antenna's twtxt page.
  63. => atom.xml Antenna's Atom feed.
  64. => cgi-bin/filter Customize your Antenna view.
  65. => stats.tsv Check out the latest statistics in tsv format.
  66. '''
  67. printPage(pagetext, output)
  68. def generateAtomFeed(entries: Iterable[FeedEntry], output=None) -> None:
  69. pagetext=f'''<?xml version="1.0" encoding="UTF-8"?><feed xmlns="http://www.w3.org/2005/Atom">
  70. <title>Antenna</title>
  71. <id>gemini://warmedal.se/~antenna/</id>
  72. <updated>{formatTime(None, '%Y-%m-%dT%H:%M:%SZ')}</updated>
  73. <subtitle>Receiving transmissions from geminispace</subtitle>
  74. <link href="gemini://warmedal.se/~antenna/" rel="alternate"></link>
  75. <link href="gemini://warmedal.se/~antenna/atom.xml" rel="self"></link>
  76. '''
  77. for entry in entries:
  78. timestamp = formatTime(entry.updated, '%Y-%m-%dT%H:%M:%SZ')
  79. pagetext += f''' <entry>
  80. <id>{html.escape(entry.link)}</id>
  81. <title>{html.escape(entry.title)}</title>
  82. <updated>{timestamp}</updated>
  83. <link href="{html.escape(entry.link)}" rel="alternate"></link>
  84. <author>
  85. <name>{html.escape(entry.author)}</name>
  86. </author>
  87. </entry>
  88. '''
  89. pagetext += "</feed>"
  90. printPage(pagetext, output)
  91. def generateTwtxtPage(entries: Iterable[TwtxtEntry], output=None):
  92. pagetext='''# Antenna Twtxt
  93. => twtxt.txt Plain text version
  94. '''
  95. for entry in entries:
  96. timestamp = formatTime(entry.posted, '%Y-%m-%dT%H:%M:%SZ')
  97. pagetext += f"{entry.author} {entry.feedurl}\n"
  98. pagetext += f"> {timestamp} {entry.twt}\n\n"
  99. pagetext += "\n=> cgi-bin/log Tail the log\n\n"
  100. pagetext += "> " + getsig() + "\n"
  101. printPage(pagetext, output)
  102. def generateTwtxtFile(entries: Iterable[TwtxtEntry], output=None):
  103. pagetext = ""
  104. for entry in entries:
  105. timestamp = formatTime(entry.posted, '%Y-%m-%dT%H:%M:%SZ')
  106. pagetext += f"{entry.author}\t{entry.feedurl}\t{timestamp}\t{entry.twt}\n"
  107. printPage(pagetext, output)
  108. def generateIndex(db: AntennaDB,
  109. feedPage: Optional[str] = None,
  110. atomFeedPage: Optional[str] = None,
  111. statFile: Optional[str] = None) -> None:
  112. logEntries = db.getEntries()
  113. feedURLs = {entry.feedurl for entry in logEntries}
  114. generateFeedPage(logEntries, feedPage)
  115. generateAtomFeed(logEntries, atomFeedPage)
  116. updateStatFile(len(feedURLs), len(logEntries), statFile)
  117. def generateTwtxt(db: AntennaDB,
  118. twtxtPage: Optional[str] = None,
  119. twtxtFile: Optional[str] = None) -> None:
  120. twts = db.getTwts()
  121. generateTwtxtPage(twts, twtxtPage)
  122. generateTwtxtFile(twts, twtxtFile)
  123. def actionRefresh(args: Namespace) -> None:
  124. def log(msg):
  125. """ Uniform log messages. """
  126. timestamp = datetime.utcnow().isoformat()
  127. with open(args.dataroot / "antenna.log", "a") as logfile:
  128. print(timestamp, msg, file=logfile)
  129. # *Phew* That was a lot! Time to get started on all those feeds.
  130. db = AntennaDB(args.dataroot / "antenna.sqlite")
  131. feedqueue = set(db.getQueue())
  132. # Let's not do anything if we don't have to (or really want to)
  133. if not feedqueue and not args.force:
  134. return
  135. log("INFO: validating feeds: " + str(feedqueue))
  136. uh = URLHelper(args.dataroot / "blocklist.txt")
  137. removefromqueue = []
  138. newFeedEntries = 0
  139. newTwtxtEntries = 0
  140. agelimit = int(time.mktime(datetime.utcnow().utctimetuple())) - 3600*24*7
  141. for feedurl in feedqueue:
  142. # A path may contain '.' and '..' fragments, we'll have to resolve them
  143. correctedfeedurl = uh.resolve(feedurl)
  144. if not uh.mightBeAURL(correctedfeedurl):
  145. log(f"ERROR: pretty sure '{feedurl}' is not a real URL...")
  146. removefromqueue.append(feedurl)
  147. continue
  148. # This is a bit messy, but we want to allow a few redirects and
  149. # still keep track of which URL we're actually fetching.
  150. response = None
  151. entries = []
  152. for _ in range(3):
  153. if uh.isBlocked(correctedfeedurl):
  154. log(f"ERROR: feed URL '{feedurl}' is blocked by rules.")
  155. removefromqueue.append(feedurl)
  156. break
  157. try:
  158. response = gemcall.request(correctedfeedurl)
  159. except Exception:
  160. log(f"ERROR: failed to fetch feed from '{correctedfeedurl}'")
  161. break
  162. if response.responsecode in (30, 31):
  163. log(f"INFO: following redirect from '{correctedfeedurl}' "
  164. f"to '{response.meta}'.")
  165. correctedfeedurl = uh.resolve(
  166. urllib.parse.urljoin(correctedfeedurl, response.meta)
  167. )
  168. elif response.responsecode != 20:
  169. log(f"ERROR: bad response for feed '{correctedfeedurl}': "
  170. f"{str(response.responsecode)} {response.meta}")
  171. removefromqueue.append(feedurl)
  172. response = None
  173. break
  174. else:
  175. break
  176. if not response:
  177. continue
  178. try:
  179. feed = response.read(200*1024).decode('UTF-8')
  180. except Exception:
  181. log(f"ERROR: failed to read socket at '{correctedfeedurl}'")
  182. continue
  183. log(f"INFO: fetched feed from '{correctedfeedurl}'")
  184. db.deleteFeeds(correctedfeedurl)
  185. newFeedEntriesForFeed = 0
  186. newTwtxtEntriesForFeed = 0
  187. log(f"INFO: attempting to parse feed '{correctedfeedurl}' as gemlog feed")
  188. preliminaryEntries = (parsegemsub(feed, correctedfeedurl)
  189. or parsexml(feed, correctedfeedurl))
  190. for entry in preliminaryEntries:
  191. if not uh.isBlocked(entry.link):
  192. entries.append(entry)
  193. else:
  194. log(f"ERROR: entry URL '{entry.link}' is blocked by rules.")
  195. if entries:
  196. newFeedEntriesForFeed = db.insertFeedEntries(entries, agelimit)
  197. else:
  198. log(f"INFO: attempting to parse feed '{correctedfeedurl}' as twtxt")
  199. newTwtxtEntriesForFeed = db.insertTwtxtEntries(
  200. parsetwtxt(feed, correctedfeedurl), agelimit
  201. )
  202. if newFeedEntriesForFeed == 0 and newTwtxtEntriesForFeed == 0:
  203. log(f"INFO: parsing feed '{correctedfeedurl}' did not result in new entries.")
  204. else:
  205. newFeedEntries += newFeedEntriesForFeed
  206. newTwtxtEntries += newTwtxtEntriesForFeed
  207. removefromqueue.append(feedurl)
  208. db.deleteFromQueue(*removefromqueue)
  209. # And last of all: regenerate the static page
  210. if not args.silent or args.force:
  211. # Remove entries older than 7 days from db
  212. db.pruneDB(agelimit)
  213. if newFeedEntries > 0 or args.force:
  214. log("INFO: regenerating feed page.")
  215. generateIndex(db, args.feedpage, args.atomfeedpage, args.statfile)
  216. if newTwtxtEntries > 0 or args.force:
  217. log("INFO: regenerating twtxt pages.")
  218. generateTwtxt(db, args.twtxtpage, args.twtxtfile)