multiFeedParsing.py

#!/usr/bin/env python3
# vim: tabstop=4 shiftwidth=4 expandtab
import re
import time
import urllib.parse
from dataclasses import dataclass
from datetime import datetime

import feedparser

from gemini_antenna.URLHelper import URLHelper
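
# urllib.parse only joins relative references for schemes it knows about,
# so register gemini:// before any of the urljoin() calls below.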
urllib.parse.uses_relative.append("gemini")
urllib.parse.uses_netloc.append("gemini")

whitespace_re = re.compile(r'\s')


@dataclass
class FeedEntry:
    feedurl: str
    author: str
    updated: int
    title: str
    link: str


@dataclass
class TwtxtEntry:
    feedurl: str
    author: str
    posted: float
    twt: str


def _cw(text: str) -> str:
    """Replace each whitespace character (newlines, tabs, ...) with a space."""
    return whitespace_re.sub(' ', text)


def parsegemsub(feed: str, baseurl: str) -> list[FeedEntry] | None:
    """
    Parse a gemsub feed into FeedEntry objects.

    :param feed: Feed contents.
    :param baseurl: Feed URL.
    :returns: Parsed entries, or None if the feed lacks an author heading.
    """
    baseurl = URLHelper.resolve(baseurl)
    entries = []
    authorpattern = r'^#\s*([^#\r\n]+)'
    entriespattern = r'^=>\s*(\S+)\s+(\d{4}-\d{2}-\d{2})[^\r\n\S]*([^\r\n]*)'
    entriespatternmatches = re.findall(entriespattern, feed, re.MULTILINE)
    authorpatternmatch = re.findall(authorpattern, feed, re.MULTILINE)
    if authorpatternmatch:
        author = authorpatternmatch[0]
    else:
        return None
    for entrypatternmatch in entriespatternmatches:
        # Take the YYYY-MM-DD string, assume midday, parse to
        # datetime.datetime, convert to a unix timestamp and cast to int.
        try:
            parsed = datetime.strptime(entrypatternmatch[1] + " 12:00:00",
                                       "%Y-%m-%d %H:%M:%S")
            updated = int(datetime.timestamp(parsed))
        except (ValueError, OverflowError, OSError):
            continue
        # A gemsub feed can often have relative links; absolutize them
        # against the feed URL.
        link = urllib.parse.urljoin(baseurl, entrypatternmatch[0])
        title = entrypatternmatch[2] if entrypatternmatch[2] else entrypatternmatch[1]
        entries.append(FeedEntry(baseurl, author, updated, title, link))
    return entries
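
# A minimal usage sketch for parsegemsub (feed text and URL are invented for
# illustration, and URLHelper.resolve is assumed to return the URL unchanged):
#
#     sample = ("# Alice's gemlog\n"
#               "=> posts/01.gmi 2023-01-05 Hello world\n")
#     parsegemsub(sample, "gemini://example.org/feed.gmi")
#     # -> [FeedEntry(feedurl='gemini://example.org/feed.gmi',
#     #               author="Alice's gemlog", updated=...,  # local-time based
#     #               title='Hello world',
#     #               link='gemini://example.org/posts/01.gmi')]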


def parsetwtxt(feed: str, baseurl: str) -> list[TwtxtEntry]:
    """
    Parse a twtxt feed into TwtxtEntry objects.

    :param feed: Feed contents.
    :param baseurl: Feed URL.
    """
    baseurl = URLHelper.resolve(baseurl)
    entries = []
    authorpattern = r'^#\s*nick\s*=\s*(\S+)'
    # This is a naive match, but only the entries that validate below are kept.
    entriespattern = r'^(\S+)\t([^\r\n]+)'
    entriespatternmatches = re.findall(entriespattern, feed, re.MULTILINE)
    authorpatternmatch = re.findall(authorpattern, feed, re.MULTILINE)
    if authorpatternmatch:
        author = authorpatternmatch[0]
    else:
        author = baseurl
    for entrypatternmatch in entriespatternmatches:
        # Parse the RFC 3339 timestamp to datetime.datetime, convert to a
        # unix timestamp and cast to int.
        try:
            posted = int(datetime.timestamp(datetime.strptime(
                entrypatternmatch[0], "%Y-%m-%dT%H:%M:%S%z")))
        except ValueError:
            continue
        entries.append(TwtxtEntry(feedurl=baseurl, author=author, posted=posted,
                                  twt=entrypatternmatch[1]))
    return entries
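
# A matching sketch for parsetwtxt (again with invented contents; here the
# timestamp carries a timezone offset, so the posted value is exact):
#
#     sample = "# nick = alice\n2023-01-05T12:00:00+00:00\tHello, twtxt!\n"
#     parsetwtxt(sample, "gemini://example.org/twtxt.txt")
#     # -> [TwtxtEntry(feedurl='gemini://example.org/twtxt.txt',
#     #                author='alice', posted=1672920000, twt='Hello, twtxt!')]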


def parsexml(feed: str, baseurl: str) -> list[FeedEntry] | None:
    """
    Parse an RSS/Atom feed into FeedEntry objects.

    :param feed: Feed contents.
    :param baseurl: Feed URL.
    :returns: Parsed entries, or None if the feed has none.
    """
    baseurl = URLHelper.resolve(baseurl)
    scheme = urllib.parse.urlparse(baseurl).scheme
    entries = []
    parsedfeed = feedparser.parse(feed)

    # Let's set the author's name, or lacking that use the feed title.
    # (dict.has_key() is gone in Python 3, so membership tests use `in`.)
    feedauthor = None
    feedtitle = None
    if ('author_detail' in parsedfeed['feed']
            and 'name' in parsedfeed['feed']['author_detail']):
        feedauthor = _cw(parsedfeed['feed']['author_detail']['name'])
    if 'title' in parsedfeed['feed']:
        feedtitle = _cw(parsedfeed['feed']['title'])
    if not feedauthor and feedtitle:
        feedauthor = feedtitle

    if 'entries' not in parsedfeed:
        return None

    for entry in parsedfeed['entries']:
        try:  # The feed could miss all sorts of fields...
            if 'author_detail' in entry and 'name' in entry['author_detail']:
                author = _cw(entry['author_detail']['name'])
            elif feedauthor:
                author = feedauthor
            else:
                continue
            # Seconds since epoch
            updated = int(time.mktime(entry['updated_parsed']))
            title = _cw(entry['title'])
            link = None
            if len(entry['links']) > 1:
                # Multiple links: prefer one with the same scheme as the feed.
                for l in entry['links']:
                    href = urllib.parse.urljoin(baseurl, l['href'])
                    if urllib.parse.urlparse(href).scheme == scheme:
                        link = href
                        break
            else:
                link = urllib.parse.urljoin(baseurl, _cw(entry['link']))
            if not link:
                continue
        except Exception:
            continue
        entries.append(FeedEntry(baseurl, author, updated, title, link))
    return entries
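

# A small smoke test: run an inline Atom document through parsexml. The XML
# and URL below are invented for illustration; the real callers live
# elsewhere in gemini_antenna.
if __name__ == "__main__":
    atom = ('<?xml version="1.0" encoding="utf-8"?>'
            '<feed xmlns="http://www.w3.org/2005/Atom">'
            '<title>Example feed</title>'
            '<entry>'
            '<title>Hello</title>'
            '<link href="gemini://example.org/posts/1.gmi"/>'
            '<updated>2023-01-05T12:00:00Z</updated>'
            '</entry>'
            '</feed>')
    for entry in parsexml(atom, "gemini://example.org/feed.xml") or []:
        print(entry)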