# multiFeedParsing.py (4.2 KB)
  1. #!/usr/bin/env python3
  2. # vim: tabstop=4 shiftwidth=4 expandtab
  3. import URLHelper
  4. import re
  5. import feedparser
  6. import time
  7. from datetime import datetime
  8. # collapse whitespace
  9. def _cw(text):
  10. return re.sub(r'\s+', ' ', text).strip()
  11. def parsegemsub(feed, baseurl):
  12. entries = []
  13. authorpattern = r'^#\s*([^#\r\n]+)'
  14. entriespattern = r'^=>\s*(\S+)\s+(\d{4}-\d{2}-\d{2})[^\r\n\S]*([^\r\n]*)'
  15. entriespatternmatches = re.findall(entriespattern, feed, re.MULTILINE)
  16. authorpatternmatch = re.findall(authorpattern, feed, re.MULTILINE)
  17. if authorpatternmatch:
  18. author = authorpatternmatch[0]
  19. else:
  20. return None
  21. uh = URLHelper.URLHelper()
  22. for entrypatternmatch in entriespatternmatches:
  23. # Get our YYYY-MM-DD string, add time of day, parse to datetime.datetime, convert to unix timestamp and cast to int
  24. try:
  25. updated = int(datetime.timestamp(datetime.strptime(entrypatternmatch[1] + " 12:00:00", "%Y-%m-%d %H:%M:%S")))
  26. except:
  27. continue
  28. # A gemsub feed can often have relative links, we'll have to absolutize them
  29. link = uh.resolve(baseurl, entrypatternmatch[0])
  30. title = entrypatternmatch[2] if entrypatternmatch[2] else entrypatternmatch[1]
  31. entries.append(FeedEntry(baseurl, author, updated, title, link))
  32. return entries
  33. def parsetwtxt(feed, baseurl):
  34. entries = []
  35. authorpattern = r'^#\s*nick\s*=\s*(\S+)'
  36. # This is a naive match, but we'll only keep those that validate eventually
  37. entriespattern = r'^(\S+)\t([^\r\n]+)'
  38. entriespatternmatches = re.findall(entriespattern, feed, re.MULTILINE)
  39. authorpatternmatch = re.findall(authorpattern, feed, re.MULTILINE)
  40. if authorpatternmatch:
  41. author = authorpatternmatch[0]
  42. else:
  43. author = baseurl
  44. for entrypatternmatch in entriespatternmatches:
  45. # Get our datetime string, parse to datetime.datetime, convert to unix timestamp and cast to int
  46. try:
  47. posted = int(datetime.timestamp(datetime.strptime(entrypatternmatch[0], "%Y-%m-%dT%H:%M:%S%z")))
  48. except:
  49. continue
  50. entries.append(TwtxtEntry(feedurl = baseurl, author = author, posted = posted, twt = entrypatternmatch[1]))
  51. return entries
  52. def parsexml(feed, baseurl):
  53. scheme = "gemini"
  54. entries = []
  55. parsedfeed = feedparser.parse(feed)
  56. uh = URLHelper.URLHelper()
  57. baseurl = uh.resolve(baseurl)
  58. # Let's set author name, or lacking that use the feed title.
  59. feedauthor = _cw(parsedfeed['feed']['author_detail']['name']) if parsedfeed['feed'].has_key('author_detail') and parsedfeed['feed']['author_detail'].has_key('name') else None
  60. feedtitle = _cw(parsedfeed['feed']['title']) if parsedfeed['feed'].has_key('title') else None
  61. if not feedauthor and feedtitle:
  62. feedauthor = feedtitle
  63. if not parsedfeed.has_key('entries'):
  64. return None
  65. for entry in parsedfeed['entries']:
  66. try: # The feed could miss all sorts of fields...
  67. if entry.has_key('author_detail') and entry['author_detail'].has_key('name'):
  68. author = _cw(entry['author_detail']['name'])
  69. elif feedauthor:
  70. author = feedauthor
  71. else:
  72. continue
  73. updated = int(time.mktime(entry['updated_parsed'])) # Seconds since epoch
  74. title = _cw(entry['title'])
  75. if len(entry['links']) >= 1:
  76. link = [l for l in entry['links'] if l['href'].startswith(scheme)][0]['href']
  77. else:
  78. link = _cw(entry['link'])
  79. if not link:
  80. continue
  81. link = uh.resolve(baseurl, link)
  82. if not uh.getNetLoc(link) == uh.getNetLoc(baseurl):
  83. continue
  84. except:
  85. continue
  86. entries.append(FeedEntry(baseurl, author, updated, title, link))
  87. return entries
  88. class FeedEntry():
  89. def __init__(self, feedurl, author, updated, title, link):
  90. self.feedurl = feedurl
  91. self.author = author
  92. self.updated = updated
  93. self.title = title
  94. self.link = link
  95. class TwtxtEntry():
  96. def __init__(self, feedurl, author, posted, twt):
  97. self.feedurl = feedurl
  98. self.author = author
  99. self.posted = posted
  100. self.twt = twt