#!/usr/bin/env python3
# vim: tabstop=4 shiftwidth=4 expandtab

import re
import time
import urllib.parse
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

import feedparser

from gemini_antenna.URLHelper import URLHelper

# urljoin() only resolves relative references for schemes registered in these
# lists, so "gemini" has to be added before gemsub links can be absolutized.
urllib.parse.uses_relative.append("gemini")
urllib.parse.uses_netloc.append("gemini")

whitespace_re = re.compile(r'\s')


@dataclass
class FeedEntry:
    feedurl: str
    author: str
    updated: int
    title: str
    link: str


@dataclass
class TwtxtEntry:
    feedurl: str
    author: str
    posted: float
    twt: str


def _cw(text: str) -> str:
    """Collapse whitespace."""
    return whitespace_re.sub(' ', text)


def parsegemsub(feed: str, baseurl: str) -> Optional[list[FeedEntry]]:
    """
    Parse a gemsub (Gemini subscription) feed.

    :param feed: Feed contents.
    :param baseurl: Feed URL.
    :returns: Feed entries, or None if no feed author could be found.
    """
    baseurl = URLHelper.resolve(baseurl)
    entries = []
    # The first top-level heading ("# ...") names the feed author.
    authorpattern = r'^#\s*([^#\r\n]+)'
    # Entries are link lines of the form "=> <url> YYYY-MM-DD <title>".
    entriespattern = r'^=>\s*(\S+)\s+(\d{4}-\d{2}-\d{2})[^\r\n\S]*([^\r\n]*)'
    entriespatternmatches = re.findall(entriespattern, feed, re.MULTILINE)
    authorpatternmatch = re.findall(authorpattern, feed, re.MULTILINE)
    if authorpatternmatch:
        author = authorpatternmatch[0]
    else:
        return None
    for entrypatternmatch in entriespatternmatches:
        # Take the YYYY-MM-DD string, add a time of day, parse to
        # datetime.datetime, and convert to an integer Unix timestamp.
        try:
            updatedtime = datetime.strptime(entrypatternmatch[1] + " 12:00:00",
                                            "%Y-%m-%d %H:%M:%S")
            updated = int(datetime.timestamp(updatedtime))
        except Exception:
            continue
        # Gemsub feeds often use relative links; absolutize them.
        link = urllib.parse.urljoin(baseurl, entrypatternmatch[0])
        title = entrypatternmatch[2] if entrypatternmatch[2] else entrypatternmatch[1]
        entries.append(FeedEntry(baseurl, author, updated, title, link))
    return entries
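

# A hypothetical gemsub feed, as a sketch of the input parsegemsub() expects
# (example data, not part of the original module):
#
#   # Example Gemlog
#   => one.gmi 2023-01-05 First post
#   => gemini://example.org/two.gmi 2023-01-04
#
# "Example Gemlog" becomes the author, the relative link in the first entry
# is resolved against baseurl, and the second entry falls back to its date
# string as the title.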


def parsetwtxt(feed: str, baseurl: str) -> list[TwtxtEntry]:
    """
    Parse a twtxt feed.

    :param feed: Feed contents.
    :param baseurl: Feed URL.
    """
    baseurl = URLHelper.resolve(baseurl)
    entries = []
    # A "# nick = ..." metadata comment names the feed author.
    authorpattern = r'^#\s*nick\s*=\s*(\S+)'
    # Naive match (anything followed by a tab); entries whose timestamps
    # fail to parse are dropped below.
    entriespattern = r'^(\S+)\t([^\r\n]+)'
    entriespatternmatches = re.findall(entriespattern, feed, re.MULTILINE)
    authorpatternmatch = re.findall(authorpattern, feed, re.MULTILINE)
    if authorpatternmatch:
        author = authorpatternmatch[0]
    else:
        author = baseurl
    for entrypatternmatch in entriespatternmatches:
        # Parse the RFC 3339 timestamp to datetime.datetime and convert to
        # an integer Unix timestamp.
        try:
            posted = int(datetime.timestamp(datetime.strptime(
                entrypatternmatch[0], "%Y-%m-%dT%H:%M:%S%z")))
        except Exception:
            continue
        entries.append(TwtxtEntry(feedurl=baseurl, author=author, posted=posted,
                                  twt=entrypatternmatch[1]))
    return entries
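

# A hypothetical twtxt feed, as a sketch of the input parsetwtxt() expects
# (example data, not part of the original module):
#
#   # nick = exampleuser
#   2023-01-05T12:30:00+01:00	Hello, twtxt world!
#
# The metadata comment provides the author; each twt line is an RFC 3339
# timestamp, a tab, and the twt text.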


def parsexml(feed: str, baseurl: str) -> Optional[list[FeedEntry]]:
    """
    Parse an RSS or Atom feed with feedparser.

    :param feed: Feed contents.
    :param baseurl: Feed URL.
    :returns: Feed entries, or None if the feed contains no entries at all.
    """
    baseurl = URLHelper.resolve(baseurl)
    scheme = urllib.parse.urlparse(baseurl).scheme
    entries = []
    parsedfeed = feedparser.parse(feed)

    # Use the feed author's name, or lacking that, the feed title.
    feedauthor = None
    feedtitle = None
    if ('author_detail' in parsedfeed['feed']
            and 'name' in parsedfeed['feed']['author_detail']):
        feedauthor = _cw(parsedfeed['feed']['author_detail']['name'])
    if 'title' in parsedfeed['feed']:
        feedtitle = _cw(parsedfeed['feed']['title'])
    if not feedauthor and feedtitle:
        feedauthor = feedtitle

    if 'entries' not in parsedfeed:
        return None
    for entry in parsedfeed['entries']:
        try:  # The feed could be missing all sorts of fields...
            if 'author_detail' in entry and 'name' in entry['author_detail']:
                author = _cw(entry['author_detail']['name'])
            elif feedauthor:
                author = feedauthor
            else:
                continue
            # Seconds since the epoch
            updated = int(time.mktime(entry['updated_parsed']))
            title = _cw(entry['title'])
            link = None
            if len(entry['links']) > 1:
                # Prefer a link with the same scheme as the feed itself.
                for linkdetail in entry['links']:
                    href = urllib.parse.urljoin(baseurl, linkdetail['href'])
                    if urllib.parse.urlparse(href).scheme == scheme:
                        link = href
                        break
            else:
                link = urllib.parse.urljoin(baseurl, _cw(entry['link']))
            if not link:
                continue
        except Exception:
            continue
        entries.append(FeedEntry(baseurl, author, updated, title, link))
    return entries
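

# A minimal smoke-test sketch, runnable as a script. The feed contents and
# URL below are hypothetical example data; this assumes URLHelper.resolve()
# accepts and returns a URL string, as the parsers above already do.
if __name__ == "__main__":
    demogemsub = ("# Example Gemlog\n"
                  "=> one.gmi 2023-01-05 First post\n"
                  "=> gemini://example.org/two.gmi 2023-01-04\n")
    for parsedentry in parsegemsub(demogemsub, "gemini://example.org/glog/") or []:
        print(parsedentry)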