2 коммита f19f3c80ff ... 86aa7a7fa3

Автор SHA1 Сообщение Дата
  Björn Wärmedal 86aa7a7fa3 Improved URL handling, abuse mitigation 6 месяцев назад
  Björn Wärmedal 102634a4b7 Strip URLs 6 месяцев назад
2 изменённых файла: 21 добавлено и 34 удалено
  1. 11 27
      URLHelper.py
  2. 10 7
      multiFeedParsing.py

+ 11 - 27
URLHelper.py

@@ -4,6 +4,8 @@
 import re
 import urllib.parse
 from pathlib import Path, PosixPath
+urllib.parse.uses_relative.append("gemini")
+urllib.parse.uses_netloc.append("gemini")
 
 class URLHelper():
 
@@ -40,24 +42,11 @@ class URLHelper():
         return bool(re.match(pattern, url))
 
     @classmethod
-    def correct(cls, url: str) -> str:
-        """
-        Unquote a URL and add gemini:// scheme if needed.
-
-        >>> URLHelper.correct("example.com/my%20feed")
-        'gemini://example.com/my feed'
-        """
-        url = urllib.parse.unquote(url)
-
-        if not re.findall(r'^[\w:]*//', url):
-            url = "gemini://" + url
-        elif not urllib.parse.urlparse(url).netloc:
-            url = "gemini:" + url
-
-        return url
+    def getNetLoc(cls, url: str) -> str:
+        return urllib.parse.urlparse(url).netloc
 
     @classmethod
-    def resolve(cls, url: str) -> str:
+    def resolve(cls, url: str, url2: str = "") -> str:
         """
         Resolve relative paths in URLs.
         This method calls :meth:`~URLHelper.correct` beforehand.
@@ -65,15 +54,10 @@ class URLHelper():
         >>> URLHelper.resolve("gemini://example.com/1/../2")
         'gemini://example.com/2'
         """
-        url = urllib.parse.urlparse(cls.correct(url))
 
-        if not url.path:
-            path = ""
-        elif not url.path.startswith("/"):
-            raise ValueError("Not an absoulute URL")
-        else:
-            path = str(PosixPath(url.path).resolve())
-            # restore lost trailing slash
-            if url.path.endswith("/") and path != "/":
-                path += "/"
-        return urllib.parse.urlunparse(url._replace(path=path))
+        url = urllib.parse.unquote(url)
+        url2 = urllib.parse.unquote(url2)
+
+        parseResult = urllib.parse.urlparse(urllib.parse.urljoin(url, url2))
+
+        return f"{parseResult.scheme}://{parseResult.netloc}{PosixPath(parseResult.path).resolve()}"

+ 10 - 7
multiFeedParsing.py

@@ -1,17 +1,15 @@
 #!/usr/bin/env python3
 # vim: tabstop=4 shiftwidth=4 expandtab
 
-import urllib.parse
+import URLHelper
 import re
 import feedparser
 import time
 from datetime import datetime
-urllib.parse.uses_relative.append("gemini")
-urllib.parse.uses_netloc.append("gemini")
 
 # collapse whitespace
 def _cw(text):
-    return re.sub(r'\s', ' ', text)
+    return re.sub(r'\s+', ' ', text).strip()
 
 def parsegemsub(feed, baseurl):
     entries = []
@@ -23,6 +21,7 @@ def parsegemsub(feed, baseurl):
         author = authorpatternmatch[0]
     else:
         return None
+    uh = URLHelper.URLHelper()
     for entrypatternmatch in entriespatternmatches:
         # Get our YYYY-MM-DD string, add time of day, parse to datetime.datetime, convert to unix timestamp and cast to int
         try:
@@ -30,7 +29,7 @@ def parsegemsub(feed, baseurl):
         except:
             continue
         # A gemsub feed can often have relative links, we'll have to absolutize them
-        link = urllib.parse.urljoin(baseurl, entrypatternmatch[0]).replace('/..','').replace('/.','')
+        link = uh.resolve(baseurl, entrypatternmatch[0])
         title = entrypatternmatch[2] if entrypatternmatch[2] else entrypatternmatch[1]
         entries.append(FeedEntry(baseurl, author, updated, title, link))
     return entries
@@ -56,9 +55,11 @@ def parsetwtxt(feed, baseurl):
     return entries
 
 def parsexml(feed, baseurl):
-    scheme = baseurl.split("://")[0]
+    scheme = "gemini"
     entries = []
     parsedfeed = feedparser.parse(feed)
+    uh = URLHelper.URLHelper()
+    baseurl = uh.resolve(baseurl)
 
     # Let's set author name, or lacking that use the feed title.
     feedauthor = _cw(parsedfeed['feed']['author_detail']['name']) if parsedfeed['feed'].has_key('author_detail') and parsedfeed['feed']['author_detail'].has_key('name') else None
@@ -83,7 +84,9 @@ def parsexml(feed, baseurl):
                 link = _cw(entry['link'])
             if not link:
                 continue
-            link = urllib.parse.urljoin(baseurl, link).replace('/..','').replace('/.','')
+            link = uh.resolve(baseurl, link)
+            if not uh.getNetLoc(link) == uh.getNetLoc(baseurl):
+                continue
         except:
             continue
         entries.append(FeedEntry(baseurl, author, updated, title, link))