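"""Extract outbound links from a note's HTML content and fetch their Open
Graph metadata."""
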
import logging

import opengraph
import requests
from bs4 import BeautifulSoup
from little_boxes import activitypub as ap
from little_boxes.errors import NotAnActivityError
from little_boxes.urlutils import check_url
from little_boxes.urlutils import is_url_valid

from .lookup import lookup

logger = logging.getLogger(__name__)


def links_from_note(note):
    """Extract external links from a note's HTML content, skipping hrefs
    that already appear in the note's tags (mentions, hashtags)."""
    tags_href = set()
    for t in note.get("tag", []):
        h = t.get("href")
        if h:
            tags_href.add(h)

    links = set()
    if "content" in note:
        # An explicit parser avoids BeautifulSoup's "no parser specified" warning
        soup = BeautifulSoup(note["content"], "html.parser")
        for link in soup.find_all("a"):
            h = link.get("href")
            # link.get("href") returns None for anchors without an href attribute
            if h and h.startswith("http") and h not in tags_href and is_url_valid(h):
                links.add(h)

    # FIXME(tsileo): support summary and name fields
    return links
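

# Example (a sketch; the note dict below is hypothetical, shaped like an
# ActivityPub Note):
#
#     note = {
#         "content": '<p>See <a href="https://example.com/post">this post</a></p>',
#         "tag": [],
#     }
#     links_from_note(note)  # -> {"https://example.com/post"}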


def fetch_og_metadata(user_agent, links):
    """Fetch the Open Graph metadata for each link, skipping ActivityPub
    actors and non-HTML responses."""
    res = []
    for link in links:
        check_url(link)

        # Skip any link that resolves to an ActivityPub actor
        try:
            p = lookup(link)
            if p.has_type(ap.ACTOR_TYPES):
                continue
        except NotAnActivityError:
            pass

        r = requests.get(link, headers={"User-Agent": user_agent}, timeout=15)
        r.raise_for_status()

        # The Content-Type header may be absent; default to "" so the check
        # cannot raise AttributeError
        if not r.headers.get("content-type", "").startswith("text/html"):
            logger.debug(f"skipping {link}")
            continue

        r.encoding = "UTF-8"
        html = r.text
        try:
            data = dict(opengraph.OpenGraph(html=html))
        except Exception:
            logger.exception(f"failed to parse {link}")
            continue

        if data.get("url"):
            res.append(data)

    return res
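

# Example usage (a sketch; the user-agent string is an assumption, use
# whatever your application normally sends):
#
#     links = links_from_note(note)
#     for og in fetch_og_metadata("MyApp/1.0 (+https://example.com)", links):
#         print(og.get("title"), og["url"])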