- """
- dl.py - Mythical downloader
- Copyright (C) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>.
- """

import subprocess
import urllib.parse
import sys

# Don't actually download, just classify
dry_run = True

def read_all(file):
    with open(file, "r") as f:
        return f.read()

# Known sites. Format: hash map, where the key is the domain and the value is
# a list of adaptors. An adaptor is itself a list, where the first element is
# the adaptor's name and the remaining elements, if any, are parameters to
# that adaptor.
# TODO: Where to source these site lists?

SITES = {
    "discordapp.com": [["libpurple", "purple-discord"]],
    "discord.gg": [["libpurple", "purple-discord"]]
}

def domains_from_file(file, adaptor):
    # All of these domains are for the site in question
    sites = read_all(file).split("\n")

    for site in sites:
        # Skip blank lines so we don't map "" to an adaptor
        if site:
            SITES[site] = [[adaptor]]
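
# Each sites file is assumed to be a plain text list of domains, one per line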
- domains_from_file("fanficfare_sites.txt", "fanficfare")
- domains_from_file("ytdl_sites.txt", "youtube-dl")
- domains_from_file("drm_sites.txt", "drm")

# Functions to download a given URL with a given adaptor

def ytdl(url):
    # TODO
    subprocess.run(["mpv", "--vo=x11", url])

def fanficfare(url):
    # TODO
    subprocess.run(["fanficfare", url])

def libpurple(url, protocol):
    # TODO -- the protocol parameter (e.g. "purple-discord") comes from SITES
    print("libpurple'ing " + url + " with " + protocol)

def drm(url):
    print("This site requires the use of Digital Restrictions Management.")
    print("To learn more, see https://defectivebydesign.org/")
    sys.exit(1)

ADAPTORS = {
    "youtube-dl": ytdl,
    "fanficfare": fanficfare,
    "libpurple": libpurple,
    "drm": drm
}

# Fast track certain extensions
AV = ["ogg", "ogv", "webm", "mp3", "mp4", "mkv"]

# If it's not a known site, we can just test individual downloaders via
# subprocess

def test_url_sub(args):
    proc = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return proc.returncode == 0
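
# e.g. test_url_sub(["youtube-dl", "--simulate", url]) returns True iff
# youtube-dl recognises the URL, without downloading anything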

# Parse a URL, defaulting to https:// when no scheme is given

def url_parts(url):
    return urllib.parse.urlparse(url if "://" in url else "https://" + url)
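
# e.g. url_parts("discord.gg/abc") is parsed as "https://discord.gg/abc"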

def domain_from_url(parts):
    domain = parts.netloc

    # Remove the common "www." subdomain; other subdomains are kept
    if domain.startswith("www."):
        domain = domain[len("www."):]

    return domain
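
# e.g. domain_from_url(url_parts("https://www.discordapp.com/channels/x"))
# returns "discordapp.com"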

# Find adaptors for a URL, first by consulting our dictionary, then by
# bruteforce

def find_adaptors(parts, allow_bruteforce=True):
    domain = domain_from_url(parts)
    ext = parts.path.split(".")[-1]

    # This "crude" technique merely checks the hash map
    if domain in SITES:
        return SITES[domain]

    # This runs the full regex stack, but it is slow and sometimes I/O bound
    if allow_bruteforce or ext in AV:
        # Reconstruct the full URL, since only the parsed parts are in scope
        url = urllib.parse.urlunparse(parts)

        if test_url_sub(["youtube-dl", "--simulate", url]):
            return [["youtube-dl"]]

    return []
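
# e.g. find_adaptors(url_parts("discord.gg/abc")) returns
# [["libpurple", "purple-discord"]] via the SITES table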

# Actualise download

def download_with_adaptor(url, adaptor):
    # .get() so an unknown adaptor name yields None rather than a KeyError
    fn = ADAPTORS.get(adaptor[0])
    args = [url] + adaptor[1:]

    if fn is not None:
        fn(*args)
    else:
        print("Unknown adaptor " + adaptor[0])

# Simple test from the command line
url = sys.argv[1]
parts = url_parts(url)

adaptors = find_adaptors(parts, allow_bruteforce=True)
print(adaptors)

if len(adaptors) > 1:
    print("Warning: ambiguous, using the first adaptor")
elif len(adaptors) == 0:
    print("No adaptors found")
    sys.exit(1)

if not dry_run:
    download_with_adaptor(url, adaptors[0])
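
# Usage: python3 dl.py URL
# With dry_run = True this only prints the matched adaptors; flip it to
# False to actually dispatch the download.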
|