import {spawn} from 'node:child_process'
import {readdir, stat} from 'node:fs/promises'
import url from 'node:url'
import path from 'node:path'

import {orderBy} from 'natural-orderby'
import expandHomeDir from 'expand-home-dir'
// import fetch from 'node-fetch'

import {downloadPlaylistFromOptionValue, promisifyProcess} from './general-util.js'

export const musicExtensions = [
  'ogg', 'oga',
  'wav', 'mp3', 'm4a', 'aac', 'flac', 'opus',
  'mp4', 'mov', 'mkv',
  'mod'
]

export const skipNames = [
  '.DS_Store',
  '.git',
]
// Each value is a function with these additional properties:
// * crawlerName: The name of the crawler, such as "crawl-http". Used by
//   getCrawlerByName.
// * isAppropriateForArg: A function returning whether an argument is valid for
//   the crawler. For example, crawlHTTP.isAppropriateForArg returns whether or
//   not the passed argument is a valid URL of the HTTP/HTTPS protocol. Used by
//   getAllCrawlersForArg.
const allCrawlers = {}
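
// As an illustration, a crawler registered here follows this shape (a sketch
// only; the name and behavior below are hypothetical, not part of this
// module):
//
//   function crawlExample(arg) {
//     return Promise.resolve({name: 'Example', items: []})
//   }
//   crawlExample.crawlerName = 'crawl-example'
//   crawlExample.isAppropriateForArg = arg => arg.startsWith('example://')
//   allCrawlers.crawlExample = crawlExample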
/* TODO: Removed cheerio, so crawl-http no longer works.
export function crawlHTTP(absURL, opts = {}, internals = {}) {
  // Recursively crawls a given URL, following every link to a deeper path and
  // recording all links in a tree (in the same format playlists use). Makes
  // multiple attempts to download failed paths.

  const {
    verbose = false,
    maxAttempts = 5,
    allowedExternalHostRegex = null,
    stayInSameDirectory = true,
    keepAnyFileType = false,
    fileTypes = ['wav', 'ogg', 'oga', 'mp3', 'mp4', 'm4a', 'mov', 'mpga', 'mod'],
    forceGroupRegex = null,
    filterRegex = null
  } = opts

  if (!internals.attempts) internals.attempts = 0

  // TODO: Should absURL initially be added into this array? I'd like to
  // re-program this entire crawl function to make more sense - "internal"
  // dictionaries aren't quite easy to reason about!
  if (!internals.allURLs) internals.allURLs = []

  const verboseLog = text => {
    if (verbose) {
      console.error(text)
    }
  }

  const absURLObj = new url.URL(absURL)

  return fetch(absURL)
    .then(
      res => res.text().then(async text => {
        const links = getHTMLLinks(text)
        console.log(links)

        const items = []

        for (const link of links) {
          let [ name, href ] = link

          if (!href) {
            continue
          }

          // If the name (that's the content inside of <a>..</a>) ends with a
          // slash, that's probably just an artifact of a directory lister;
          // not actually part of the intended content. So we remove it!
          if (name.endsWith('/')) {
            name = name.slice(0, -1)
          }

          name = name.trim()

          let base
          if (path.extname(absURL)) {
            base = path.dirname(absURL) + '/'
            console.log('extname:', path.extname(absURL), 'so base:', base)
          } else {
            base = absURL
          }

          const urlObj = new url.URL(href, base)
          const linkURL = url.format(urlObj)

          if (internals.allURLs.includes(linkURL)) {
            verboseLog("[Ignored] Already done this URL: " + linkURL)
            continue
          }

          internals.allURLs.push(linkURL)

          if (filterRegex && !(filterRegex.test(linkURL))) {
            verboseLog("[Ignored] Failed regex: " + linkURL)
            continue
          }

          if (urlObj.host !== absURLObj.host && !(
            allowedExternalHostRegex && new RegExp(allowedExternalHostRegex)
              .test(urlObj.host))) {
            verboseLog("[Ignored] Inconsistent host: " + linkURL)
            continue
          }

          if (stayInSameDirectory) sameDir: {
            // Don't bother with staying in the same directory if it's on a
            // different host.
            if (urlObj.host !== absURLObj.host) {
              break sameDir
            }

            const relative = path.relative((new url.URL(base)).pathname, urlObj.pathname)
            if (relative.startsWith('..') || path.isAbsolute(relative)) {
              verboseLog("[Ignored] Outside of parent directory: " + linkURL + "\n-- relative: " + relative + "\n-- to base: " + base)
              continue
            }
          }

          if (href.endsWith('/') || (forceGroupRegex && new RegExp(forceGroupRegex).test(href))) {
            // It's a directory!

            verboseLog("[Dir] " + linkURL)

            items.push(await (
              crawlHTTP(linkURL, opts, Object.assign({}, internals))
                .then(({ items }) => ({name, items}))
            ))
          } else {
            // It's a file!

            const extensions = fileTypes.map(t => '.' + t)

            if (
              !keepAnyFileType &&
              !(extensions.includes(path.extname(href)))
            ) {
              verboseLog("[Ignored] Bad extension: " + linkURL)
              continue
            }

            verboseLog("[File] " + linkURL)
            items.push({name, downloaderArg: linkURL})
          }
        }

        return {items}
      }),

      err => {
        console.warn("Failed to download: " + absURL)
        if (internals.attempts < maxAttempts) {
          console.warn(
            `Trying again. Attempt ${internals.attempts + 1}/${maxAttempts}...`
          )
          return crawlHTTP(absURL, opts, Object.assign({}, internals, {
            attempts: internals.attempts + 1
          }))
        } else {
          console.error(
            "We've hit the download attempt limit (" + maxAttempts + "). " +
            "Giving up on this path."
          )
          throw 'FAILED_DOWNLOAD'
        }
      }
    )
    .catch(error => {
      if (error === 'FAILED_DOWNLOAD') {
        // Debug logging for this is already handled above.
        return []
      } else {
        throw error
      }
    })
}

crawlHTTP.crawlerName = 'crawl-http'

crawlHTTP.isAppropriateForArg = function(arg) {
  // It is only used for HTTP(S) servers:
  if (!(arg.startsWith('http://') || arg.startsWith('https://'))) {
    return false
  }

  // It will definitely only work for valid URLs:
  let url
  try {
    url = new URL(arg)
  } catch (error) {
    return false
  }

  // If the URL ends with a .json, it is probably meant to be used for a direct
  // playlist download, not to be crawled.
  if (path.extname(url.pathname) === '.json') {
    return false
  }

  // Just to avoid conflict with crawl-youtube, assume crawl-http is not used
  // for URLs on YouTube:
  if (crawlYouTube.isAppropriateForArg(arg)) {
    return false
  }

  return true
}

allCrawlers.crawlHTTP = crawlHTTP

function getHTMLLinks(text) {
  // Never parse HTML with a regex!
  // const $ = cheerio.load(text)
  return $('a').get().map(el => {
    const $el = $(el)
    return [$el.text(), $el.attr('href')]
  })
}
*/
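
// Crawls a local directory, building a playlist-style tree: directories
// become {name, url, items} groups, and files with a matching extension
// become {name, downloaderArg, url} tracks.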
function crawlLocal(dirPath, extensions = musicExtensions, isTop = true) {
  // If the passed path is a file:// URL, try to decode it:
  try {
    const url = new URL(dirPath)
    if (url.protocol === 'file:') {
      dirPath = decodeURIComponent(url.pathname)
    }
  } catch (error) {
    // If it's not a URL, it's (assumedly) an ordinary path ("/path/to/the directory").
    // In this case we'll expand any ~ in the path (e.g. ~/Music -> /home/.../Music).
    dirPath = expandHomeDir(dirPath)
  }

  return readdir(dirPath).then(items => {
    items = orderBy(items)

    return Promise.all(items.map(item => {
      // There are a few files which are just never what we're looking for.
      // We skip including or searching under these altogether.
      if (skipNames.includes(item)) {
        return null
      }

      const itemPath = path.join(dirPath, item)
      const itemURL = url.pathToFileURL(itemPath).href

      return stat(itemPath).then(stats => {
        if (stats.isDirectory()) {
          return crawlLocal(itemPath, extensions, false)
            .then(group => Object.assign({name: item, url: itemURL}, group))
        } else if (stats.isFile()) {
          // Extname returns a string starting with a dot; we don't want the
          // dot, so we slice it off of the front.
          const ext = path.extname(item).slice(1)

          if (extensions.includes(ext)) {
            // The name of the track doesn't include the file extension; a user
            // probably wouldn't add the file extensions to a hand-written
            // playlist, or want them in an auto-generated one.
            const basename = path.basename(item, path.extname(item))
            return {name: basename, downloaderArg: itemPath, url: itemURL}
          } else {
            return {name: item, url: itemURL}
          }
        }
      }, _statErr => null)
    }))
  }, err => {
    if (err.code === 'ENOENT') {
      if (isTop) {
        throw 'That directory path does not exist!'
      } else {
        return []
      }
    } else if (err.code === 'EACCES') {
      if (isTop) {
        throw 'You do not have permission to open that directory.'
      } else {
        return []
      }
    } else {
      throw err
    }
  }).then(items => items.filter(Boolean))
    .then(filteredItems => ({
      name: path.basename(dirPath),
      items: filteredItems
    }))
}
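
// Usage sketch (the directory path is illustrative; it is assumed to exist
// locally):
//
//   crawlLocal('~/Music').then(playlist => {
//     console.log(playlist.name, '-', playlist.items.length, 'top-level entries')
//   })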
crawlLocal.crawlerName = 'crawl-local'

crawlLocal.isAppropriateForArg = function(arg) {
  // When the passed argument is a valid URL, it is only used for file://
  // URLs:
  try {
    const url = new URL(arg)
    if (url.protocol !== 'file:') {
      return false
    }
  } catch (error) {}

  // If the passed argument ends with .json, it is probably not a directory.
  if (path.extname(arg) === '.json') {
    return false
  }

  return true
}

allCrawlers.crawlLocal = crawlLocal
export async function crawlYouTube(url) {
  const ytdl = spawn('youtube-dl', [
    '-j', // Output as JSON
    '--flat-playlist',
    url
  ])

  // Buffer all of stdout before parsing: a 'data' chunk can end in the middle
  // of a line, so parsing chunk-by-chunk could hand incomplete JSON to
  // JSON.parse.
  let stdout = ''
  ytdl.stdout.on('data', data => {
    stdout += data.toString()
  })

  // Pass false so it doesn't show logging.
  try {
    await promisifyProcess(ytdl, false)
  } catch (error) {
    // Yeow.
    throw 'Youtube-dl failed.'
  }

  // Each non-empty line of output is one JSON object describing a video in
  // the playlist.
  const items = stdout.trim().split('\n').filter(Boolean).map(line => JSON.parse(line))

  return {
    name: 'A YouTube playlist',
    items: items.map(item => {
      return {
        name: item.title,
        downloaderArg: 'https://youtube.com/watch?v=' + item.id
      }
    })
  }
}
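
// Usage sketch (requires youtube-dl on the PATH; the playlist URL is
// illustrative):
//
//   crawlYouTube('https://www.youtube.com/playlist?list=PL...')
//     .then(playlist => console.log(playlist.items.map(track => track.name)))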
crawlYouTube.crawlerName = 'crawl-youtube'

crawlYouTube.isAppropriateForArg = function(arg) {
  // It is definitely not used for arguments that are not URLs:
  let url
  try {
    url = new URL(arg)
  } catch (error) {
    return false
  }

  // It is only used for URLs on the YouTube domain:
  if (!(url.hostname === 'youtube.com' || url.hostname === 'www.youtube.com')) {
    return false
  }

  // It is only used for playlist pages:
  if (url.pathname !== '/playlist') {
    return false
  }

  return true
}

allCrawlers.crawlYouTube = crawlYouTube
export async function openFile(input) {
  return JSON.parse(await downloadPlaylistFromOptionValue(input))
}

openFile.crawlerName = 'open-file'

openFile.isAppropriateForArg = function(arg) {
  // It is only valid for arguments that end with .json:
  return path.extname(arg) === '.json'
}

allCrawlers.openFile = openFile
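
// Usage sketch (the path is illustrative; it should point at a playlist JSON
// file accepted by downloadPlaylistFromOptionValue):
//
//   openFile('./playlist.json').then(playlist => console.log(playlist))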
export function getCrawlerByName(name) {
  return Object.values(allCrawlers).find(fn => fn.crawlerName === name)
}

export function getAllCrawlersForArg(arg) {
  return Object.values(allCrawlers).filter(fn => fn.isAppropriateForArg(arg))
}
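
// Usage sketch:
//
//   getCrawlerByName('crawl-local')
//   // -> crawlLocal
//
//   getAllCrawlersForArg('/path/to/music')
//   // -> [crawlLocal] (every crawler whose isAppropriateForArg accepts the
//   //    argument; the path here is illustrative)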