_utils.py 185 KB

  1. import base64
  2. import binascii
  3. import calendar
  4. import codecs
  5. import collections
  6. import collections.abc
  7. import contextlib
  8. import datetime as dt
  9. import email.header
  10. import email.utils
  11. import errno
  12. import functools
  13. import hashlib
  14. import hmac
  15. import html.entities
  16. import html.parser
  17. import inspect
  18. import io
  19. import itertools
  20. import json
  21. import locale
  22. import math
  23. import mimetypes
  24. import netrc
  25. import operator
  26. import os
  27. import platform
  28. import random
  29. import re
  30. import shlex
  31. import socket
  32. import ssl
  33. import struct
  34. import subprocess
  35. import sys
  36. import tempfile
  37. import time
  38. import traceback
  39. import types
  40. import unicodedata
  41. import urllib.error
  42. import urllib.parse
  43. import urllib.request
  44. import xml.etree.ElementTree
  45. from . import traversal
  46. from ..compat import (
  47. compat_etree_fromstring,
  48. compat_expanduser,
  49. compat_HTMLParseError,
  50. )
  51. from ..dependencies import xattr
  52. __name__ = __name__.rsplit('.', 1)[0] # noqa: A001: Pretend to be the parent module
  53. class NO_DEFAULT:
  54. pass
  55. def IDENTITY(x):
  56. return x
  58. 'January', 'February', 'March', 'April', 'May', 'June',
  59. 'July', 'August', 'September', 'October', 'November', 'December']
  60. MONTH_NAMES = {
  62. 'fr': [
  63. 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
  64. 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
  65. # these follow the genitive grammatical case (dopełniacz)
  66. # some websites might be using nominative, which will require another month list
  67. # https://en.wikibooks.org/wiki/Polish/Noun_cases
  68. 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
  69. 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
  70. }
  71. # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
  73. 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
  74. 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
  75. 'EST': -5, 'EDT': -4, # Eastern
  76. 'CST': -6, 'CDT': -5, # Central
  77. 'MST': -7, 'MDT': -6, # Mountain
  78. 'PST': -8, 'PDT': -7, # Pacific
  79. }
  80. # needed for sanitizing filenames in restricted mode
  81. ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
  82. itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
  83. 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
  84. DATE_FORMATS = (
  85. '%d %B %Y',
  86. '%d %b %Y',
  87. '%B %d %Y',
  88. '%B %dst %Y',
  89. '%B %dnd %Y',
  90. '%B %drd %Y',
  91. '%B %dth %Y',
  92. '%b %d %Y',
  93. '%b %dst %Y',
  94. '%b %dnd %Y',
  95. '%b %drd %Y',
  96. '%b %dth %Y',
  97. '%b %dst %Y %I:%M',
  98. '%b %dnd %Y %I:%M',
  99. '%b %drd %Y %I:%M',
  100. '%b %dth %Y %I:%M',
  101. '%Y %m %d',
  102. '%Y-%m-%d',
  103. '%Y.%m.%d.',
  104. '%Y/%m/%d',
  105. '%Y/%m/%d %H:%M',
  106. '%Y/%m/%d %H:%M:%S',
  107. '%Y%m%d%H%M',
  108. '%Y%m%d%H%M%S',
  109. '%Y%m%d',
  110. '%Y-%m-%d %H:%M',
  111. '%Y-%m-%d %H:%M:%S',
  112. '%Y-%m-%d %H:%M:%S.%f',
  113. '%Y-%m-%d %H:%M:%S:%f',
  114. '%d.%m.%Y %H:%M',
  115. '%d.%m.%Y %H.%M',
  116. '%Y-%m-%dT%H:%M:%SZ',
  117. '%Y-%m-%dT%H:%M:%S.%fZ',
  118. '%Y-%m-%dT%H:%M:%S.%f0Z',
  119. '%Y-%m-%dT%H:%M:%S',
  120. '%Y-%m-%dT%H:%M:%S.%f',
  121. '%Y-%m-%dT%H:%M',
  122. '%b %d %Y at %H:%M',
  123. '%b %d %Y at %H:%M:%S',
  124. '%B %d %Y at %H:%M',
  125. '%B %d %Y at %H:%M:%S',
  126. '%H:%M %d-%b-%Y',
  127. )
  129. DATE_FORMATS_DAY_FIRST.extend([
  130. '%d-%m-%Y',
  131. '%d.%m.%Y',
  132. '%d.%m.%y',
  133. '%d/%m/%Y',
  134. '%d/%m/%y',
  135. '%d/%m/%Y %H:%M:%S',
  136. '%d-%m-%Y %H:%M',
  137. '%H:%M %d/%m/%Y',
  138. ])
  141. '%m-%d-%Y',
  142. '%m.%d.%Y',
  143. '%m/%d/%Y',
  144. '%m/%d/%y',
  145. '%m/%d/%Y %H:%M:%S',
  146. ])
  147. PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
  148. JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
  149. NUMBER_RE = r'\d+(?:\.\d+)?'
  150. @functools.cache
  151. def preferredencoding():
  152. """Get preferred encoding.
  153. Returns the best encoding scheme for the system, based on
  154. locale.getpreferredencoding() and some further tweaks.
  155. """
  156. try:
  157. pref = locale.getpreferredencoding()
  158. 'TEST'.encode(pref)
  159. except Exception:
  160. pref = 'UTF-8'
  161. return pref
  162. def write_json_file(obj, fn):
  163. """ Encode obj as JSON and write it to fn, atomically if possible """
  164. tf = tempfile.NamedTemporaryFile(
  165. prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
  166. suffix='.tmp', delete=False, mode='w', encoding='utf-8')
  167. try:
  168. with tf:
  169. json.dump(obj, tf, ensure_ascii=False)
  170. if sys.platform == 'win32':
  171. # Need to remove existing file on Windows, else os.rename raises
  172. # WindowsError or FileExistsError.
  173. with contextlib.suppress(OSError):
  174. os.unlink(fn)
  175. with contextlib.suppress(OSError):
  176. mask = os.umask(0)
  177. os.umask(mask)
  178. os.chmod(tf.name, 0o666 & ~mask)
  179. os.rename(tf.name, fn)
  180. except Exception:
  181. with contextlib.suppress(OSError):
  182. os.remove(tf.name)
  183. raise
  184. def partial_application(func):
  185. sig = inspect.signature(func)
  186. required_args = [
  187. param.name for param in sig.parameters.values()
  188. if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
  189. if param.default is inspect.Parameter.empty
  190. ]
  191. @functools.wraps(func)
  192. def wrapped(*args, **kwargs):
  193. if set(required_args[len(args):]).difference(kwargs):
  194. return functools.partial(func, *args, **kwargs)
  195. return func(*args, **kwargs)
  196. return wrapped
  197. def find_xpath_attr(node, xpath, key, val=None):
  198. """ Find the xpath xpath[@key=val] """
  199. assert re.match(r'^[a-zA-Z_-]+$', key)
  200. expr = xpath + (f'[@{key}]' if val is None else f"[@{key}='{val}']")
  201. return node.find(expr)
  202. # On python2.6 the xml.etree.ElementTree.Element methods don't support
  203. # the namespace parameter
  204. def xpath_with_ns(path, ns_map):
  205. components = [c.split(':') for c in path.split('/')]
  206. replaced = []
  207. for c in components:
  208. if len(c) == 1:
  209. replaced.append(c[0])
  210. else:
  211. ns, tag = c
  212. replaced.append(f'{{{ns_map[ns]}}}{tag}')
  213. return '/'.join(replaced)
  214. def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
  215. def _find_xpath(xpath):
  216. return node.find(xpath)
  217. if isinstance(xpath, str):
  218. n = _find_xpath(xpath)
  219. else:
  220. for xp in xpath:
  221. n = _find_xpath(xp)
  222. if n is not None:
  223. break
  224. if n is None:
  225. if default is not NO_DEFAULT:
  226. return default
  227. elif fatal:
  228. name = xpath if name is None else name
  229. raise ExtractorError(f'Could not find XML element {name}')
  230. else:
  231. return None
  232. return n
  233. def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
  234. n = xpath_element(node, xpath, name, fatal=fatal, default=default)
  235. if n is None or n == default:
  236. return n
  237. if n.text is None:
  238. if default is not NO_DEFAULT:
  239. return default
  240. elif fatal:
  241. name = xpath if name is None else name
  242. raise ExtractorError(f'Could not find XML element\'s text {name}')
  243. else:
  244. return None
  245. return n.text
  246. def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
  247. n = find_xpath_attr(node, xpath, key)
  248. if n is None:
  249. if default is not NO_DEFAULT:
  250. return default
  251. elif fatal:
  252. name = f'{xpath}[@{key}]' if name is None else name
  253. raise ExtractorError(f'Could not find XML attribute {name}')
  254. else:
  255. return None
  256. return n.attrib[key]
  257. def get_element_by_id(id, html, **kwargs):
  258. """Return the content of the tag with the specified ID in the passed HTML document"""
  259. return get_element_by_attribute('id', id, html, **kwargs)
  260. def get_element_html_by_id(id, html, **kwargs):
  261. """Return the html of the tag with the specified ID in the passed HTML document"""
  262. return get_element_html_by_attribute('id', id, html, **kwargs)
  263. def get_element_by_class(class_name, html):
  264. """Return the content of the first tag with the specified class in the passed HTML document"""
  265. retval = get_elements_by_class(class_name, html)
  266. return retval[0] if retval else None
  267. def get_element_html_by_class(class_name, html):
  268. """Return the html of the first tag with the specified class in the passed HTML document"""
  269. retval = get_elements_html_by_class(class_name, html)
  270. return retval[0] if retval else None
  271. def get_element_by_attribute(attribute, value, html, **kwargs):
  272. retval = get_elements_by_attribute(attribute, value, html, **kwargs)
  273. return retval[0] if retval else None
  274. def get_element_html_by_attribute(attribute, value, html, **kargs):
  275. retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
  276. return retval[0] if retval else None
  277. def get_elements_by_class(class_name, html, **kargs):
  278. """Return the content of all tags with the specified class in the passed HTML document as a list"""
  279. return get_elements_by_attribute(
  280. 'class', rf'[^\'"]*(?<=[\'"\s]){re.escape(class_name)}(?=[\'"\s])[^\'"]*',
  281. html, escape_value=False)
  282. def get_elements_html_by_class(class_name, html):
  283. """Return the html of all tags with the specified class in the passed HTML document as a list"""
  284. return get_elements_html_by_attribute(
  285. 'class', rf'[^\'"]*(?<=[\'"\s]){re.escape(class_name)}(?=[\'"\s])[^\'"]*',
  286. html, escape_value=False)
  287. def get_elements_by_attribute(*args, **kwargs):
  288. """Return the content of the tag with the specified attribute in the passed HTML document"""
  289. return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
  290. def get_elements_html_by_attribute(*args, **kwargs):
  291. """Return the html of the tag with the specified attribute in the passed HTML document"""
  292. return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
  293. def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
  294. """
  295. Return the text (content) and the html (whole) of the tag with the specified
  296. attribute in the passed HTML document
  297. """
  298. if not value:
  299. return
  300. quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
  301. value = re.escape(value) if escape_value else value
  302. partial_element_re = rf'''(?x)
  303. <(?P<tag>{tag})
  304. (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
  305. \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
  306. '''
  307. for m in re.finditer(partial_element_re, html):
  308. content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
  309. yield (
  310. unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
  311. whole,
  312. )
  313. class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
  314. """
  315. HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
  316. closing tag for the first opening tag it has encountered, and can be used
  317. as a context manager
  318. """
  319. class HTMLBreakOnClosingTagException(Exception):
  320. pass
  321. def __init__(self):
  322. self.tagstack = collections.deque()
  323. html.parser.HTMLParser.__init__(self)
  324. def __enter__(self):
  325. return self
  326. def __exit__(self, *_):
  327. self.close()
  328. def close(self):
  329. # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
  330. # so data remains buffered; we no longer have any interest in it, thus
  331. # override this method to discard it
  332. pass
  333. def handle_starttag(self, tag, _):
  334. self.tagstack.append(tag)
  335. def handle_endtag(self, tag):
  336. if not self.tagstack:
  337. raise compat_HTMLParseError('no tags in the stack')
  338. while self.tagstack:
  339. inner_tag = self.tagstack.pop()
  340. if inner_tag == tag:
  341. break
  342. else:
  343. raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
  344. if not self.tagstack:
  345. raise self.HTMLBreakOnClosingTagException
  346. # XXX: This should be far less strict
  347. def get_element_text_and_html_by_tag(tag, html):
  348. """
  349. For the first element with the specified tag in the passed HTML document
  350. return its' content (text) and the whole element (html)
  351. """
  352. def find_or_raise(haystack, needle, exc):
  353. try:
  354. return haystack.index(needle)
  355. except ValueError:
  356. raise exc
  357. closing_tag = f'</{tag}>'
  358. whole_start = find_or_raise(
  359. html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
  360. content_start = find_or_raise(
  361. html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
  362. content_start += whole_start + 1
  363. with HTMLBreakOnClosingTagParser() as parser:
  364. parser.feed(html[whole_start:content_start])
  365. if not parser.tagstack or parser.tagstack[0] != tag:
  366. raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
  367. offset = content_start
  368. while offset < len(html):
  369. next_closing_tag_start = find_or_raise(
  370. html[offset:], closing_tag,
  371. compat_HTMLParseError(f'closing {tag} tag not found'))
  372. next_closing_tag_end = next_closing_tag_start + len(closing_tag)
  373. try:
  374. parser.feed(html[offset:offset + next_closing_tag_end])
  375. offset += next_closing_tag_end
  376. except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
  377. return html[content_start:offset + next_closing_tag_start], \
  378. html[whole_start:offset + next_closing_tag_end]
  379. raise compat_HTMLParseError('unexpected end of html')
  380. class HTMLAttributeParser(html.parser.HTMLParser):
  381. """Trivial HTML parser to gather the attributes for a single element"""
  382. def __init__(self):
  383. self.attrs = {}
  384. html.parser.HTMLParser.__init__(self)
  385. def handle_starttag(self, tag, attrs):
  386. self.attrs = dict(attrs)
  387. raise compat_HTMLParseError('done')
  388. class HTMLListAttrsParser(html.parser.HTMLParser):
  389. """HTML parser to gather the attributes for the elements of a list"""
  390. def __init__(self):
  391. html.parser.HTMLParser.__init__(self)
  392. self.items = []
  393. self._level = 0
  394. def handle_starttag(self, tag, attrs):
  395. if tag == 'li' and self._level == 0:
  396. self.items.append(dict(attrs))
  397. self._level += 1
  398. def handle_endtag(self, tag):
  399. self._level -= 1
  400. def extract_attributes(html_element):
  401. """Given a string for an HTML element such as
  402. <el
  403. a="foo" B="bar" c="&98;az" d=boz
  404. empty= noval entity="&amp;"
  405. sq='"' dq="'"
  406. >
  407. Decode and return a dictionary of attributes.
  408. {
  409. 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
  410. 'empty': '', 'noval': None, 'entity': '&',
  411. 'sq': '"', 'dq': '\''
  412. }.
  413. """
  414. parser = HTMLAttributeParser()
  415. with contextlib.suppress(compat_HTMLParseError):
  416. parser.feed(html_element)
  417. parser.close()
  418. return parser.attrs
  419. def parse_list(webpage):
  420. """Given a string for an series of HTML <li> elements,
  421. return a dictionary of their attributes"""
  422. parser = HTMLListAttrsParser()
  423. parser.feed(webpage)
  424. parser.close()
  425. return parser.items
  426. def clean_html(html):
  427. """Clean an HTML snippet into a readable string"""
  428. if html is None: # Convenience for sanitizing descriptions etc.
  429. return html
  430. html = re.sub(r'\s+', ' ', html)
  431. html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
  432. html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
  433. # Strip html tags
  434. html = re.sub('<.*?>', '', html)
  435. # Replace html entities
  436. html = unescapeHTML(html)
  437. return html.strip()
  438. class LenientJSONDecoder(json.JSONDecoder):
  439. # TODO: Write tests
  440. def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
  441. self.transform_source, self.ignore_extra = transform_source, ignore_extra
  442. self._close_attempts = 2 * close_objects
  443. super().__init__(*args, **kwargs)
  444. @staticmethod
  445. def _close_object(err):
  446. doc = err.doc[:err.pos]
  447. # We need to add comma first to get the correct error message
  448. if err.msg.startswith('Expecting \',\''):
  449. return doc + ','
  450. elif not doc.endswith(','):
  451. return
  452. if err.msg.startswith('Expecting property name'):
  453. return doc[:-1] + '}'
  454. elif err.msg.startswith('Expecting value'):
  455. return doc[:-1] + ']'
  456. def decode(self, s):
  457. if self.transform_source:
  458. s = self.transform_source(s)
  459. for attempt in range(self._close_attempts + 1):
  460. try:
  461. if self.ignore_extra:
  462. return self.raw_decode(s.lstrip())[0]
  463. return super().decode(s)
  464. except json.JSONDecodeError as e:
  465. if e.pos is None:
  466. raise
  467. elif attempt < self._close_attempts:
  468. s = self._close_object(e)
  469. if s is not None:
  470. continue
  471. raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
  472. assert False, 'Too many attempts to decode JSON'
  473. def sanitize_open(filename, open_mode):
  474. """Try to open the given filename, and slightly tweak it if this fails.
  475. Attempts to open the given filename. If this fails, it tries to change
  476. the filename slightly, step by step, until it's either able to open it
  477. or it fails and raises a final exception, like the standard open()
  478. function.
  479. It returns the tuple (stream, definitive_file_name).
  480. """
  481. if filename == '-':
  482. if sys.platform == 'win32':
  483. import msvcrt
  484. # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
  485. with contextlib.suppress(io.UnsupportedOperation):
  486. msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
  487. return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
  488. for attempt in range(2):
  489. try:
  490. try:
  491. if sys.platform == 'win32':
  492. # FIXME: An exclusive lock also locks the file from being read.
  493. # Since windows locks are mandatory, don't lock the file on windows (for now).
  494. # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
  495. raise LockingUnsupportedError
  496. stream = locked_file(filename, open_mode, block=False).__enter__()
  497. except OSError:
  498. stream = open(filename, open_mode)
  499. return stream, filename
  500. except OSError as err:
  501. if attempt or err.errno in (errno.EACCES,):
  502. raise
  503. old_filename, filename = filename, sanitize_path(filename)
  504. if old_filename == filename:
  505. raise
  506. def timeconvert(timestr):
  507. """Convert RFC 2822 defined time string into system timestamp"""
  508. timestamp = None
  509. timetuple = email.utils.parsedate_tz(timestr)
  510. if timetuple is not None:
  511. timestamp = email.utils.mktime_tz(timetuple)
  512. return timestamp
  513. def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
  514. """Sanitizes a string so it could be used as part of a filename.
  515. @param restricted Use a stricter subset of allowed characters
  516. @param is_id Whether this is an ID that should be kept unchanged if possible.
  517. If unset, yt-dlp's new sanitization rules are in effect
  518. """
  519. if s == '':
  520. return ''
  521. def replace_insane(char):
  522. if restricted and char in ACCENT_CHARS:
  523. return ACCENT_CHARS[char]
  524. elif not restricted and char == '\n':
  525. return '\0 '
  526. elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
  527. # Replace with their full-width unicode counterparts
  528. return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
  529. elif char == '?' or ord(char) < 32 or ord(char) == 127:
  530. return ''
  531. elif char == '"':
  532. return '' if restricted else '\''
  533. elif char == ':':
  534. return '\0_\0-' if restricted else '\0 \0-'
  535. elif char in '\\/|*<>':
  536. return '\0_'
  537. if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
  538. return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
  539. return char
  540. # Replace look-alike Unicode glyphs
  541. if restricted and (is_id is NO_DEFAULT or not is_id):
  542. s = unicodedata.normalize('NFKC', s)
  543. s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
  544. result = ''.join(map(replace_insane, s))
  545. if is_id is NO_DEFAULT:
  546. result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
  547. STRIP_RE = r'(?:\0.|[ _-])*'
  548. result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
  549. result = result.replace('\0', '') or '_'
  550. if not is_id:
  551. while '__' in result:
  552. result = result.replace('__', '_')
  553. result = result.strip('_')
  554. # Common case of "Foreign band name - English song title"
  555. if restricted and result.startswith('-_'):
  556. result = result[2:]
  557. if result.startswith('-'):
  558. result = '_' + result[len('-'):]
  559. result = result.lstrip('.')
  560. if not result:
  561. result = '_'
  562. return result
  563. def _sanitize_path_parts(parts):
  564. sanitized_parts = []
  565. for part in parts:
  566. if not part or part == '.':
  567. continue
  568. elif part == '..':
  569. if sanitized_parts and sanitized_parts[-1] != '..':
  570. sanitized_parts.pop()
  571. else:
  572. sanitized_parts.append('..')
  573. continue
  574. # Replace invalid segments with `#`
  575. # - trailing dots and spaces (`asdf...` => `asdf..#`)
  576. # - invalid chars (`<>` => `##`)
  577. sanitized_part = re.sub(r'[/<>:"\|\\?\*]|[\s.]$', '#', part)
  578. sanitized_parts.append(sanitized_part)
  579. return sanitized_parts
  580. def sanitize_path(s, force=False):
  581. """Sanitizes and normalizes path on Windows"""
  582. if sys.platform != 'win32':
  583. if not force:
  584. return s
  585. root = '/' if s.startswith('/') else ''
  586. path = '/'.join(_sanitize_path_parts(s.split('/')))
  587. return root + path if root or path else '.'
  588. normed = s.replace('/', '\\')
  589. if normed.startswith('\\\\'):
  590. # UNC path (`\\SERVER\SHARE`) or device path (`\\.`, `\\?`)
  591. parts = normed.split('\\')
  592. root = '\\'.join(parts[:4]) + '\\'
  593. parts = parts[4:]
  594. elif normed[1:2] == ':':
  595. # absolute path or drive relative path
  596. offset = 3 if normed[2:3] == '\\' else 2
  597. root = normed[:offset]
  598. parts = normed[offset:].split('\\')
  599. else:
  600. # relative/drive root relative path
  601. root = '\\' if normed[:1] == '\\' else ''
  602. parts = normed.split('\\')
  603. path = '\\'.join(_sanitize_path_parts(parts))
  604. return root + path if root or path else '.'
  605. def sanitize_url(url, *, scheme='http'):
  606. # Prepend protocol-less URLs with `http:` scheme in order to mitigate
  607. # the number of unwanted failures due to missing protocol
  608. if url is None:
  609. return
  610. elif url.startswith('//'):
  611. return f'{scheme}:{url}'
  612. # Fix some common typos seen so far
  613. COMMON_TYPOS = (
  614. # https://github.com/ytdl-org/youtube-dl/issues/15649
  615. (r'^httpss://', r'https://'),
  616. # https://bx1.be/lives/direct-tv/
  617. (r'^rmtp([es]?)://', r'rtmp\1://'),
  618. )
  619. for mistake, fixup in COMMON_TYPOS:
  620. if re.match(mistake, url):
  621. return re.sub(mistake, fixup, url)
  622. return url
  623. def extract_basic_auth(url):
  624. parts = urllib.parse.urlsplit(url)
  625. if parts.username is None:
  626. return url, None
  627. url = urllib.parse.urlunsplit(parts._replace(netloc=(
  628. parts.hostname if parts.port is None
  629. else f'{parts.hostname}:{parts.port}')))
  630. auth_payload = base64.b64encode(
  631. ('{}:{}'.format(parts.username, parts.password or '')).encode())
  632. return url, f'Basic {auth_payload.decode()}'
  633. def expand_path(s):
  634. """Expand shell variables and ~"""
  635. return os.path.expandvars(compat_expanduser(s))
  636. def orderedSet(iterable, *, lazy=False):
  637. """Remove all duplicates from the input iterable"""
  638. def _iter():
  639. seen = [] # Do not use set since the items can be unhashable
  640. for x in iterable:
  641. if x not in seen:
  642. seen.append(x)
  643. yield x
  644. return _iter() if lazy else list(_iter())
  645. def _htmlentity_transform(entity_with_semicolon):
  646. """Transforms an HTML entity to a character."""
  647. entity = entity_with_semicolon[:-1]
  648. # Known non-numeric HTML entity
  649. if entity in html.entities.name2codepoint:
  650. return chr(html.entities.name2codepoint[entity])
  651. # TODO: HTML5 allows entities without a semicolon.
  652. # E.g. '&Eacuteric' should be decoded as 'Éric'.
  653. if entity_with_semicolon in html.entities.html5:
  654. return html.entities.html5[entity_with_semicolon]
  655. mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
  656. if mobj is not None:
  657. numstr = mobj.group(1)
  658. if numstr.startswith('x'):
  659. base = 16
  660. numstr = f'0{numstr}'
  661. else:
  662. base = 10
  663. # See https://github.com/ytdl-org/youtube-dl/issues/7518
  664. with contextlib.suppress(ValueError):
  665. return chr(int(numstr, base))
  666. # Unknown entity in name, return its literal representation
  667. return f'&{entity};'
  668. def unescapeHTML(s):
  669. if s is None:
  670. return None
  671. assert isinstance(s, str)
  672. return re.sub(
  673. r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
  674. def escapeHTML(text):
  675. return (
  676. text
  677. .replace('&', '&amp;')
  678. .replace('<', '&lt;')
  679. .replace('>', '&gt;')
  680. .replace('"', '&quot;')
  681. .replace("'", '&#39;')
  682. )
  683. class netrc_from_content(netrc.netrc):
  684. def __init__(self, content):
  685. self.hosts, self.macros = {}, {}
  686. with io.StringIO(content) as stream:
  687. self._parse('-', stream, False)
  688. class Popen(subprocess.Popen):
  689. if sys.platform == 'win32':
  690. _startupinfo = subprocess.STARTUPINFO()
  691. _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
  692. else:
  693. _startupinfo = None
  694. @staticmethod
  695. def _fix_pyinstaller_issues(env):
  696. if not hasattr(sys, '_MEIPASS'):
  697. return
  698. # Force spawning independent subprocesses for exes bundled with PyInstaller>=6.10
  699. # Ref: https://pyinstaller.org/en/v6.10.0/CHANGES.html#incompatible-changes
  700. # https://github.com/yt-dlp/yt-dlp/issues/11259
  702. # Restore LD_LIBRARY_PATH when using PyInstaller
  703. # Ref: https://pyinstaller.org/en/v6.10.0/runtime-information.html#ld-library-path-libpath-considerations
  704. # https://github.com/yt-dlp/yt-dlp/issues/4573
  705. def _fix(key):
  706. orig = env.get(f'{key}_ORIG')
  707. if orig is None:
  708. env.pop(key, None)
  709. else:
  710. env[key] = orig
  711. _fix('LD_LIBRARY_PATH') # Linux
  712. _fix('DYLD_LIBRARY_PATH') # macOS
  713. def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
  714. if env is None:
  715. env = os.environ.copy()
  716. self._fix_pyinstaller_issues(env)
  717. self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
  718. if text is True:
  719. kwargs['universal_newlines'] = True # For 3.6 compatibility
  720. kwargs.setdefault('encoding', 'utf-8')
  721. kwargs.setdefault('errors', 'replace')
  722. if shell and os.name == 'nt' and kwargs.get('executable') is None:
  723. if not isinstance(args, str):
  724. args = shell_quote(args, shell=True)
  725. shell = False
  726. # Set variable for `cmd.exe` newline escaping (see `utils.shell_quote`)
  727. env['='] = '"^\n\n"'
  728. args = f'{self.__comspec()} /Q /S /D /V:OFF /E:ON /C "{args}"'
  729. super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)
  730. def __comspec(self):
  731. comspec = os.environ.get('ComSpec') or os.path.join(
  732. os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
  733. if os.path.isabs(comspec):
  734. return comspec
  735. raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')
  736. def communicate_or_kill(self, *args, **kwargs):
  737. try:
  738. return self.communicate(*args, **kwargs)
  739. except BaseException: # Including KeyboardInterrupt
  740. self.kill(timeout=None)
  741. raise
  742. def kill(self, *, timeout=0):
  743. super().kill()
  744. if timeout != 0:
  745. self.wait(timeout=timeout)
  746. @classmethod
  747. def run(cls, *args, timeout=None, **kwargs):
  748. with cls(*args, **kwargs) as proc:
  749. default = '' if proc.__text_mode else b''
  750. stdout, stderr = proc.communicate_or_kill(timeout=timeout)
  751. return stdout or default, stderr or default, proc.returncode
  752. def encodeArgument(s):
  753. # Legacy code that uses byte strings
  754. # Uncomment the following line after fixing all post processors
  755. # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
  756. return s if isinstance(s, str) else s.decode('ascii')
  757. _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
  758. def timetuple_from_msec(msec):
  759. secs, msec = divmod(msec, 1000)
  760. mins, secs = divmod(secs, 60)
  761. hrs, mins = divmod(mins, 60)
  762. return _timetuple(hrs, mins, secs, msec)
  763. def formatSeconds(secs, delim=':', msec=False):
  764. time = timetuple_from_msec(secs * 1000)
  765. if time.hours:
  766. ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
  767. elif time.minutes:
  768. ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
  769. else:
  770. ret = '%d' % time.seconds
  771. return '%s.%03d' % (ret, time.milliseconds) if msec else ret
  772. def bug_reports_message(before=';'):
  773. from ..update import REPOSITORY
  774. msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
  775. 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
  776. before = before.rstrip()
  777. if not before or before.endswith(('.', '!', '?')):
  778. msg = msg[0].title() + msg[1:]
  779. return (before + ' ' if before else '') + msg
  780. class YoutubeDLError(Exception):
  781. """Base exception for YoutubeDL errors."""
  782. msg = None
  783. def __init__(self, msg=None):
  784. if msg is not None:
  785. self.msg = msg
  786. elif self.msg is None:
  787. self.msg = type(self).__name__
  788. super().__init__(self.msg)
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause, video_id and ie, when given, are woven into the final message (see __msg).
        """
        from ..networking.exceptions import network_exceptions
        # An error raised while a network exception is being handled is always treated as expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        # NOTE: `msg` is still unset (None on the base class) while these attributes are
        # assigned, so __setattr__ does not rebuild the message until super().__init__ ran
        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When wrapping another ExtractorError, keep the innermost exc_info
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: orig_msg (caused by ...)" followed by the
        # bug report hint unless the error is expected
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback and/or cause as one string, or None if neither is available"""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            # [1:] drops the first line produced by format_exception
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once a message exists, changing any other attribute rebuilds `msg` and `args`.
        # 'msg' and 'args' themselves are excluded so the assignments below do not recurse.
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
  826. class UnsupportedError(ExtractorError):
  827. def __init__(self, url):
  828. super().__init__(
  829. f'Unsupported URL: {url}', expected=True)
  830. self.url = url
  831. class RegexNotFoundError(ExtractorError):
  832. """Error when a regex didn't match"""
  833. pass
  834. class GeoRestrictedError(ExtractorError):
  835. """Geographic restriction Error exception.
  836. This exception may be thrown when a video is not available from your
  837. geographic location due to geographic restrictions imposed by a website.
  838. """
  839. def __init__(self, msg, countries=None, **kwargs):
  840. kwargs['expected'] = True
  841. super().__init__(msg, **kwargs)
  842. self.countries = countries
  843. class UserNotLive(ExtractorError):
  844. """Error when a channel/user is not live"""
  845. def __init__(self, msg=None, **kwargs):
  846. kwargs['expected'] = True
  847. super().__init__(msg or 'The channel is not currently live', **kwargs)
  848. class DownloadError(YoutubeDLError):
  849. """Download Error exception.
  850. This exception may be thrown by FileDownloader objects if they are not
  851. configured to continue on errors. They will contain the appropriate
  852. error message.
  853. """
  854. def __init__(self, msg, exc_info=None):
  855. """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
  856. super().__init__(msg)
  857. self.exc_info = exc_info
  858. class EntryNotInPlaylist(YoutubeDLError):
  859. """Entry not in playlist exception.
  860. This exception will be thrown by YoutubeDL when a requested entry
  861. is not found in the playlist info_dict
  862. """
  863. msg = 'Entry not found in info'
  864. class SameFileError(YoutubeDLError):
  865. """Same File exception.
  866. This exception will be thrown by FileDownloader objects if they detect
  867. multiple files would have to be downloaded to the same file on disk.
  868. """
  869. msg = 'Fixed output name but more than one file to download'
  870. def __init__(self, filename=None):
  871. if filename is not None:
  872. self.msg += f': {filename}'
  873. super().__init__(self.msg)
  874. class PostProcessingError(YoutubeDLError):
  875. """Post Processing exception.
  876. This exception may be raised by PostProcessor's .run() method to
  877. indicate an error in the postprocessing task.
  878. """
  879. class DownloadCancelled(YoutubeDLError):
  880. """ Exception raised when the download queue should be interrupted """
  881. msg = 'The download was cancelled'
  882. class ExistingVideoReached(DownloadCancelled):
  883. """ --break-on-existing triggered """
  884. msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
  885. class RejectedVideoReached(DownloadCancelled):
  886. """ --break-match-filter triggered """
  887. msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
  888. class MaxDownloadsReached(DownloadCancelled):
  889. """ --max-downloads limit has been reached. """
  890. msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
  891. class ReExtractInfo(YoutubeDLError):
  892. """ Video info needs to be re-extracted. """
  893. def __init__(self, msg, expected=False):
  894. super().__init__(msg)
  895. self.expected = expected
  896. class ThrottledDownload(ReExtractInfo):
  897. """ Download speed below --throttled-rate. """
  898. msg = 'The download speed is below throttle limit'
  899. def __init__(self):
  900. super().__init__(self.msg, expected=False)
  901. class UnavailableVideoError(YoutubeDLError):
  902. """Unavailable Format exception.
  903. This exception will be thrown when a video is requested
  904. in a format that is not available for that video.
  905. """
  906. msg = 'Unable to download video'
  907. def __init__(self, err=None):
  908. if err is not None:
  909. self.msg += f': {err}'
  910. super().__init__(self.msg)
  911. class ContentTooShortError(YoutubeDLError):
  912. """Content Too Short exception.
  913. This exception may be raised by FileDownloader objects when a file they
  914. download is too small for what the server announced first, indicating
  915. the connection was probably interrupted.
  916. """
  917. def __init__(self, downloaded, expected):
  918. super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
  919. # Both in bytes
  920. self.downloaded = downloaded
  921. self.expected = expected
  922. class XAttrMetadataError(YoutubeDLError):
  923. def __init__(self, code=None, msg='Unknown error'):
  924. super().__init__(msg)
  925. self.code = code
  926. self.msg = msg
  927. # Parsing code and msg
  928. if (self.code in (errno.ENOSPC, errno.EDQUOT)
  929. or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
  930. self.reason = 'NO_SPACE'
  931. elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
  932. self.reason = 'VALUE_TOO_LONG'
  933. else:
  934. self.reason = 'NOT_SUPPORTED'
  935. class XAttrUnavailableError(YoutubeDLError):
  936. pass
  937. def is_path_like(f):
  938. return isinstance(f, (str, bytes, os.PathLike))
def extract_timezone(date_str, default=None):
    """
    Split a trailing timezone designator off a date string.

    @param date_str  Date string, possibly ending in 'Z', a numeric '+hh[:]mm' / '-hh[:]mm'
                     offset, or a timezone abbreviation following a time
    @param default   Offset to return when none is found. A falsy value (other than
                     NO_DEFAULT) means a zero offset; NO_DEFAULT keeps the result as None
    @returns         (timezone, date_str): the offset as a timedelta (or None) and the
                     date string with the recognized timezone part removed
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    timezone = None

    if not m:
        # No 'Z'/numeric offset: look for an upper-case abbreviation right after a time
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        # TIMEZONE_NAMES values are used as hour offsets below; `m and ...` yields a
        # lookup of None (i.e. no timezone) when the fallback pattern did not match
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            # Only strip the abbreviation when it is a known timezone name
            date_str = date_str[:-len(m.group('tz'))]
            timezone = dt.timedelta(hours=timezone)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        # A bare 'Z' has no sign group: the timezone stays None here and the default applies
        if m.group('sign'):
            sign = 1 if m.group('sign') == '+' else -1
            timezone = dt.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))

    if timezone is None and default is not NO_DEFAULT:
        timezone = default or dt.timedelta()

    return timezone, date_str
  968. @partial_application
  969. def parse_iso8601(date_str, delimiter='T', timezone=None):
  970. """ Return a UNIX timestamp from the given date """
  971. if date_str is None:
  972. return None
  973. date_str = re.sub(r'\.[0-9]+', '', date_str)
  974. timezone, date_str = extract_timezone(date_str, timezone)
  975. with contextlib.suppress(ValueError, TypeError):
  976. date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
  977. dt_ = dt.datetime.strptime(date_str, date_format) - timezone
  978. return calendar.timegm(dt_.timetuple())
  979. def date_formats(day_first=True):
  981. def unified_strdate(date_str, day_first=True):
  982. """Return a string with the date in the format YYYYMMDD"""
  983. if date_str is None:
  984. return None
  985. upload_date = None
  986. # Replace commas
  987. date_str = date_str.replace(',', ' ')
  988. # Remove AM/PM + timezone
  989. date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
  990. _, date_str = extract_timezone(date_str)
  991. for expression in date_formats(day_first):
  992. with contextlib.suppress(ValueError):
  993. upload_date = dt.datetime.strptime(date_str, expression).strftime('%Y%m%d')
  994. if upload_date is None:
  995. timetuple = email.utils.parsedate_tz(date_str)
  996. if timetuple:
  997. with contextlib.suppress(ValueError):
  998. upload_date = dt.datetime(*timetuple[:6]).strftime('%Y%m%d')
  999. if upload_date is not None:
  1000. return str(upload_date)
  1001. def unified_timestamp(date_str, day_first=True):
  1002. if not isinstance(date_str, str):
  1003. return None
  1004. date_str = re.sub(r'\s+', ' ', re.sub(
  1005. r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str))
  1006. pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
  1007. timezone, date_str = extract_timezone(date_str)
  1008. # Remove AM/PM + timezone
  1009. date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
  1010. # Remove unrecognized timezones from ISO 8601 alike timestamps
  1011. m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
  1012. if m:
  1013. date_str = date_str[:-len(m.group('tz'))]
  1014. # Python only supports microseconds, so remove nanoseconds
  1015. m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
  1016. if m:
  1017. date_str = m.group(1)
  1018. for expression in date_formats(day_first):
  1019. with contextlib.suppress(ValueError):
  1020. dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
  1021. return calendar.timegm(dt_.timetuple())
  1022. timetuple = email.utils.parsedate_tz(date_str)
  1023. if timetuple:
  1024. return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
  1025. @partial_application
  1026. def determine_ext(url, default_ext='unknown_video'):
  1027. if url is None or '.' not in url:
  1028. return default_ext
  1029. guess = url.partition('?')[0].rpartition('.')[2]
  1030. if re.match(r'^[A-Za-z0-9]+$', guess):
  1031. return guess
  1032. # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
  1033. elif guess.rstrip('/') in KNOWN_EXTENSIONS:
  1034. return guess.rstrip('/')
  1035. else:
  1036. return default_ext
  1037. def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
  1038. return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
  1039. def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
  1040. R"""
  1041. Return a datetime object from a string.
  1042. Supported format:
  1043. (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
  1044. @param format strftime format of DATE
  1045. @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
  1046. auto: round to the unit provided in date_str (if applicable).
  1047. """
  1048. auto_precision = False
  1049. if precision == 'auto':
  1050. auto_precision = True
  1051. precision = 'microsecond'
  1052. today = datetime_round(dt.datetime.now(dt.timezone.utc), precision)
  1053. if date_str in ('now', 'today'):
  1054. return today
  1055. if date_str == 'yesterday':
  1056. return today - dt.timedelta(days=1)
  1057. match = re.match(
  1058. r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
  1059. date_str)
  1060. if match is not None:
  1061. start_time = datetime_from_str(match.group('start'), precision, format)
  1062. time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
  1063. unit = match.group('unit')
  1064. if unit == 'month' or unit == 'year':
  1065. new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
  1066. unit = 'day'
  1067. else:
  1068. if unit == 'week':
  1069. unit = 'day'
  1070. time *= 7
  1071. delta = dt.timedelta(**{unit + 's': time})
  1072. new_date = start_time + delta
  1073. if auto_precision:
  1074. return datetime_round(new_date, unit)
  1075. return new_date
  1076. return datetime_round(dt.datetime.strptime(date_str, format), precision)
  1077. def date_from_str(date_str, format='%Y%m%d', strict=False):
  1078. R"""
  1079. Return a date object from a string using datetime_from_str
  1080. @param strict Restrict allowed patterns to "YYYYMMDD" and
  1081. (now|today|yesterday)(-\d+(day|week|month|year)s?)?
  1082. """
  1083. if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
  1084. raise ValueError(f'Invalid date format "{date_str}"')
  1085. return datetime_from_str(date_str, precision='microsecond', format=format).date()
  1086. def datetime_add_months(dt_, months):
  1087. """Increment/Decrement a datetime object by months."""
  1088. month = dt_.month + months - 1
  1089. year = dt_.year + month // 12
  1090. month = month % 12 + 1
  1091. day = min(dt_.day, calendar.monthrange(year, month)[1])
  1092. return dt_.replace(year, month, day)
  1093. def datetime_round(dt_, precision='day'):
  1094. """
  1095. Round a datetime object's time to a specific precision
  1096. """
  1097. if precision == 'microsecond':
  1098. return dt_
  1099. unit_seconds = {
  1100. 'day': 86400,
  1101. 'hour': 3600,
  1102. 'minute': 60,
  1103. 'second': 1,
  1104. }
  1105. roundto = lambda x, n: ((x + n / 2) // n) * n
  1106. timestamp = roundto(calendar.timegm(dt_.timetuple()), unit_seconds[precision])
  1107. return dt.datetime.fromtimestamp(timestamp, dt.timezone.utc)
  1108. def hyphenate_date(date_str):
  1109. """
  1110. Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
  1111. match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
  1112. if match is not None:
  1113. return '-'.join(match.groups())
  1114. else:
  1115. return date_str
  1116. class DateRange:
  1117. """Represents a time interval between two dates"""
  1118. def __init__(self, start=None, end=None):
  1119. """start and end must be strings in the format accepted by date"""
  1120. if start is not None:
  1121. self.start = date_from_str(start, strict=True)
  1122. else:
  1123. self.start = dt.datetime.min.date()
  1124. if end is not None:
  1125. self.end = date_from_str(end, strict=True)
  1126. else:
  1127. self.end = dt.datetime.max.date()
  1128. if self.start > self.end:
  1129. raise ValueError(f'Date range: "{self}" , the start date must be before the end date')
  1130. @classmethod
  1131. def day(cls, day):
  1132. """Returns a range that only contains the given day"""
  1133. return cls(day, day)
  1134. def __contains__(self, date):
  1135. """Check if the date is in the range"""
  1136. if not isinstance(date, dt.date):
  1137. date = date_from_str(date)
  1138. return self.start <= date <= self.end
  1139. def __repr__(self):
  1140. return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
  1141. def __str__(self):
  1142. return f'{self.start} to {self.end}'
  1143. def __eq__(self, other):
  1144. return (isinstance(other, DateRange)
  1145. and self.start == other.start and self.end == other.end)
  1146. @functools.cache
  1147. def system_identifier():
  1148. python_implementation = platform.python_implementation()
  1149. if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
  1150. python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
  1151. libc_ver = []
  1152. with contextlib.suppress(OSError): # We may not have access to the executable
  1153. libc_ver = platform.libc_ver()
  1154. return 'Python {} ({} {} {}) - {} ({}{})'.format(
  1155. platform.python_version(),
  1156. python_implementation,
  1157. platform.machine(),
  1158. platform.architecture()[0],
  1159. platform.platform(),
  1160. ssl.OPENSSL_VERSION,
  1161. format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
  1162. )
  1163. @functools.cache
  1164. def get_windows_version():
  1165. """ Get Windows version. returns () if it's not running on Windows """
  1166. if os.name == 'nt':
  1167. return version_tuple(platform.win32_ver()[1])
  1168. else:
  1169. return ()
  1170. def write_string(s, out=None, encoding=None):
  1171. assert isinstance(s, str)
  1172. out = out or sys.stderr
  1173. # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
  1174. if not out:
  1175. return
  1176. if os.name == 'nt' and supports_terminal_sequences(out):
  1177. s = re.sub(r'([\r\n]+)', r' \1', s)
  1178. enc, buffer = None, out
  1179. # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
  1180. if 'b' in (getattr(out, 'mode', None) or ''):
  1181. enc = encoding or preferredencoding()
  1182. elif hasattr(out, 'buffer'):
  1183. buffer = out.buffer
  1184. enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
  1185. buffer.write(s.encode(enc, 'ignore') if enc else s)
  1186. out.flush()
  1187. # TODO: Use global logger
  1188. def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
  1189. from .. import _IN_CLI
  1190. if _IN_CLI:
  1191. if msg in deprecation_warning._cache:
  1192. return
  1193. deprecation_warning._cache.add(msg)
  1194. if printer:
  1195. return printer(f'{msg}{bug_reports_message()}', **kwargs)
  1196. return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
  1197. else:
  1198. import warnings
  1199. warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
  1200. deprecation_warning._cache = set()
  1201. class LockingUnsupportedError(OSError):
  1202. msg = 'File locking is not supported'
  1203. def __init__(self):
  1204. super().__init__(self.msg)
  1205. # Cross-platform file locking
  1206. if sys.platform == 'win32':
  1207. import ctypes
  1208. import ctypes.wintypes
  1209. import msvcrt
  1210. class OVERLAPPED(ctypes.Structure):
  1211. _fields_ = [
  1212. ('Internal', ctypes.wintypes.LPVOID),
  1213. ('InternalHigh', ctypes.wintypes.LPVOID),
  1214. ('Offset', ctypes.wintypes.DWORD),
  1215. ('OffsetHigh', ctypes.wintypes.DWORD),
  1216. ('hEvent', ctypes.wintypes.HANDLE),
  1217. ]
  1218. kernel32 = ctypes.WinDLL('kernel32')
  1219. LockFileEx = kernel32.LockFileEx
  1220. LockFileEx.argtypes = [
  1221. ctypes.wintypes.HANDLE, # hFile
  1222. ctypes.wintypes.DWORD, # dwFlags
  1223. ctypes.wintypes.DWORD, # dwReserved
  1224. ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
  1225. ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
  1226. ctypes.POINTER(OVERLAPPED), # Overlapped
  1227. ]
  1228. LockFileEx.restype = ctypes.wintypes.BOOL
  1229. UnlockFileEx = kernel32.UnlockFileEx
  1230. UnlockFileEx.argtypes = [
  1231. ctypes.wintypes.HANDLE, # hFile
  1232. ctypes.wintypes.DWORD, # dwReserved
  1233. ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
  1234. ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
  1235. ctypes.POINTER(OVERLAPPED), # Overlapped
  1236. ]
  1237. UnlockFileEx.restype = ctypes.wintypes.BOOL
  1238. whole_low = 0xffffffff
  1239. whole_high = 0x7fffffff
  1240. def _lock_file(f, exclusive, block):
  1241. overlapped = OVERLAPPED()
  1242. overlapped.Offset = 0
  1243. overlapped.OffsetHigh = 0
  1244. overlapped.hEvent = 0
  1245. f._lock_file_overlapped_p = ctypes.pointer(overlapped)
  1246. if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
  1247. (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
  1248. 0, whole_low, whole_high, f._lock_file_overlapped_p):
  1249. # NB: No argument form of "ctypes.FormatError" does not work on PyPy
  1250. raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
  1251. def _unlock_file(f):
  1252. assert f._lock_file_overlapped_p
  1253. handle = msvcrt.get_osfhandle(f.fileno())
  1254. if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
  1255. raise OSError(f'Unlocking file failed: {ctypes.FormatError()!r}')
  1256. else:
  1257. try:
  1258. import fcntl
  1259. def _lock_file(f, exclusive, block):
  1260. flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
  1261. if not block:
  1262. flags |= fcntl.LOCK_NB
  1263. try:
  1264. fcntl.flock(f, flags)
  1265. except BlockingIOError:
  1266. raise
  1267. except OSError: # AOSP does not have flock()
  1268. fcntl.lockf(f, flags)
  1269. def _unlock_file(f):
  1270. with contextlib.suppress(OSError):
  1271. return fcntl.flock(f, fcntl.LOCK_UN)
  1272. with contextlib.suppress(OSError):
  1273. return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
  1274. return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
  1275. except ImportError:
  1276. def _lock_file(f, exclusive, block):
  1277. raise LockingUnsupportedError
  1278. def _unlock_file(f):
  1279. raise LockingUnsupportedError
  1280. class locked_file:
  1281. locked = False
  1282. def __init__(self, filename, mode, block=True, encoding=None):
  1283. if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
  1284. raise NotImplementedError(mode)
  1285. self.mode, self.block = mode, block
  1286. writable = any(f in mode for f in 'wax+')
  1287. readable = any(f in mode for f in 'r+')
  1288. flags = functools.reduce(operator.ior, (
  1289. getattr(os, 'O_CLOEXEC', 0), # UNIX only
  1290. getattr(os, 'O_BINARY', 0), # Windows only
  1291. getattr(os, 'O_NOINHERIT', 0), # Windows only
  1292. os.O_CREAT if writable else 0, # O_TRUNC only after locking
  1293. os.O_APPEND if 'a' in mode else 0,
  1294. os.O_EXCL if 'x' in mode else 0,
  1295. os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
  1296. ))
  1297. self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
  1298. def __enter__(self):
  1299. exclusive = 'r' not in self.mode
  1300. try:
  1301. _lock_file(self.f, exclusive, self.block)
  1302. self.locked = True
  1303. except OSError:
  1304. self.f.close()
  1305. raise
  1306. if 'w' in self.mode:
  1307. try:
  1308. self.f.truncate()
  1309. except OSError as e:
  1310. if e.errno not in (
  1311. errno.ESPIPE, # Illegal seek - expected for FIFO
  1312. errno.EINVAL, # Invalid argument - expected for /dev/null
  1313. ):
  1314. raise
  1315. return self
  1316. def unlock(self):
  1317. if not self.locked:
  1318. return
  1319. try:
  1320. _unlock_file(self.f)
  1321. finally:
  1322. self.locked = False
  1323. def __exit__(self, *_):
  1324. try:
  1325. self.unlock()
  1326. finally:
  1327. self.f.close()
  1328. open = __enter__
  1329. close = __exit__
  1330. def __getattr__(self, attr):
  1331. return getattr(self.f, attr)
  1332. def __iter__(self):
  1333. return iter(self.f)
  1334. @functools.cache
  1335. def get_filesystem_encoding():
  1336. encoding = sys.getfilesystemencoding()
  1337. return encoding if encoding is not None else 'utf-8'
  1338. _WINDOWS_QUOTE_TRANS = str.maketrans({'"': R'\"'})
  1339. _CMD_QUOTE_TRANS = str.maketrans({
  1340. # Keep quotes balanced by replacing them with `""` instead of `\\"`
  1341. '"': '""',
  1342. # These require an env-variable `=` containing `"^\n\n"` (set in `utils.Popen`)
  1343. # `=` should be unique since variables containing `=` cannot be set using cmd
  1344. '\n': '%=%',
  1345. '\r': '%=%',
  1346. # Use zero length variable replacement so `%` doesn't get expanded
  1347. # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`)
  1348. '%': '%%cd:~,%',
  1349. })
  1350. def shell_quote(args, *, shell=False):
  1351. args = list(variadic(args))
  1352. if os.name != 'nt':
  1353. return shlex.join(args)
  1354. trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS
  1355. return ' '.join(
  1356. s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII)
  1357. else re.sub(r'(\\+)("|$)', r'\1\1\2', s).translate(trans).join('""')
  1358. for s in args)
  1359. def smuggle_url(url, data):
  1360. """ Pass additional data in a URL for internal use. """
  1361. url, idata = unsmuggle_url(url, {})
  1362. data.update(idata)
  1363. sdata = urllib.parse.urlencode(
  1364. {'__youtubedl_smuggle': json.dumps(data)})
  1365. return url + '#' + sdata
  1366. def unsmuggle_url(smug_url, default=None):
  1367. if '#__youtubedl_smuggle' not in smug_url:
  1368. return smug_url, default
  1369. url, _, sdata = smug_url.rpartition('#')
  1370. jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
  1371. data = json.loads(jsond)
  1372. return url, data
  1373. def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
  1374. """ Formats numbers with decimal sufixes like K, M, etc """
  1375. num, factor = float_or_none(num), float(factor)
  1376. if num is None or num < 0:
  1377. return None
  1379. exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
  1380. suffix = ['', *POSSIBLE_SUFFIXES][exponent]
  1381. if factor == 1024:
  1382. suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
  1383. converted = num / (factor ** exponent)
  1384. return fmt % (converted, suffix)
  1385. def format_bytes(bytes):
  1386. return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
  1387. def lookup_unit_table(unit_table, s, strict=False):
  1388. num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
  1389. units_re = '|'.join(re.escape(u) for u in unit_table)
  1390. m = (re.fullmatch if strict else re.match)(
  1391. rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
  1392. if not m:
  1393. return None
  1394. num = float(m.group('num').replace(',', '.'))
  1395. mult = unit_table[m.group('unit')]
  1396. return round(num * mult)
  1397. def parse_bytes(s):
  1398. """Parse a string indicating a byte quantity into an integer"""
  1399. return lookup_unit_table(
  1400. {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
  1401. s.upper(), strict=True)
  1402. def parse_filesize(s):
  1403. if s is None:
  1404. return None
  1405. # The lower-case forms are of course incorrect and unofficial,
  1406. # but we support those too
  1407. _UNIT_TABLE = {
  1408. 'B': 1,
  1409. 'b': 1,
  1410. 'bytes': 1,
  1411. 'KiB': 1024,
  1412. 'KB': 1000,
  1413. 'kB': 1024,
  1414. 'Kb': 1000,
  1415. 'kb': 1000,
  1416. 'kilobytes': 1000,
  1417. 'kibibytes': 1024,
  1418. 'MiB': 1024 ** 2,
  1419. 'MB': 1000 ** 2,
  1420. 'mB': 1024 ** 2,
  1421. 'Mb': 1000 ** 2,
  1422. 'mb': 1000 ** 2,
  1423. 'megabytes': 1000 ** 2,
  1424. 'mebibytes': 1024 ** 2,
  1425. 'GiB': 1024 ** 3,
  1426. 'GB': 1000 ** 3,
  1427. 'gB': 1024 ** 3,
  1428. 'Gb': 1000 ** 3,
  1429. 'gb': 1000 ** 3,
  1430. 'gigabytes': 1000 ** 3,
  1431. 'gibibytes': 1024 ** 3,
  1432. 'TiB': 1024 ** 4,
  1433. 'TB': 1000 ** 4,
  1434. 'tB': 1024 ** 4,
  1435. 'Tb': 1000 ** 4,
  1436. 'tb': 1000 ** 4,
  1437. 'terabytes': 1000 ** 4,
  1438. 'tebibytes': 1024 ** 4,
  1439. 'PiB': 1024 ** 5,
  1440. 'PB': 1000 ** 5,
  1441. 'pB': 1024 ** 5,
  1442. 'Pb': 1000 ** 5,
  1443. 'pb': 1000 ** 5,
  1444. 'petabytes': 1000 ** 5,
  1445. 'pebibytes': 1024 ** 5,
  1446. 'EiB': 1024 ** 6,
  1447. 'EB': 1000 ** 6,
  1448. 'eB': 1024 ** 6,
  1449. 'Eb': 1000 ** 6,
  1450. 'eb': 1000 ** 6,
  1451. 'exabytes': 1000 ** 6,
  1452. 'exbibytes': 1024 ** 6,
  1453. 'ZiB': 1024 ** 7,
  1454. 'ZB': 1000 ** 7,
  1455. 'zB': 1024 ** 7,
  1456. 'Zb': 1000 ** 7,
  1457. 'zb': 1000 ** 7,
  1458. 'zettabytes': 1000 ** 7,
  1459. 'zebibytes': 1024 ** 7,
  1460. 'YiB': 1024 ** 8,
  1461. 'YB': 1000 ** 8,
  1462. 'yB': 1024 ** 8,
  1463. 'Yb': 1000 ** 8,
  1464. 'yb': 1000 ** 8,
  1465. 'yottabytes': 1000 ** 8,
  1466. 'yobibytes': 1024 ** 8,
  1467. }
  1468. return lookup_unit_table(_UNIT_TABLE, s)
  1469. def parse_count(s):
  1470. if s is None:
  1471. return None
  1472. s = re.sub(r'^[^\d]+\s', '', s).strip()
  1473. if re.match(r'^[\d,.]+$', s):
  1474. return str_to_int(s)
  1475. _UNIT_TABLE = {
  1476. 'k': 1000,
  1477. 'K': 1000,
  1478. 'm': 1000 ** 2,
  1479. 'M': 1000 ** 2,
  1480. 'kk': 1000 ** 2,
  1481. 'KK': 1000 ** 2,
  1482. 'b': 1000 ** 3,
  1483. 'B': 1000 ** 3,
  1484. }
  1485. ret = lookup_unit_table(_UNIT_TABLE, s)
  1486. if ret is not None:
  1487. return ret
  1488. mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
  1489. if mobj:
  1490. return str_to_int(mobj.group(1))
  1491. def parse_resolution(s, *, lenient=False):
  1492. if s is None:
  1493. return {}
  1494. if lenient:
  1495. mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
  1496. else:
  1497. mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
  1498. if mobj:
  1499. return {
  1500. 'width': int(mobj.group('w')),
  1501. 'height': int(mobj.group('h')),
  1502. }
  1503. mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
  1504. if mobj:
  1505. return {'height': int(mobj.group(1))}
  1506. mobj = re.search(r'\b([48])[kK]\b', s)
  1507. if mobj:
  1508. return {'height': int(mobj.group(1)) * 540}
  1509. return {}
  1510. def parse_bitrate(s):
  1511. if not isinstance(s, str):
  1512. return
  1513. mobj = re.search(r'\b(\d+)\s*kbps', s)
  1514. if mobj:
  1515. return int(mobj.group(1))
  1516. def month_by_name(name, lang='en'):
  1517. """ Return the number of a month by (locale-independently) English name """
  1518. month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
  1519. try:
  1520. return month_names.index(name) + 1
  1521. except ValueError:
  1522. return None
  1523. def month_by_abbreviation(abbrev):
  1524. """ Return the number of a month by (locale-independently) English
  1525. abbreviations """
  1526. try:
  1527. return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
  1528. except ValueError:
  1529. return None
  1530. def fix_xml_ampersands(xml_str):
  1531. """Replace all the '&' by '&amp;' in XML"""
  1532. return re.sub(
  1533. r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
  1534. '&amp;',
  1535. xml_str)
  1536. def setproctitle(title):
  1537. assert isinstance(title, str)
  1538. # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
  1539. try:
  1540. import ctypes
  1541. except ImportError:
  1542. return
  1543. try:
  1544. libc = ctypes.cdll.LoadLibrary('libc.so.6')
  1545. except OSError:
  1546. return
  1547. except TypeError:
  1548. # LoadLibrary in Windows Python 2.7.13 only expects
  1549. # a bytestring, but since unicode_literals turns
  1550. # every string into a unicode string, it fails.
  1551. return
  1552. title_bytes = title.encode()
  1553. buf = ctypes.create_string_buffer(len(title_bytes))
  1554. buf.value = title_bytes
  1555. try:
  1556. # PR_SET_NAME = 15 Ref: /usr/include/linux/prctl.h
  1557. libc.prctl(15, buf, 0, 0, 0)
  1558. except AttributeError:
  1559. return # Strange libc, just skip this
  1560. def remove_start(s, start):
  1561. return s[len(start):] if s is not None and s.startswith(start) else s
  1562. def remove_end(s, end):
  1563. return s[:-len(end)] if s is not None and end and s.endswith(end) else s
  1564. def remove_quotes(s):
  1565. if s is None or len(s) < 2:
  1566. return s
  1567. for quote in ('"', "'"):
  1568. if s[0] == quote and s[-1] == quote:
  1569. return s[1:-1]
  1570. return s
  1571. def get_domain(url):
  1572. """
  1573. This implementation is inconsistent, but is kept for compatibility.
  1574. Use this only for "webpage_url_domain"
  1575. """
  1576. return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
  1577. def url_basename(url):
  1578. path = urllib.parse.urlparse(url).path
  1579. return path.strip('/').split('/')[-1]
  1580. def base_url(url):
  1581. return re.match(r'https?://[^?#]+/', url).group()
  1582. @partial_application
  1583. def urljoin(base, path):
  1584. if isinstance(path, bytes):
  1585. path = path.decode()
  1586. if not isinstance(path, str) or not path:
  1587. return None
  1588. if re.match(r'(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
  1589. return path
  1590. if isinstance(base, bytes):
  1591. base = base.decode()
  1592. if not isinstance(base, str) or not re.match(
  1593. r'^(?:https?:)?//', base):
  1594. return None
  1595. return urllib.parse.urljoin(base, path)
  1596. @partial_application
  1597. def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1, base=None):
  1598. if get_attr and v is not None:
  1599. v = getattr(v, get_attr, None)
  1600. if invscale == 1 and scale < 1:
  1601. invscale = int(1 / scale)
  1602. scale = 1
  1603. try:
  1604. return (int(v) if base is None else int(v, base=base)) * invscale // scale
  1605. except (ValueError, TypeError, OverflowError):
  1606. return default
  1607. def str_or_none(v, default=None):
  1608. return default if v is None else str(v)
  1609. def str_to_int(int_str):
  1610. """ A more relaxed version of int_or_none """
  1611. if isinstance(int_str, int):
  1612. return int_str
  1613. elif isinstance(int_str, str):
  1614. int_str = re.sub(r'[,\.\+]', '', int_str)
  1615. return int_or_none(int_str)
  1616. @partial_application
  1617. def float_or_none(v, scale=1, invscale=1, default=None):
  1618. if v is None:
  1619. return default
  1620. if invscale == 1 and scale < 1:
  1621. invscale = int(1 / scale)
  1622. scale = 1
  1623. try:
  1624. return float(v) * invscale / scale
  1625. except (ValueError, TypeError):
  1626. return default
  1627. def bool_or_none(v, default=None):
  1628. return v if isinstance(v, bool) else default
  1629. def strip_or_none(v, default=None):
  1630. return v.strip() if isinstance(v, str) else default
  1631. def url_or_none(url):
  1632. if not url or not isinstance(url, str):
  1633. return None
  1634. url = url.strip()
  1635. return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
  1636. def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
  1637. datetime_object = None
  1638. try:
  1639. if isinstance(timestamp, (int, float)): # unix timestamp
  1640. # Using naive datetime here can break timestamp() in Windows
  1641. # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
  1642. # Also, dt.datetime.fromtimestamp breaks for negative timestamps
  1643. # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
  1644. datetime_object = (dt.datetime.fromtimestamp(0, dt.timezone.utc)
  1645. + dt.timedelta(seconds=timestamp))
  1646. elif isinstance(timestamp, str): # assume YYYYMMDD
  1647. datetime_object = dt.datetime.strptime(timestamp, '%Y%m%d')
  1648. date_format = re.sub( # Support %s on windows
  1649. r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
  1650. return datetime_object.strftime(date_format)
  1651. except (ValueError, TypeError, AttributeError):
  1652. return default
  1653. def parse_duration(s):
  1654. if not isinstance(s, str):
  1655. return None
  1656. s = s.strip()
  1657. if not s:
  1658. return None
  1659. days, hours, mins, secs, ms = [None] * 5
  1660. m = re.match(r'''(?x)
  1661. (?P<before_secs>
  1662. (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
  1663. (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
  1664. (?P<ms>[.:][0-9]+)?Z?$
  1665. ''', s)
  1666. if m:
  1667. days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
  1668. else:
  1669. m = re.match(
  1670. r'''(?ix)(?:P?
  1671. (?:
  1672. [0-9]+\s*y(?:ears?)?,?\s*
  1673. )?
  1674. (?:
  1675. [0-9]+\s*m(?:onths?)?,?\s*
  1676. )?
  1677. (?:
  1678. [0-9]+\s*w(?:eeks?)?,?\s*
  1679. )?
  1680. (?:
  1681. (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
  1682. )?
  1683. T)?
  1684. (?:
  1685. (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
  1686. )?
  1687. (?:
  1688. (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
  1689. )?
  1690. (?:
  1691. (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
  1692. )?Z?$''', s)
  1693. if m:
  1694. days, hours, mins, secs, ms = m.groups()
  1695. else:
  1696. m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
  1697. if m:
  1698. hours, mins = m.groups()
  1699. else:
  1700. return None
  1701. if ms:
  1702. ms = ms.replace(':', '.')
  1703. return sum(float(part or 0) * mult for part, mult in (
  1704. (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
  1705. def _change_extension(prepend, filename, ext, expected_real_ext=None):
  1706. name, real_ext = os.path.splitext(filename)
  1707. if not expected_real_ext or real_ext[1:] == expected_real_ext:
  1708. filename = name
  1709. if prepend and real_ext:
  1710. _UnsafeExtensionError.sanitize_extension(ext, prepend=True)
  1711. return f'{filename}.{ext}{real_ext}'
  1712. return f'{filename}.{_UnsafeExtensionError.sanitize_extension(ext)}'
  1713. prepend_extension = functools.partial(_change_extension, True)
  1714. replace_extension = functools.partial(_change_extension, False)
  1715. def check_executable(exe, args=[]):
  1716. """ Checks if the given binary is installed somewhere in PATH, and returns its name.
  1717. args can be a list of arguments for a short output (like -version) """
  1718. try:
  1719. Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  1720. except OSError:
  1721. return False
  1722. return exe
  1723. def _get_exe_version_output(exe, args):
  1724. try:
  1725. # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
  1726. # SIGTTOU if yt-dlp is run in the background.
  1727. # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
  1728. stdout, _, ret = Popen.run([encodeArgument(exe), *args], text=True,
  1729. stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  1730. if ret:
  1731. return None
  1732. except OSError:
  1733. return False
  1734. return stdout
  1735. def detect_exe_version(output, version_re=None, unrecognized='present'):
  1736. assert isinstance(output, str)
  1737. if version_re is None:
  1738. version_re = r'version\s+([-0-9._a-zA-Z]+)'
  1739. m = re.search(version_re, output)
  1740. if m:
  1741. return m.group(1)
  1742. else:
  1743. return unrecognized
  1744. def get_exe_version(exe, args=['--version'],
  1745. version_re=None, unrecognized=('present', 'broken')):
  1746. """ Returns the version of the specified executable,
  1747. or False if the executable is not present """
  1748. unrecognized = variadic(unrecognized)
  1749. assert len(unrecognized) in (1, 2)
  1750. out = _get_exe_version_output(exe, args)
  1751. if out is None:
  1752. return unrecognized[-1]
  1753. return out and detect_exe_version(out, version_re, unrecognized[0])
  1754. def frange(start=0, stop=None, step=1):
  1755. """Float range"""
  1756. if stop is None:
  1757. start, stop = 0, start
  1758. sign = [-1, 1][step > 0] if step else 0
  1759. while sign * start < sign * stop:
  1760. yield start
  1761. start += step
  1762. class LazyList(collections.abc.Sequence):
  1763. """Lazy immutable list from an iterable
  1764. Note that slices of a LazyList are lists and not LazyList"""
  1765. class IndexError(IndexError): # noqa: A001
  1766. pass
  1767. def __init__(self, iterable, *, reverse=False, _cache=None):
  1768. self._iterable = iter(iterable)
  1769. self._cache = [] if _cache is None else _cache
  1770. self._reversed = reverse
  1771. def __iter__(self):
  1772. if self._reversed:
  1773. # We need to consume the entire iterable to iterate in reverse
  1774. yield from self.exhaust()
  1775. return
  1776. yield from self._cache
  1777. for item in self._iterable:
  1778. self._cache.append(item)
  1779. yield item
  1780. def _exhaust(self):
  1781. self._cache.extend(self._iterable)
  1782. self._iterable = [] # Discard the emptied iterable to make it pickle-able
  1783. return self._cache
  1784. def exhaust(self):
  1785. """Evaluate the entire iterable"""
  1786. return self._exhaust()[::-1 if self._reversed else 1]
  1787. @staticmethod
  1788. def _reverse_index(x):
  1789. return None if x is None else ~x
  1790. def __getitem__(self, idx):
  1791. if isinstance(idx, slice):
  1792. if self._reversed:
  1793. idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
  1794. start, stop, step = idx.start, idx.stop, idx.step or 1
  1795. elif isinstance(idx, int):
  1796. if self._reversed:
  1797. idx = self._reverse_index(idx)
  1798. start, stop, step = idx, idx, 0
  1799. else:
  1800. raise TypeError('indices must be integers or slices')
  1801. if ((start or 0) < 0 or (stop or 0) < 0
  1802. or (start is None and step < 0)
  1803. or (stop is None and step > 0)):
  1804. # We need to consume the entire iterable to be able to slice from the end
  1805. # Obviously, never use this with infinite iterables
  1806. self._exhaust()
  1807. try:
  1808. return self._cache[idx]
  1809. except IndexError as e:
  1810. raise self.IndexError(e) from e
  1811. n = max(start or 0, stop or 0) - len(self._cache) + 1
  1812. if n > 0:
  1813. self._cache.extend(itertools.islice(self._iterable, n))
  1814. try:
  1815. return self._cache[idx]
  1816. except IndexError as e:
  1817. raise self.IndexError(e) from e
  1818. def __bool__(self):
  1819. try:
  1820. self[-1] if self._reversed else self[0]
  1821. except self.IndexError:
  1822. return False
  1823. return True
  1824. def __len__(self):
  1825. self._exhaust()
  1826. return len(self._cache)
  1827. def __reversed__(self):
  1828. return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
  1829. def __copy__(self):
  1830. return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
  1831. def __repr__(self):
  1832. # repr and str should mimic a list. So we exhaust the iterable
  1833. return repr(self.exhaust())
  1834. def __str__(self):
  1835. return repr(self.exhaust())
  1836. class PagedList:
  1837. class IndexError(IndexError): # noqa: A001
  1838. pass
  1839. def __len__(self):
  1840. # This is only useful for tests
  1841. return len(self.getslice())
  1842. def __init__(self, pagefunc, pagesize, use_cache=True):
  1843. self._pagefunc = pagefunc
  1844. self._pagesize = pagesize
  1845. self._pagecount = float('inf')
  1846. self._use_cache = use_cache
  1847. self._cache = {}
  1848. def getpage(self, pagenum):
  1849. page_results = self._cache.get(pagenum)
  1850. if page_results is None:
  1851. page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
  1852. if self._use_cache:
  1853. self._cache[pagenum] = page_results
  1854. return page_results
  1855. def getslice(self, start=0, end=None):
  1856. return list(self._getslice(start, end))
  1857. def _getslice(self, start, end):
  1858. raise NotImplementedError('This method must be implemented by subclasses')
  1859. def __getitem__(self, idx):
  1860. assert self._use_cache, 'Indexing PagedList requires cache'
  1861. if not isinstance(idx, int) or idx < 0:
  1862. raise TypeError('indices must be non-negative integers')
  1863. entries = self.getslice(idx, idx + 1)
  1864. if not entries:
  1865. raise self.IndexError
  1866. return entries[0]
  1867. def __bool__(self):
  1868. return bool(self.getslice(0, 1))
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        """Yield the items in [start, end), fetching pages one after another.

        Iteration begins at the page containing `start` and stops after a
        page that is shorter than the page size, or once `end` is reached.
        If fetching a page raises, the page before it is recorded as the
        last one and the exception is re-raised.
        """
        for pagenum in itertools.count(start // self._pagesize):
            # Item indices covered by this page: [firstid, nextfirstid)
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset of the first wanted item inside this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset just past the last wanted item, or None to take the rest of the page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # getpage() returns [] for page numbers above _pagecount
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
  1903. class InAdvancePagedList(PagedList):
  1904. """PagedList with total number of pages known in advance"""
  1905. def __init__(self, pagefunc, pagecount, pagesize):
  1906. PagedList.__init__(self, pagefunc, pagesize, True)
  1907. self._pagecount = pagecount
  1908. def _getslice(self, start, end):
  1909. start_page = start // self._pagesize
  1910. end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
  1911. skip_elems = start - start_page * self._pagesize
  1912. only_more = None if end is None else end - start
  1913. for pagenum in range(start_page, end_page):
  1914. page_results = self.getpage(pagenum)
  1915. if skip_elems:
  1916. page_results = page_results[skip_elems:]
  1917. skip_elems = None
  1918. if only_more is not None:
  1919. if len(page_results) < only_more:
  1920. only_more -= len(page_results)
  1921. else:
  1922. yield from page_results[:only_more]
  1923. break
  1924. yield from page_results
class PlaylistEntries:
    """Access to the entries of a playlist info dict by 1-based playlist index.

    Indexing or slicing an instance returns a generator of
    (playlist_index, entry) pairs, see __getitem__.
    """
    # Placeholder stored at positions that were not among 'requested_entries'
    MissingEntry = object()
    # Set once the underlying entries are known to be fully available/consumed
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        object providing `params`, `report_warning` and
                          `_match_entry` (used by the other methods)
        @param info_dict  dict with 'entries' and optionally 'requested_entries'
        Raises EntryNotInPlaylist when the info dict has no 'entries'.
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Rebuild a full-length list: each given entry goes to the
            # (1-based) position named in requested_entries, the rest stays MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # One comma-separated segment: "START", or "START:END[:STEP]" / "START-END[:STEP]"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int (single index) or a slice (range) for every
        comma-separated segment of `string`.

        Raises ValueError for an empty segment, a segment that does not match
        PLAYLIST_ITEMS_RE, or a step of zero.
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # The end is converted with float_or_none, unlike start and step
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) for the items selected by the
        'playlist_items', 'playliststart' and 'playlistend' params.

        Stops early when ydl._match_entry raises ExistingVideoReached or
        RejectedVideoReached for an entry that has already been yielded.
        """
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries when it is known: len(self)
        for an exhausted, complete list, or the page count of an
        InAdvancePagedList with one item per page. Otherwise returns None."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function get_entry(i) returning the entry at 0-based index `i`.

        It raises self.IndexError when `i` is past the end, and (for list
        entries) EntryNotInPlaylist when the position holds MissingEntry.
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    # An incomplete list reports the missing entry below instead
                    if not self.is_incomplete:
                        raise self.IndexError
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # The lookup is wrapped with the ydl class' _handle_extraction_exceptions
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError
        return get_entry

    def __getitem__(self, idx):
        """Generator of (playlist_index, entry) for an int or slice `idx`.

        Indices are 1-based and a slice includes its stop index; negative
        values count from the end, which requires len(self).
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move the bound one step further so that the stop index itself is included
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forwards nothing can follow; going backwards keep
                # walking down towards indices that exist
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts by iterating over every entry
        return len(tuple(self[:]))

    class IndexError(IndexError):  # noqa: A001
        pass
  2044. def uppercase_escape(s):
  2045. unicode_escape = codecs.getdecoder('unicode_escape')
  2046. return re.sub(
  2047. r'\\U[0-9a-fA-F]{8}',
  2048. lambda m: unicode_escape(m.group(0))[0],
  2049. s)
  2050. def lowercase_escape(s):
  2051. unicode_escape = codecs.getdecoder('unicode_escape')
  2052. return re.sub(
  2053. r'\\u[0-9a-fA-F]{4}',
  2054. lambda m: unicode_escape(m.group(0))[0],
  2055. s)
  2056. def parse_qs(url, **kwargs):
  2057. return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
  2058. def read_batch_urls(batch_fd):
  2059. def fixup(url):
  2060. if not isinstance(url, str):
  2061. url = url.decode('utf-8', 'replace')
  2062. BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
  2063. for bom in BOM_UTF8:
  2064. if url.startswith(bom):
  2065. url = url[len(bom):]
  2066. url = url.lstrip()
  2067. if not url or url.startswith(('#', ';', ']')):
  2068. return False
  2069. # "#" cannot be stripped out since it is part of the URI
  2070. # However, it can be safely stripped out if following a whitespace
  2071. return re.split(r'\s#', url, maxsplit=1)[0].rstrip()
  2072. with contextlib.closing(batch_fd) as fd:
  2073. return [url for url in map(fixup, fd) if url]
  2074. def urlencode_postdata(*args, **kargs):
  2075. return urllib.parse.urlencode(*args, **kargs).encode('ascii')
  2076. @partial_application
  2077. def update_url(url, *, query_update=None, **kwargs):
  2078. """Replace URL components specified by kwargs
  2079. @param url str or parse url tuple
  2080. @param query_update update query
  2081. @returns str
  2082. """
  2083. if isinstance(url, str):
  2084. if not kwargs and not query_update:
  2085. return url
  2086. else:
  2087. url = urllib.parse.urlparse(url)
  2088. if query_update:
  2089. assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
  2090. kwargs['query'] = urllib.parse.urlencode({
  2091. **urllib.parse.parse_qs(url.query),
  2092. **query_update,
  2093. }, True)
  2094. return urllib.parse.urlunparse(url._replace(**kwargs))
  2095. @partial_application
  2096. def update_url_query(url, query):
  2097. return update_url(url, query_update=query)
  2098. def _multipart_encode_impl(data, boundary):
  2099. content_type = f'multipart/form-data; boundary={boundary}'
  2100. out = b''
  2101. for k, v in data.items():
  2102. out += b'--' + boundary.encode('ascii') + b'\r\n'
  2103. if isinstance(k, str):
  2104. k = k.encode()
  2105. if isinstance(v, str):
  2106. v = v.encode()
  2107. # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
  2108. # suggests sending UTF-8 directly. Firefox sends UTF-8, too
  2109. content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
  2110. if boundary.encode('ascii') in content:
  2111. raise ValueError('Boundary overlaps with data')
  2112. out += content
  2113. out += b'--' + boundary.encode('ascii') + b'--\r\n'
  2114. return out, content_type
  2115. def multipart_encode(data, boundary=None):
  2116. """
  2117. Encode a dict to RFC 7578-compliant form-data
  2118. data:
  2119. A dict where keys and values can be either Unicode or bytes-like
  2120. objects.
  2121. boundary:
  2122. If specified a Unicode object, it's used as the boundary. Otherwise
  2123. a random boundary is generated.
  2124. Reference: https://tools.ietf.org/html/rfc7578
  2125. """
  2126. has_specified_boundary = boundary is not None
  2127. while True:
  2128. if boundary is None:
  2129. boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
  2130. try:
  2131. out, content_type = _multipart_encode_impl(data, boundary)
  2132. break
  2133. except ValueError:
  2134. if has_specified_boundary:
  2135. raise
  2136. boundary = None
  2137. return out, content_type
  2138. def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
  2139. if blocked_types is NO_DEFAULT:
  2140. blocked_types = (str, bytes, collections.abc.Mapping)
  2141. return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
  2142. def variadic(x, allowed_types=NO_DEFAULT):
  2143. if not isinstance(allowed_types, (tuple, type)):
  2144. deprecation_warning('allowed_types should be a tuple or a type')
  2145. allowed_types = tuple(allowed_types)
  2146. return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
  2147. def try_call(*funcs, expected_type=None, args=[], kwargs={}):
  2148. for f in funcs:
  2149. try:
  2150. val = f(*args, **kwargs)
  2151. except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
  2152. pass
  2153. else:
  2154. if expected_type is None or isinstance(val, expected_type):
  2155. return val
  2156. def try_get(src, getter, expected_type=None):
  2157. return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
  2158. def filter_dict(dct, cndn=lambda _, v: v is not None):
  2159. return {k: v for k, v in dct.items() if cndn(k, v)}
  2160. def merge_dicts(*dicts):
  2161. merged = {}
  2162. for a_dict in dicts:
  2163. for k, v in a_dict.items():
  2164. if ((v is not None and k not in merged)
  2165. or (isinstance(v, str) and merged[k] == '')):
  2166. merged[k] = v
  2167. return merged
  2168. def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
  2169. return string if isinstance(string, str) else str(string, encoding, errors)
# Age limit for each US film rating label.
# parse_age_limit() looks the upper-cased input up in this table.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
  2178. 'TV-Y': 0,
  2179. 'TV-Y7': 7,
  2180. 'TV-G': 0,
  2181. 'TV-PG': 0,
  2182. 'TV-14': 14,
  2183. 'TV-MA': 17,
  2184. }
  2185. def parse_age_limit(s):
  2186. # isinstance(False, int) is True. So type() must be used instead
  2187. if type(s) is int: # noqa: E721
  2188. return s if 0 <= s <= 21 else None
  2189. elif not isinstance(s, str):
  2190. return None
  2191. m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
  2192. if m:
  2193. return int(m.group('age'))
  2194. s = s.upper()
  2195. if s in US_RATINGS:
  2196. return US_RATINGS[s]
  2197. m = re.match(r'^TV[_-]?({})$'.format('|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)), s)
  2198. if m:
  2199. return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
  2200. return None
  2201. def strip_jsonp(code):
  2202. return re.sub(
  2203. r'''(?sx)^
  2204. (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
  2205. (?:\s*&&\s*(?P=func_name))?
  2206. \s*\(\s*(?P<callback_data>.*)\);?
  2207. \s*?(?://[^\n]*)*$''',
  2208. r'\g<callback_data>', code)
def js_to_json(code, vars={}, *, strict=False):
    """Rewrite JavaScript literal syntax in `code` into JSON text.

    @param vars    dict of var, val pairs to substitute; a value that does
                   not parse as JSON is JSON-encoded first (unless strict)
    @param strict  raise ValueError for an unknown identifier instead of
                   turning it into a string, and skip the rewrites of
                   "new ...", parseInt(...) and immediately-invoked functions
    @returns       str
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) for hexadecimal and octal integers, optionally used as an object key
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # group(1) is a bare double quote, group(2) the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Evaluate the expression inside ${...} of a template string
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Converts one token matched by the final pattern below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith(('/*', '//', '!')) or v == ',':
            # Comments, "!" runs and trailing commas are dropped
            return ''

        if v[0] in STRING_QUOTES:
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            # Anything else is emitted as a JSON string
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # new Map([[k, v], ...]) becomes a JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    # Tokens handed to fix_kv: strings, comments, trailing commas, identifiers
    # (including "void 0"), hex/octal integers, integer object keys, "!" runs
    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
  2276. def qualities(quality_ids):
  2277. """ Get a numeric quality value out of a list of possible values """
  2278. def q(qid):
  2279. try:
  2280. return quality_ids.index(qid)
  2281. except ValueError:
  2282. return -1
  2283. return q
# Stage names - presumably the points at which a postprocessor can be
# scheduled to run; verify against the users of this tuple
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
  2286. 'default': '%(title)s [%(id)s].%(ext)s',
  2287. 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
  2288. }
# Output template types, each mapped to a string or None.
# NOTE(review): the strings look like file name suffixes for that kind of
# file and None like "no fixed suffix" - confirm against the users of this table
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# {0} and {1} are placeholders for the pattern of the mapping key and of the
# conversion type - presumably filled in with str.format() by the users of
# this template; verify against callers
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''
# Conversion type characters of printf-style ("%") string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
  2318. def limit_length(s, length):
  2319. """ Add ellipses to overly long strings """
  2320. if s is None:
  2321. return None
  2322. ELLIPSES = '...'
  2323. if len(s) > length:
  2324. return s[:length - len(ELLIPSES)] + ELLIPSES
  2325. return s
  2326. def version_tuple(v):
  2327. return tuple(int(e) for e in re.split(r'[-.]', v))
  2328. def is_outdated_version(version, limit, assume_new=True):
  2329. if not version:
  2330. return not assume_new
  2331. try:
  2332. return version_tuple(version) < version_tuple(limit)
  2333. except ValueError:
  2334. return not assume_new
  2335. def ytdl_is_updateable():
  2336. """ Returns if yt-dlp can be updated with -U """
  2337. from ..update import is_non_updateable
  2338. return not is_non_updateable()
  2339. def args_to_str(args):
  2340. # Get a short string representation for a subprocess command
  2341. return shell_quote(args)
  2342. def error_to_str(err):
  2343. return f'{type(err).__name__}: {err}'
  2344. @partial_application
  2345. def mimetype2ext(mt, default=NO_DEFAULT):
  2346. if not isinstance(mt, str):
  2347. if default is not NO_DEFAULT:
  2348. return default
  2349. return None
  2350. MAP = {
  2351. # video
  2352. '3gpp': '3gp',
  2353. 'mp2t': 'ts',
  2354. 'mp4': 'mp4',
  2355. 'mpeg': 'mpeg',
  2356. 'mpegurl': 'm3u8',
  2357. 'quicktime': 'mov',
  2358. 'webm': 'webm',
  2359. 'vp9': 'vp9',
  2360. 'video/ogg': 'ogv',
  2361. 'x-flv': 'flv',
  2362. 'x-m4v': 'm4v',
  2363. 'x-matroska': 'mkv',
  2364. 'x-mng': 'mng',
  2365. 'x-mp4-fragmented': 'mp4',
  2366. 'x-ms-asf': 'asf',
  2367. 'x-ms-wmv': 'wmv',
  2368. 'x-msvideo': 'avi',
  2369. # application (streaming playlists)
  2370. 'dash+xml': 'mpd',
  2371. 'f4m+xml': 'f4m',
  2372. 'hds+xml': 'f4m',
  2373. 'vnd.apple.mpegurl': 'm3u8',
  2374. 'vnd.ms-sstr+xml': 'ism',
  2375. 'x-mpegurl': 'm3u8',
  2376. # audio
  2377. 'audio/mp4': 'm4a',
  2378. # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
  2379. # Using .mp3 as it's the most popular one
  2380. 'audio/mpeg': 'mp3',
  2381. 'audio/webm': 'webm',
  2382. 'audio/x-matroska': 'mka',
  2383. 'audio/x-mpegurl': 'm3u',
  2384. 'aacp': 'aac',
  2385. 'midi': 'mid',
  2386. 'ogg': 'ogg',
  2387. 'wav': 'wav',
  2388. 'wave': 'wav',
  2389. 'x-aac': 'aac',
  2390. 'x-flac': 'flac',
  2391. 'x-m4a': 'm4a',
  2392. 'x-realaudio': 'ra',
  2393. 'x-wav': 'wav',
  2394. # image
  2395. 'avif': 'avif',
  2396. 'bmp': 'bmp',
  2397. 'gif': 'gif',
  2398. 'jpeg': 'jpg',
  2399. 'png': 'png',
  2400. 'svg+xml': 'svg',
  2401. 'tiff': 'tif',
  2402. 'vnd.wap.wbmp': 'wbmp',
  2403. 'webp': 'webp',
  2404. 'x-icon': 'ico',
  2405. 'x-jng': 'jng',
  2406. 'x-ms-bmp': 'bmp',
  2407. # caption
  2408. 'filmstrip+json': 'fs',
  2409. 'smptett+xml': 'tt',
  2410. 'ttaf+xml': 'dfxp',
  2411. 'ttml+xml': 'ttml',
  2412. 'x-ms-sami': 'sami',
  2413. # misc
  2414. 'gzip': 'gz',
  2415. 'json': 'json',
  2416. 'xml': 'xml',
  2417. 'zip': 'zip',
  2418. }
  2419. mimetype = mt.partition(';')[0].strip().lower()
  2420. _, _, subtype = mimetype.rpartition('/')
  2421. ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
  2422. if ext:
  2423. return ext
  2424. elif default is not NO_DEFAULT:
  2425. return default
  2426. return subtype.replace('+', '.')
  2427. def ext2mimetype(ext_or_url):
  2428. if not ext_or_url:
  2429. return None
  2430. if '.' not in ext_or_url:
  2431. ext_or_url = f'file.{ext_or_url}'
  2432. return mimetypes.guess_type(ext_or_url)[0]
  2433. def parse_codecs(codecs_str):
  2434. # http://tools.ietf.org/html/rfc6381
  2435. if not codecs_str:
  2436. return {}
  2437. split_codecs = list(filter(None, map(
  2438. str.strip, codecs_str.strip().strip(',').split(','))))
  2439. vcodec, acodec, scodec, hdr = None, None, None, None
  2440. for full_codec in split_codecs:
  2441. full_codec = re.sub(r'^([^.]+)', lambda m: m.group(1).lower(), full_codec)
  2442. parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
  2443. if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
  2444. 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
  2445. if vcodec:
  2446. continue
  2447. vcodec = full_codec
  2448. if parts[0] in ('dvh1', 'dvhe'):
  2449. hdr = 'DV'
  2450. elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
  2451. hdr = 'HDR10'
  2452. elif parts[:2] == ['vp9', '2']:
  2453. hdr = 'HDR10'
  2454. elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
  2455. 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
  2456. acodec = acodec or full_codec
  2457. elif parts[0] in ('stpp', 'wvtt'):
  2458. scodec = scodec or full_codec
  2459. else:
  2460. write_string(f'WARNING: Unknown codec {full_codec}\n')
  2461. if vcodec or acodec or scodec:
  2462. return {
  2463. 'vcodec': vcodec or 'none',
  2464. 'acodec': acodec or 'none',
  2465. 'dynamic_range': hdr,
  2466. **({'scodec': scodec} if scodec is not None else {}),
  2467. }
  2468. elif len(split_codecs) == 2:
  2469. return {
  2470. 'vcodec': split_codecs[0],
  2471. 'acodec': split_codecs[1],
  2472. }
  2473. return {}
  2474. def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
  2475. assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
  2476. allow_mkv = not preferences or 'mkv' in preferences
  2477. if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
  2478. return 'mkv' # TODO: any other format allows this?
  2479. # TODO: All codecs supported by parse_codecs isn't handled here
  2481. 'mp4': {
  2482. 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
  2483. 'h264', 'aacl', 'ec-3', # Set in ISM
  2484. },
  2485. 'webm': {
  2486. 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
  2487. 'vp9x', 'vp8x', # in the webm spec
  2488. },
  2489. }
  2490. sanitize_codec = functools.partial(
  2491. try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
  2492. vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
  2493. for ext in preferences or COMPATIBLE_CODECS.keys():
  2494. codec_set = COMPATIBLE_CODECS.get(ext, set())
  2495. if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
  2496. return ext
  2498. {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
  2499. {'webm', 'weba'},
  2500. )
  2501. for ext in preferences or vexts:
  2502. current_exts = {ext, *vexts, *aexts}
  2503. if ext == 'mkv' or current_exts == {ext} or any(
  2504. ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
  2505. return ext
  2506. return 'mkv' if allow_mkv else preferences[-1]
  2507. def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
  2508. getheader = url_handle.headers.get
  2509. cd = getheader('Content-Disposition')
  2510. if cd:
  2511. m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
  2512. if m:
  2513. e = determine_ext(m.group('filename'), default_ext=None)
  2514. if e:
  2515. return e
  2516. meta_ext = getheader('x-amz-meta-name')
  2517. if meta_ext:
  2518. e = meta_ext.rpartition('.')[2]
  2519. if e:
  2520. return e
  2521. return mimetype2ext(getheader('Content-Type'), default=default)
  2522. def encode_data_uri(data, mime_type):
  2523. return 'data:{};base64,{}'.format(mime_type, base64.b64encode(data).decode('ascii'))
  2524. def age_restricted(content_limit, age_limit):
  2525. """ Returns True iff the content should be blocked """
  2526. if age_limit is None: # No limit set
  2527. return False
  2528. if content_limit is None:
  2529. return False # Content available for everyone
  2530. return age_limit < content_limit
# List of known byte-order-marks (BOM), as (marker bytes, codec name) pairs.
# The UTF-32-LE mark comes before the UTF-16-LE mark, which is a prefix of it
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
  2539. def is_html(first_bytes):
  2540. """ Detect whether a file contains HTML by examining its first bytes. """
  2541. encoding = 'utf-8'
  2542. for bom, enc in BOMS:
  2543. while first_bytes.startswith(bom):
  2544. encoding, first_bytes = enc, first_bytes[len(bom):]
  2545. return re.match(r'\s*<', first_bytes.decode(encoding, 'replace'))
  2546. def determine_protocol(info_dict):
  2547. protocol = info_dict.get('protocol')
  2548. if protocol is not None:
  2549. return protocol
  2550. url = sanitize_url(info_dict['url'])
  2551. if url.startswith('rtmp'):
  2552. return 'rtmp'
  2553. elif url.startswith('mms'):
  2554. return 'mms'
  2555. elif url.startswith('rtsp'):
  2556. return 'rtsp'
  2557. ext = determine_ext(url)
  2558. if ext == 'm3u8':
  2559. return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
  2560. elif ext == 'f4m':
  2561. return 'f4m'
  2562. return urllib.parse.urlparse(url).scheme
  2563. def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
  2564. """ Render a list of rows, each as a list of values.
  2565. Text after a \t will be right aligned """
  2566. def width(string):
  2567. return len(remove_terminal_sequences(string).replace('\t', ''))
  2568. def get_max_lens(table):
  2569. return [max(width(str(v)) for v in col) for col in zip(*table)]
  2570. def filter_using_list(row, filter_array):
  2571. return [col for take, col in itertools.zip_longest(filter_array, row, fillvalue=True) if take]
  2572. max_lens = get_max_lens(data) if hide_empty else []
  2573. header_row = filter_using_list(header_row, max_lens)
  2574. data = [filter_using_list(row, max_lens) for row in data]
  2575. table = [header_row, *data]
  2576. max_lens = get_max_lens(table)
  2577. extra_gap += 1
  2578. if delim:
  2579. table = [header_row, [delim * (ml + extra_gap) for ml in max_lens], *data]
  2580. table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
  2581. for row in table:
  2582. for pos, text in enumerate(map(str, row)):
  2583. if '\t' in text:
  2584. row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
  2585. else:
  2586. row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
  2587. return '\n'.join(''.join(row).rstrip() for row in table)
  2588. def _match_one(filter_part, dct, incomplete):
  2589. # TODO: Generalize code with YoutubeDL._build_format_filter
  2591. '*=': operator.contains,
  2592. '^=': lambda attr, value: attr.startswith(value),
  2593. '$=': lambda attr, value: attr.endswith(value),
  2594. '~=': lambda attr, value: re.search(value, attr),
  2595. }
  2598. '<=': operator.le, # "<=" must be defined above "<"
  2599. '<': operator.lt,
  2600. '>=': operator.ge,
  2601. '>': operator.gt,
  2602. '=': operator.eq,
  2603. }
  2604. if isinstance(incomplete, bool):
  2605. is_incomplete = lambda _: incomplete
  2606. else:
  2607. is_incomplete = lambda k: k in incomplete
  2608. operator_rex = re.compile(r'''(?x)
  2609. (?P<key>[a-z_]+)
  2610. \s*(?P<negation>!\s*)?(?P<op>{})(?P<none_inclusive>\s*\?)?\s*
  2611. (?:
  2612. (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
  2613. (?P<strval>.+?)
  2614. )
  2615. '''.format('|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))))
  2616. m = operator_rex.fullmatch(filter_part.strip())
  2617. if m:
  2618. m = m.groupdict()
  2619. unnegated_op = COMPARISON_OPERATORS[m['op']]
  2620. if m['negation']:
  2621. op = lambda attr, value: not unnegated_op(attr, value)
  2622. else:
  2623. op = unnegated_op
  2624. comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
  2625. if m['quote']:
  2626. comparison_value = comparison_value.replace(r'\{}'.format(m['quote']), m['quote'])
  2627. actual_value = dct.get(m['key'])
  2628. numeric_comparison = None
  2629. if isinstance(actual_value, (int, float)):
  2630. # If the original field is a string and matching comparisonvalue is
  2631. # a number we should respect the origin of the original field
  2632. # and process comparison value as a string (see
  2633. # https://github.com/ytdl-org/youtube-dl/issues/11082)
  2634. try:
  2635. numeric_comparison = int(comparison_value)
  2636. except ValueError:
  2637. numeric_comparison = parse_filesize(comparison_value)
  2638. if numeric_comparison is None:
  2639. numeric_comparison = parse_filesize(f'{comparison_value}B')
  2640. if numeric_comparison is None:
  2641. numeric_comparison = parse_duration(comparison_value)
  2642. if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
  2643. raise ValueError('Operator {} only supports string values!'.format(m['op']))
  2644. if actual_value is None:
  2645. return is_incomplete(m['key']) or m['none_inclusive']
  2646. return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
  2648. '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
  2649. '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
  2650. }
  2651. operator_rex = re.compile(r'''(?x)
  2652. (?P<op>{})\s*(?P<key>[a-z_]+)
  2653. '''.format('|'.join(map(re.escape, UNARY_OPERATORS.keys()))))
  2654. m = operator_rex.fullmatch(filter_part.strip())
  2655. if m:
  2656. op = UNARY_OPERATORS[m.group('op')]
  2657. actual_value = dct.get(m.group('key'))
  2658. if is_incomplete(m.group('key')) and actual_value is None:
  2659. return True
  2660. return op(actual_value)
  2661. raise ValueError(f'Invalid filter part {filter_part!r}')
  2662. def match_str(filter_str, dct, incomplete=False):
  2663. """ Filter a dictionary with a simple string syntax.
  2664. @returns Whether the filter passes
  2665. @param incomplete Set of keys that is expected to be missing from dct.
  2666. Can be True/False to indicate all/none of the keys may be missing.
  2667. All conditions on incomplete keys pass if the key is missing
  2668. """
  2669. return all(
  2670. _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
  2671. for filter_part in re.split(r'(?<!\\)&', filter_str))
  2672. def match_filter_func(filters, breaking_filters=None):
  2673. if not filters and not breaking_filters:
  2674. return None
  2675. repr_ = f'{match_filter_func.__module__}.{match_filter_func.__qualname__}({filters}, {breaking_filters})'
  2676. breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
  2677. filters = set(variadic(filters or []))
  2678. interactive = '-' in filters
  2679. if interactive:
  2680. filters.remove('-')
  2681. @function_with_repr.set_repr(repr_)
  2682. def _match_func(info_dict, incomplete=False):
  2683. ret = breaking_filters(info_dict, incomplete)
  2684. if ret is not None:
  2685. raise RejectedVideoReached(ret)
  2686. if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
  2687. return NO_DEFAULT if interactive and not incomplete else None
  2688. else:
  2689. video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
  2690. filter_str = ') | ('.join(map(str.strip, filters))
  2691. return f'{video_title} does not pass filter ({filter_str}), skipping ..'
  2692. return _match_func
  2693. class download_range_func:
  2694. def __init__(self, chapters, ranges, from_info=False):
  2695. self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
  2696. def __call__(self, info_dict, ydl):
  2697. warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
  2698. else 'Cannot match chapters since chapter information is unavailable')
  2699. for regex in self.chapters or []:
  2700. for i, chapter in enumerate(info_dict.get('chapters') or []):
  2701. if re.search(regex, chapter['title']):
  2702. warning = None
  2703. yield {**chapter, 'index': i}
  2704. if self.chapters and warning:
  2705. ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
  2706. for start, end in self.ranges or []:
  2707. yield {
  2708. 'start_time': self._handle_negative_timestamp(start, info_dict),
  2709. 'end_time': self._handle_negative_timestamp(end, info_dict),
  2710. }
  2711. if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
  2712. yield {
  2713. 'start_time': info_dict.get('start_time') or 0,
  2714. 'end_time': info_dict.get('end_time') or float('inf'),
  2715. }
  2716. elif not self.ranges and not self.chapters:
  2717. yield {}
  2718. @staticmethod
  2719. def _handle_negative_timestamp(time, info):
  2720. return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time
  2721. def __eq__(self, other):
  2722. return (isinstance(other, download_range_func)
  2723. and self.chapters == other.chapters and self.ranges == other.ranges)
  2724. def __repr__(self):
  2725. return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
  2726. def parse_dfxp_time_expr(time_expr):
  2727. if not time_expr:
  2728. return
  2729. mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
  2730. if mobj:
  2731. return float(mobj.group('time_offset'))
  2732. mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
  2733. if mobj:
  2734. return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
  2735. def srt_subtitles_timecode(seconds):
  2736. return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
  2737. def ass_subtitles_timecode(seconds):
  2738. time = timetuple_from_msec(seconds * 1000)
  2739. return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
  2740. def dfxp2srt(dfxp_data):
  2741. """
  2742. @param dfxp_data A bytes-like object containing DFXP data
  2743. @returns A unicode object containing converted SRT data
  2744. """
  2746. (b'http://www.w3.org/ns/ttml', [
  2747. b'http://www.w3.org/2004/11/ttaf1',
  2748. b'http://www.w3.org/2006/04/ttaf1',
  2749. b'http://www.w3.org/2006/10/ttaf1',
  2750. ]),
  2751. (b'http://www.w3.org/ns/ttml#styling', [
  2752. b'http://www.w3.org/ns/ttml#style',
  2753. ]),
  2754. )
  2756. 'color',
  2757. 'fontFamily',
  2758. 'fontSize',
  2759. 'fontStyle',
  2760. 'fontWeight',
  2761. 'textDecoration',
  2762. ]
  2763. _x = functools.partial(xpath_with_ns, ns_map={
  2764. 'xml': 'http://www.w3.org/XML/1998/namespace',
  2765. 'ttml': 'http://www.w3.org/ns/ttml',
  2766. 'tts': 'http://www.w3.org/ns/ttml#styling',
  2767. })
  2768. styles = {}
  2769. default_style = {}
  2770. class TTMLPElementParser:
  2771. _out = ''
  2772. _unclosed_elements = []
  2773. _applied_styles = []
  2774. def start(self, tag, attrib):
  2775. if tag in (_x('ttml:br'), 'br'):
  2776. self._out += '\n'
  2777. else:
  2778. unclosed_elements = []
  2779. style = {}
  2780. element_style_id = attrib.get('style')
  2781. if default_style:
  2782. style.update(default_style)
  2783. if element_style_id:
  2784. style.update(styles.get(element_style_id, {}))
  2785. for prop in SUPPORTED_STYLING:
  2786. prop_val = attrib.get(_x('tts:' + prop))
  2787. if prop_val:
  2788. style[prop] = prop_val
  2789. if style:
  2790. font = ''
  2791. for k, v in sorted(style.items()):
  2792. if self._applied_styles and self._applied_styles[-1].get(k) == v:
  2793. continue
  2794. if k == 'color':
  2795. font += f' color="{v}"'
  2796. elif k == 'fontSize':
  2797. font += f' size="{v}"'
  2798. elif k == 'fontFamily':
  2799. font += f' face="{v}"'
  2800. elif k == 'fontWeight' and v == 'bold':
  2801. self._out += '<b>'
  2802. unclosed_elements.append('b')
  2803. elif k == 'fontStyle' and v == 'italic':
  2804. self._out += '<i>'
  2805. unclosed_elements.append('i')
  2806. elif k == 'textDecoration' and v == 'underline':
  2807. self._out += '<u>'
  2808. unclosed_elements.append('u')
  2809. if font:
  2810. self._out += '<font' + font + '>'
  2811. unclosed_elements.append('font')
  2812. applied_style = {}
  2813. if self._applied_styles:
  2814. applied_style.update(self._applied_styles[-1])
  2815. applied_style.update(style)
  2816. self._applied_styles.append(applied_style)
  2817. self._unclosed_elements.append(unclosed_elements)
  2818. def end(self, tag):
  2819. if tag not in (_x('ttml:br'), 'br'):
  2820. unclosed_elements = self._unclosed_elements.pop()
  2821. for element in reversed(unclosed_elements):
  2822. self._out += f'</{element}>'
  2823. if unclosed_elements and self._applied_styles:
  2824. self._applied_styles.pop()
  2825. def data(self, data):
  2826. self._out += data
  2827. def close(self):
  2828. return self._out.strip()
  2829. # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
  2830. # This will not trigger false positives since only UTF-8 text is being replaced
  2831. dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
  2832. def parse_node(node):
  2833. target = TTMLPElementParser()
  2834. parser = xml.etree.ElementTree.XMLParser(target=target)
  2835. parser.feed(xml.etree.ElementTree.tostring(node))
  2836. return parser.close()
  2837. for k, v in LEGACY_NAMESPACES:
  2838. for ns in v:
  2839. dfxp_data = dfxp_data.replace(ns, k)
  2840. dfxp = compat_etree_fromstring(dfxp_data)
  2841. out = []
  2842. paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
  2843. if not paras:
  2844. raise ValueError('Invalid dfxp/TTML subtitle')
  2845. repeat = False
  2846. while True:
  2847. for style in dfxp.findall(_x('.//ttml:style')):
  2848. style_id = style.get('id') or style.get(_x('xml:id'))
  2849. if not style_id:
  2850. continue
  2851. parent_style_id = style.get('style')
  2852. if parent_style_id:
  2853. if parent_style_id not in styles:
  2854. repeat = True
  2855. continue
  2856. styles[style_id] = styles[parent_style_id].copy()
  2857. for prop in SUPPORTED_STYLING:
  2858. prop_val = style.get(_x('tts:' + prop))
  2859. if prop_val:
  2860. styles.setdefault(style_id, {})[prop] = prop_val
  2861. if repeat:
  2862. repeat = False
  2863. else:
  2864. break
  2865. for p in ('body', 'div'):
  2866. ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
  2867. if ele is None:
  2868. continue
  2869. style = styles.get(ele.get('style'))
  2870. if not style:
  2871. continue
  2872. default_style.update(style)
  2873. for para, index in zip(paras, itertools.count(1)):
  2874. begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
  2875. end_time = parse_dfxp_time_expr(para.attrib.get('end'))
  2876. dur = parse_dfxp_time_expr(para.attrib.get('dur'))
  2877. if begin_time is None:
  2878. continue
  2879. if not end_time:
  2880. if not dur:
  2881. continue
  2882. end_time = begin_time + dur
  2883. out.append('%d\n%s --> %s\n%s\n\n' % (
  2884. index,
  2885. srt_subtitles_timecode(begin_time),
  2886. srt_subtitles_timecode(end_time),
  2887. parse_node(para)))
  2888. return ''.join(out)
  2889. def cli_option(params, command_option, param, separator=None):
  2890. param = params.get(param)
  2891. return ([] if param is None
  2892. else [command_option, str(param)] if separator is None
  2893. else [f'{command_option}{separator}{param}'])
  2894. def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
  2895. param = params.get(param)
  2896. assert param in (True, False, None)
  2897. return cli_option({True: true_value, False: false_value}, command_option, param, separator)
  2898. def cli_valueless_option(params, command_option, param, expected_value=True):
  2899. return [command_option] if params.get(param) == expected_value else []
  2900. def cli_configuration_args(argdict, keys, default=[], use_compat=True):
  2901. if isinstance(argdict, (list, tuple)): # for backward compatibility
  2902. if use_compat:
  2903. return argdict
  2904. else:
  2905. argdict = None
  2906. if argdict is None:
  2907. return default
  2908. assert isinstance(argdict, dict)
  2909. assert isinstance(keys, (list, tuple))
  2910. for key_list in keys:
  2911. arg_list = list(filter(
  2912. lambda x: x is not None,
  2913. [argdict.get(key.lower()) for key in variadic(key_list)]))
  2914. if arg_list:
  2915. return [arg for args in arg_list for arg in args]
  2916. return default
  2917. def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
  2918. main_key, exe = main_key.lower(), exe.lower()
  2919. root_key = exe if main_key == exe else f'{main_key}+{exe}'
  2920. keys = [f'{root_key}{k}' for k in (keys or [''])]
  2921. if root_key in keys:
  2922. if main_key != exe:
  2923. keys.append((main_key, exe))
  2924. keys.append('default')
  2925. else:
  2926. use_compat = False
  2927. return cli_configuration_args(argdict, keys, default, use_compat)
  2928. class ISO639Utils:
  2929. # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
  2930. _lang_map = {
  2931. 'aa': 'aar',
  2932. 'ab': 'abk',
  2933. 'ae': 'ave',
  2934. 'af': 'afr',
  2935. 'ak': 'aka',
  2936. 'am': 'amh',
  2937. 'an': 'arg',
  2938. 'ar': 'ara',
  2939. 'as': 'asm',
  2940. 'av': 'ava',
  2941. 'ay': 'aym',
  2942. 'az': 'aze',
  2943. 'ba': 'bak',
  2944. 'be': 'bel',
  2945. 'bg': 'bul',
  2946. 'bh': 'bih',
  2947. 'bi': 'bis',
  2948. 'bm': 'bam',
  2949. 'bn': 'ben',
  2950. 'bo': 'bod',
  2951. 'br': 'bre',
  2952. 'bs': 'bos',
  2953. 'ca': 'cat',
  2954. 'ce': 'che',
  2955. 'ch': 'cha',
  2956. 'co': 'cos',
  2957. 'cr': 'cre',
  2958. 'cs': 'ces',
  2959. 'cu': 'chu',
  2960. 'cv': 'chv',
  2961. 'cy': 'cym',
  2962. 'da': 'dan',
  2963. 'de': 'deu',
  2964. 'dv': 'div',
  2965. 'dz': 'dzo',
  2966. 'ee': 'ewe',
  2967. 'el': 'ell',
  2968. 'en': 'eng',
  2969. 'eo': 'epo',
  2970. 'es': 'spa',
  2971. 'et': 'est',
  2972. 'eu': 'eus',
  2973. 'fa': 'fas',
  2974. 'ff': 'ful',
  2975. 'fi': 'fin',
  2976. 'fj': 'fij',
  2977. 'fo': 'fao',
  2978. 'fr': 'fra',
  2979. 'fy': 'fry',
  2980. 'ga': 'gle',
  2981. 'gd': 'gla',
  2982. 'gl': 'glg',
  2983. 'gn': 'grn',
  2984. 'gu': 'guj',
  2985. 'gv': 'glv',
  2986. 'ha': 'hau',
  2987. 'he': 'heb',
  2988. 'iw': 'heb', # Replaced by he in 1989 revision
  2989. 'hi': 'hin',
  2990. 'ho': 'hmo',
  2991. 'hr': 'hrv',
  2992. 'ht': 'hat',
  2993. 'hu': 'hun',
  2994. 'hy': 'hye',
  2995. 'hz': 'her',
  2996. 'ia': 'ina',
  2997. 'id': 'ind',
  2998. 'in': 'ind', # Replaced by id in 1989 revision
  2999. 'ie': 'ile',
  3000. 'ig': 'ibo',
  3001. 'ii': 'iii',
  3002. 'ik': 'ipk',
  3003. 'io': 'ido',
  3004. 'is': 'isl',
  3005. 'it': 'ita',
  3006. 'iu': 'iku',
  3007. 'ja': 'jpn',
  3008. 'jv': 'jav',
  3009. 'ka': 'kat',
  3010. 'kg': 'kon',
  3011. 'ki': 'kik',
  3012. 'kj': 'kua',
  3013. 'kk': 'kaz',
  3014. 'kl': 'kal',
  3015. 'km': 'khm',
  3016. 'kn': 'kan',
  3017. 'ko': 'kor',
  3018. 'kr': 'kau',
  3019. 'ks': 'kas',
  3020. 'ku': 'kur',
  3021. 'kv': 'kom',
  3022. 'kw': 'cor',
  3023. 'ky': 'kir',
  3024. 'la': 'lat',
  3025. 'lb': 'ltz',
  3026. 'lg': 'lug',
  3027. 'li': 'lim',
  3028. 'ln': 'lin',
  3029. 'lo': 'lao',
  3030. 'lt': 'lit',
  3031. 'lu': 'lub',
  3032. 'lv': 'lav',
  3033. 'mg': 'mlg',
  3034. 'mh': 'mah',
  3035. 'mi': 'mri',
  3036. 'mk': 'mkd',
  3037. 'ml': 'mal',
  3038. 'mn': 'mon',
  3039. 'mr': 'mar',
  3040. 'ms': 'msa',
  3041. 'mt': 'mlt',
  3042. 'my': 'mya',
  3043. 'na': 'nau',
  3044. 'nb': 'nob',
  3045. 'nd': 'nde',
  3046. 'ne': 'nep',
  3047. 'ng': 'ndo',
  3048. 'nl': 'nld',
  3049. 'nn': 'nno',
  3050. 'no': 'nor',
  3051. 'nr': 'nbl',
  3052. 'nv': 'nav',
  3053. 'ny': 'nya',
  3054. 'oc': 'oci',
  3055. 'oj': 'oji',
  3056. 'om': 'orm',
  3057. 'or': 'ori',
  3058. 'os': 'oss',
  3059. 'pa': 'pan',
  3060. 'pe': 'per',
  3061. 'pi': 'pli',
  3062. 'pl': 'pol',
  3063. 'ps': 'pus',
  3064. 'pt': 'por',
  3065. 'qu': 'que',
  3066. 'rm': 'roh',
  3067. 'rn': 'run',
  3068. 'ro': 'ron',
  3069. 'ru': 'rus',
  3070. 'rw': 'kin',
  3071. 'sa': 'san',
  3072. 'sc': 'srd',
  3073. 'sd': 'snd',
  3074. 'se': 'sme',
  3075. 'sg': 'sag',
  3076. 'si': 'sin',
  3077. 'sk': 'slk',
  3078. 'sl': 'slv',
  3079. 'sm': 'smo',
  3080. 'sn': 'sna',
  3081. 'so': 'som',
  3082. 'sq': 'sqi',
  3083. 'sr': 'srp',
  3084. 'ss': 'ssw',
  3085. 'st': 'sot',
  3086. 'su': 'sun',
  3087. 'sv': 'swe',
  3088. 'sw': 'swa',
  3089. 'ta': 'tam',
  3090. 'te': 'tel',
  3091. 'tg': 'tgk',
  3092. 'th': 'tha',
  3093. 'ti': 'tir',
  3094. 'tk': 'tuk',
  3095. 'tl': 'tgl',
  3096. 'tn': 'tsn',
  3097. 'to': 'ton',
  3098. 'tr': 'tur',
  3099. 'ts': 'tso',
  3100. 'tt': 'tat',
  3101. 'tw': 'twi',
  3102. 'ty': 'tah',
  3103. 'ug': 'uig',
  3104. 'uk': 'ukr',
  3105. 'ur': 'urd',
  3106. 'uz': 'uzb',
  3107. 've': 'ven',
  3108. 'vi': 'vie',
  3109. 'vo': 'vol',
  3110. 'wa': 'wln',
  3111. 'wo': 'wol',
  3112. 'xh': 'xho',
  3113. 'yi': 'yid',
  3114. 'ji': 'yid', # Replaced by yi in 1989 revision
  3115. 'yo': 'yor',
  3116. 'za': 'zha',
  3117. 'zh': 'zho',
  3118. 'zu': 'zul',
  3119. }
  3120. @classmethod
  3121. def short2long(cls, code):
  3122. """Convert language code from ISO 639-1 to ISO 639-2/T"""
  3123. return cls._lang_map.get(code[:2])
  3124. @classmethod
  3125. def long2short(cls, code):
  3126. """Convert language code from ISO 639-2/T to ISO 639-1"""
  3127. for short_name, long_name in cls._lang_map.items():
  3128. if long_name == code:
  3129. return short_name
  3130. class ISO3166Utils:
  3131. # From http://data.okfn.org/data/core/country-list
  3132. _country_map = {
  3133. 'AF': 'Afghanistan',
  3134. 'AX': 'Åland Islands',
  3135. 'AL': 'Albania',
  3136. 'DZ': 'Algeria',
  3137. 'AS': 'American Samoa',
  3138. 'AD': 'Andorra',
  3139. 'AO': 'Angola',
  3140. 'AI': 'Anguilla',
  3141. 'AQ': 'Antarctica',
  3142. 'AG': 'Antigua and Barbuda',
  3143. 'AR': 'Argentina',
  3144. 'AM': 'Armenia',
  3145. 'AW': 'Aruba',
  3146. 'AU': 'Australia',
  3147. 'AT': 'Austria',
  3148. 'AZ': 'Azerbaijan',
  3149. 'BS': 'Bahamas',
  3150. 'BH': 'Bahrain',
  3151. 'BD': 'Bangladesh',
  3152. 'BB': 'Barbados',
  3153. 'BY': 'Belarus',
  3154. 'BE': 'Belgium',
  3155. 'BZ': 'Belize',
  3156. 'BJ': 'Benin',
  3157. 'BM': 'Bermuda',
  3158. 'BT': 'Bhutan',
  3159. 'BO': 'Bolivia, Plurinational State of',
  3160. 'BQ': 'Bonaire, Sint Eustatius and Saba',
  3161. 'BA': 'Bosnia and Herzegovina',
  3162. 'BW': 'Botswana',
  3163. 'BV': 'Bouvet Island',
  3164. 'BR': 'Brazil',
  3165. 'IO': 'British Indian Ocean Territory',
  3166. 'BN': 'Brunei Darussalam',
  3167. 'BG': 'Bulgaria',
  3168. 'BF': 'Burkina Faso',
  3169. 'BI': 'Burundi',
  3170. 'KH': 'Cambodia',
  3171. 'CM': 'Cameroon',
  3172. 'CA': 'Canada',
  3173. 'CV': 'Cape Verde',
  3174. 'KY': 'Cayman Islands',
  3175. 'CF': 'Central African Republic',
  3176. 'TD': 'Chad',
  3177. 'CL': 'Chile',
  3178. 'CN': 'China',
  3179. 'CX': 'Christmas Island',
  3180. 'CC': 'Cocos (Keeling) Islands',
  3181. 'CO': 'Colombia',
  3182. 'KM': 'Comoros',
  3183. 'CG': 'Congo',
  3184. 'CD': 'Congo, the Democratic Republic of the',
  3185. 'CK': 'Cook Islands',
  3186. 'CR': 'Costa Rica',
  3187. 'CI': 'Côte d\'Ivoire',
  3188. 'HR': 'Croatia',
  3189. 'CU': 'Cuba',
  3190. 'CW': 'Curaçao',
  3191. 'CY': 'Cyprus',
  3192. 'CZ': 'Czech Republic',
  3193. 'DK': 'Denmark',
  3194. 'DJ': 'Djibouti',
  3195. 'DM': 'Dominica',
  3196. 'DO': 'Dominican Republic',
  3197. 'EC': 'Ecuador',
  3198. 'EG': 'Egypt',
  3199. 'SV': 'El Salvador',
  3200. 'GQ': 'Equatorial Guinea',
  3201. 'ER': 'Eritrea',
  3202. 'EE': 'Estonia',
  3203. 'ET': 'Ethiopia',
  3204. 'FK': 'Falkland Islands (Malvinas)',
  3205. 'FO': 'Faroe Islands',
  3206. 'FJ': 'Fiji',
  3207. 'FI': 'Finland',
  3208. 'FR': 'France',
  3209. 'GF': 'French Guiana',
  3210. 'PF': 'French Polynesia',
  3211. 'TF': 'French Southern Territories',
  3212. 'GA': 'Gabon',
  3213. 'GM': 'Gambia',
  3214. 'GE': 'Georgia',
  3215. 'DE': 'Germany',
  3216. 'GH': 'Ghana',
  3217. 'GI': 'Gibraltar',
  3218. 'GR': 'Greece',
  3219. 'GL': 'Greenland',
  3220. 'GD': 'Grenada',
  3221. 'GP': 'Guadeloupe',
  3222. 'GU': 'Guam',
  3223. 'GT': 'Guatemala',
  3224. 'GG': 'Guernsey',
  3225. 'GN': 'Guinea',
  3226. 'GW': 'Guinea-Bissau',
  3227. 'GY': 'Guyana',
  3228. 'HT': 'Haiti',
  3229. 'HM': 'Heard Island and McDonald Islands',
  3230. 'VA': 'Holy See (Vatican City State)',
  3231. 'HN': 'Honduras',
  3232. 'HK': 'Hong Kong',
  3233. 'HU': 'Hungary',
  3234. 'IS': 'Iceland',
  3235. 'IN': 'India',
  3236. 'ID': 'Indonesia',
  3237. 'IR': 'Iran, Islamic Republic of',
  3238. 'IQ': 'Iraq',
  3239. 'IE': 'Ireland',
  3240. 'IM': 'Isle of Man',
  3241. 'IL': 'Israel',
  3242. 'IT': 'Italy',
  3243. 'JM': 'Jamaica',
  3244. 'JP': 'Japan',
  3245. 'JE': 'Jersey',
  3246. 'JO': 'Jordan',
  3247. 'KZ': 'Kazakhstan',
  3248. 'KE': 'Kenya',
  3249. 'KI': 'Kiribati',
  3250. 'KP': 'Korea, Democratic People\'s Republic of',
  3251. 'KR': 'Korea, Republic of',
  3252. 'KW': 'Kuwait',
  3253. 'KG': 'Kyrgyzstan',
  3254. 'LA': 'Lao People\'s Democratic Republic',
  3255. 'LV': 'Latvia',
  3256. 'LB': 'Lebanon',
  3257. 'LS': 'Lesotho',
  3258. 'LR': 'Liberia',
  3259. 'LY': 'Libya',
  3260. 'LI': 'Liechtenstein',
  3261. 'LT': 'Lithuania',
  3262. 'LU': 'Luxembourg',
  3263. 'MO': 'Macao',
  3264. 'MK': 'Macedonia, the Former Yugoslav Republic of',
  3265. 'MG': 'Madagascar',
  3266. 'MW': 'Malawi',
  3267. 'MY': 'Malaysia',
  3268. 'MV': 'Maldives',
  3269. 'ML': 'Mali',
  3270. 'MT': 'Malta',
  3271. 'MH': 'Marshall Islands',
  3272. 'MQ': 'Martinique',
  3273. 'MR': 'Mauritania',
  3274. 'MU': 'Mauritius',
  3275. 'YT': 'Mayotte',
  3276. 'MX': 'Mexico',
  3277. 'FM': 'Micronesia, Federated States of',
  3278. 'MD': 'Moldova, Republic of',
  3279. 'MC': 'Monaco',
  3280. 'MN': 'Mongolia',
  3281. 'ME': 'Montenegro',
  3282. 'MS': 'Montserrat',
  3283. 'MA': 'Morocco',
  3284. 'MZ': 'Mozambique',
  3285. 'MM': 'Myanmar',
  3286. 'NA': 'Namibia',
  3287. 'NR': 'Nauru',
  3288. 'NP': 'Nepal',
  3289. 'NL': 'Netherlands',
  3290. 'NC': 'New Caledonia',
  3291. 'NZ': 'New Zealand',
  3292. 'NI': 'Nicaragua',
  3293. 'NE': 'Niger',
  3294. 'NG': 'Nigeria',
  3295. 'NU': 'Niue',
  3296. 'NF': 'Norfolk Island',
  3297. 'MP': 'Northern Mariana Islands',
  3298. 'NO': 'Norway',
  3299. 'OM': 'Oman',
  3300. 'PK': 'Pakistan',
  3301. 'PW': 'Palau',
  3302. 'PS': 'Palestine, State of',
  3303. 'PA': 'Panama',
  3304. 'PG': 'Papua New Guinea',
  3305. 'PY': 'Paraguay',
  3306. 'PE': 'Peru',
  3307. 'PH': 'Philippines',
  3308. 'PN': 'Pitcairn',
  3309. 'PL': 'Poland',
  3310. 'PT': 'Portugal',
  3311. 'PR': 'Puerto Rico',
  3312. 'QA': 'Qatar',
  3313. 'RE': 'Réunion',
  3314. 'RO': 'Romania',
  3315. 'RU': 'Russian Federation',
  3316. 'RW': 'Rwanda',
  3317. 'BL': 'Saint Barthélemy',
  3318. 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
  3319. 'KN': 'Saint Kitts and Nevis',
  3320. 'LC': 'Saint Lucia',
  3321. 'MF': 'Saint Martin (French part)',
  3322. 'PM': 'Saint Pierre and Miquelon',
  3323. 'VC': 'Saint Vincent and the Grenadines',
  3324. 'WS': 'Samoa',
  3325. 'SM': 'San Marino',
  3326. 'ST': 'Sao Tome and Principe',
  3327. 'SA': 'Saudi Arabia',
  3328. 'SN': 'Senegal',
  3329. 'RS': 'Serbia',
  3330. 'SC': 'Seychelles',
  3331. 'SL': 'Sierra Leone',
  3332. 'SG': 'Singapore',
  3333. 'SX': 'Sint Maarten (Dutch part)',
  3334. 'SK': 'Slovakia',
  3335. 'SI': 'Slovenia',
  3336. 'SB': 'Solomon Islands',
  3337. 'SO': 'Somalia',
  3338. 'ZA': 'South Africa',
  3339. 'GS': 'South Georgia and the South Sandwich Islands',
  3340. 'SS': 'South Sudan',
  3341. 'ES': 'Spain',
  3342. 'LK': 'Sri Lanka',
  3343. 'SD': 'Sudan',
  3344. 'SR': 'Suriname',
  3345. 'SJ': 'Svalbard and Jan Mayen',
  3346. 'SZ': 'Swaziland',
  3347. 'SE': 'Sweden',
  3348. 'CH': 'Switzerland',
  3349. 'SY': 'Syrian Arab Republic',
  3350. 'TW': 'Taiwan, Province of China',
  3351. 'TJ': 'Tajikistan',
  3352. 'TZ': 'Tanzania, United Republic of',
  3353. 'TH': 'Thailand',
  3354. 'TL': 'Timor-Leste',
  3355. 'TG': 'Togo',
  3356. 'TK': 'Tokelau',
  3357. 'TO': 'Tonga',
  3358. 'TT': 'Trinidad and Tobago',
  3359. 'TN': 'Tunisia',
  3360. 'TR': 'Turkey',
  3361. 'TM': 'Turkmenistan',
  3362. 'TC': 'Turks and Caicos Islands',
  3363. 'TV': 'Tuvalu',
  3364. 'UG': 'Uganda',
  3365. 'UA': 'Ukraine',
  3366. 'AE': 'United Arab Emirates',
  3367. 'GB': 'United Kingdom',
  3368. 'US': 'United States',
  3369. 'UM': 'United States Minor Outlying Islands',
  3370. 'UY': 'Uruguay',
  3371. 'UZ': 'Uzbekistan',
  3372. 'VU': 'Vanuatu',
  3373. 'VE': 'Venezuela, Bolivarian Republic of',
  3374. 'VN': 'Viet Nam',
  3375. 'VG': 'Virgin Islands, British',
  3376. 'VI': 'Virgin Islands, U.S.',
  3377. 'WF': 'Wallis and Futuna',
  3378. 'EH': 'Western Sahara',
  3379. 'YE': 'Yemen',
  3380. 'ZM': 'Zambia',
  3381. 'ZW': 'Zimbabwe',
  3382. # Not ISO 3166 codes, but used for IP blocks
  3383. 'AP': 'Asia/Pacific Region',
  3384. 'EU': 'Europe',
  3385. }
  3386. @classmethod
  3387. def short2full(cls, code):
  3388. """Convert an ISO 3166-2 country code to the corresponding full name"""
  3389. return cls._country_map.get(code.upper())
  3390. class GeoUtils:
  3391. # Major IPv4 address blocks per country
  3392. _country_ip_map = {
  3393. 'AD': '',
  3394. 'AE': '',
  3395. 'AF': '',
  3396. 'AG': '',
  3397. 'AI': '',
  3398. 'AL': '',
  3399. 'AM': '',
  3400. 'AO': '',
  3401. 'AP': '',
  3402. 'AQ': '',
  3403. 'AR': '',
  3404. 'AS': '',
  3405. 'AT': '',
  3406. 'AU': '',
  3407. 'AW': '',
  3408. 'AX': '',
  3409. 'AZ': '',
  3410. 'BA': '',
  3411. 'BB': '',
  3412. 'BD': '',
  3413. 'BE': '',
  3414. 'BF': '',
  3415. 'BG': '',
  3416. 'BH': '',
  3417. 'BI': '',
  3418. 'BJ': '',
  3419. 'BL': '',
  3420. 'BM': '',
  3421. 'BN': '',
  3422. 'BO': '',
  3423. 'BQ': '',
  3424. 'BR': '',
  3425. 'BS': '',
  3426. 'BT': '',
  3427. 'BW': '',
  3428. 'BY': '',
  3429. 'BZ': '',
  3430. 'CA': '',
  3431. 'CD': '',
  3432. 'CF': '',
  3433. 'CG': '',
  3434. 'CH': '',
  3435. 'CI': '',
  3436. 'CK': '',
  3437. 'CL': '',
  3438. 'CM': '',
  3439. 'CN': '',
  3440. 'CO': '',
  3441. 'CR': '',
  3442. 'CU': '',
  3443. 'CV': '',
  3444. 'CW': '',
  3445. 'CY': '',
  3446. 'CZ': '',
  3447. 'DE': '',
  3448. 'DJ': '',
  3449. 'DK': '',
  3450. 'DM': '',
  3451. 'DO': '',
  3452. 'DZ': '',
  3453. 'EC': '',
  3454. 'EE': '',
  3455. 'EG': '',
  3456. 'ER': '',
  3457. 'ES': '',
  3458. 'ET': '',
  3459. 'EU': '',
  3460. 'FI': '',
  3461. 'FJ': '',
  3462. 'FK': '',
  3463. 'FM': '',
  3464. 'FO': '',
  3465. 'FR': '',
  3466. 'GA': '',
  3467. 'GB': '',
  3468. 'GD': '',
  3469. 'GE': '',
  3470. 'GF': '',
  3471. 'GG': '',
  3472. 'GH': '',
  3473. 'GI': '',
  3474. 'GL': '',
  3475. 'GM': '',
  3476. 'GN': '',
  3477. 'GP': '',
  3478. 'GQ': '',
  3479. 'GR': '',
  3480. 'GT': '',
  3481. 'GU': '',
  3482. 'GW': '',
  3483. 'GY': '',
  3484. 'HK': '',
  3485. 'HN': '',
  3486. 'HR': '',
  3487. 'HT': '',
  3488. 'HU': '',
  3489. 'ID': '',
  3490. 'IE': '',
  3491. 'IL': '',
  3492. 'IM': '',
  3493. 'IN': '',
  3494. 'IO': '',
  3495. 'IQ': '',
  3496. 'IR': '',
  3497. 'IS': '',
  3498. 'IT': '',
  3499. 'JE': '',
  3500. 'JM': '',
  3501. 'JO': '',
  3502. 'JP': '',
  3503. 'KE': '',
  3504. 'KG': '',
  3505. 'KH': '',
  3506. 'KI': '',
  3507. 'KM': '',
  3508. 'KN': '',
  3509. 'KP': '',
  3510. 'KR': '',
  3511. 'KW': '',
  3512. 'KY': '',
  3513. 'KZ': '',
  3514. 'LA': '',
  3515. 'LB': '',
  3516. 'LC': '',
  3517. 'LI': '',
  3518. 'LK': '',
  3519. 'LR': '',
  3520. 'LS': '',
  3521. 'LT': '',
  3522. 'LU': '',
  3523. 'LV': '',
  3524. 'LY': '',
  3525. 'MA': '',
  3526. 'MC': '',
  3527. 'MD': '',
  3528. 'ME': '',
  3529. 'MF': '',
  3530. 'MG': '',
  3531. 'MH': '',
  3532. 'MK': '',
  3533. 'ML': '',
  3534. 'MM': '',
  3535. 'MN': '',
  3536. 'MO': '',
  3537. 'MP': '',
  3538. 'MQ': '',
  3539. 'MR': '',
  3540. 'MS': '',
  3541. 'MT': '',
  3542. 'MU': '',
  3543. 'MV': '',
  3544. 'MW': '',
  3545. 'MX': '',
  3546. 'MY': '',
  3547. 'MZ': '',
  3548. 'NA': '',
  3549. 'NC': '',
  3550. 'NE': '',
  3551. 'NF': '',
  3552. 'NG': '',
  3553. 'NI': '',
  3554. 'NL': '',
  3555. 'NO': '',
  3556. 'NP': '',
  3557. 'NR': '',
  3558. 'NU': '',
  3559. 'NZ': '',
  3560. 'OM': '',
  3561. 'PA': '',
  3562. 'PE': '',
  3563. 'PF': '',
  3564. 'PG': '',
  3565. 'PH': '',
  3566. 'PK': '',
  3567. 'PL': '',
  3568. 'PM': '',
  3569. 'PR': '',
  3570. 'PS': '',
  3571. 'PT': '',
  3572. 'PW': '',
  3573. 'PY': '',
  3574. 'QA': '',
  3575. 'RE': '',
  3576. 'RO': '',
  3577. 'RS': '',
  3578. 'RU': '',
  3579. 'RW': '',
  3580. 'SA': '',
  3581. 'SB': '',
  3582. 'SC': '',
  3583. 'SD': '',
  3584. 'SE': '',
  3585. 'SG': '',
  3586. 'SI': '',
  3587. 'SK': '',
  3588. 'SL': '',
  3589. 'SM': '',
  3590. 'SN': '',
  3591. 'SO': '',
  3592. 'SR': '',
  3593. 'SS': '',
  3594. 'ST': '',
  3595. 'SV': '',
  3596. 'SX': '',
  3597. 'SY': '',
  3598. 'SZ': '',
  3599. 'TC': '',
  3600. 'TD': '',
  3601. 'TG': '',
  3602. 'TH': '',
  3603. 'TJ': '',
  3604. 'TK': '',
  3605. 'TL': '',
  3606. 'TM': '',
  3607. 'TN': '',
  3608. 'TO': '',
  3609. 'TR': '',
  3610. 'TT': '',
  3611. 'TV': '',
  3612. 'TW': '',
  3613. 'TZ': '',
  3614. 'UA': '',
  3615. 'UG': '',
  3616. 'US': '',
  3617. 'UY': '',
  3618. 'UZ': '',
  3619. 'VA': '',
  3620. 'VC': '',
  3621. 'VE': '',
  3622. 'VG': '',
  3623. 'VI': '',
  3624. 'VN': '',
  3625. 'VU': '',
  3626. 'WF': '',
  3627. 'WS': '',
  3628. 'YE': '',
  3629. 'YT': '',
  3630. 'ZA': '',
  3631. 'ZM': '',
  3632. 'ZW': '',
  3633. }
  3634. @classmethod
  3635. def random_ipv4(cls, code_or_block):
  3636. if len(code_or_block) == 2:
  3637. block = cls._country_ip_map.get(code_or_block.upper())
  3638. if not block:
  3639. return None
  3640. else:
  3641. block = code_or_block
  3642. addr, preflen = block.split('/')
  3643. addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
  3644. addr_max = addr_min | (0xffffffff >> int(preflen))
  3645. return str(socket.inet_ntoa(
  3646. struct.pack('!L', random.randint(addr_min, addr_max))))
  3647. # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
  3648. # released into Public Domain
  3649. # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
  3650. def long_to_bytes(n, blocksize=0):
  3651. """long_to_bytes(n:long, blocksize:int) : string
  3652. Convert a long integer to a byte string.
  3653. If optional blocksize is given and greater than zero, pad the front of the
  3654. byte string with binary zeros so that the length is a multiple of
  3655. blocksize.
  3656. """
  3657. # after much testing, this algorithm was deemed to be the fastest
  3658. s = b''
  3659. n = int(n)
  3660. while n > 0:
  3661. s = struct.pack('>I', n & 0xffffffff) + s
  3662. n = n >> 32
  3663. # strip off leading zeros
  3664. for i in range(len(s)):
  3665. if s[i] != b'\000'[0]:
  3666. break
  3667. else:
  3668. # only happens when n == 0
  3669. s = b'\000'
  3670. i = 0
  3671. s = s[i:]
  3672. # add back some pad bytes. this could be done more efficiently w.r.t. the
  3673. # de-padding being done above, but sigh...
  3674. if blocksize > 0 and len(s) % blocksize:
  3675. s = (blocksize - len(s) % blocksize) * b'\000' + s
  3676. return s
  3677. def bytes_to_long(s):
  3678. """bytes_to_long(string) : long
  3679. Convert a byte string to a long integer.
  3680. This is (essentially) the inverse of long_to_bytes().
  3681. """
  3682. acc = 0
  3683. length = len(s)
  3684. if length % 4:
  3685. extra = (4 - length % 4)
  3686. s = b'\000' * extra + s
  3687. length = length + extra
  3688. for i in range(0, length, 4):
  3689. acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
  3690. return acc
  3691. def ohdave_rsa_encrypt(data, exponent, modulus):
  3692. """
  3693. Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
  3694. Input:
  3695. data: data to encrypt, bytes-like object
  3696. exponent, modulus: parameter e and N of RSA algorithm, both integer
  3697. Output: hex string of encrypted data
  3698. Limitation: supports one block encryption only
  3699. """
  3700. payload = int(binascii.hexlify(data[::-1]), 16)
  3701. encrypted = pow(payload, exponent, modulus)
  3702. return f'{encrypted:x}'
  3703. def pkcs1pad(data, length):
  3704. """
  3705. Padding input data with PKCS#1 scheme
  3706. @param {int[]} data input data
  3707. @param {int} length target length
  3708. @returns {int[]} padded data
  3709. """
  3710. if len(data) > length - 11:
  3711. raise ValueError('Input data too long for PKCS#1 padding')
  3712. pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
  3713. return [0, 2, *pseudo_random, 0, *data]
  3714. def _base_n_table(n, table):
  3715. if not table and not n:
  3716. raise ValueError('Either table or n must be specified')
  3717. table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
  3718. if n and n != len(table):
  3719. raise ValueError(f'base {n} exceeds table length {len(table)}')
  3720. return table
  3721. def encode_base_n(num, n=None, table=None):
  3722. """Convert given int to a base-n string"""
  3723. table = _base_n_table(n, table)
  3724. if not num:
  3725. return table[0]
  3726. result, base = '', len(table)
  3727. while num:
  3728. result = table[num % base] + result
  3729. num = num // base
  3730. return result
  3731. def decode_base_n(string, n=None, table=None):
  3732. """Convert given base-n string to int"""
  3733. table = {char: index for index, char in enumerate(_base_n_table(n, table))}
  3734. result, base = 0, len(table)
  3735. for char in string:
  3736. result = result * base + table[char]
  3737. return result
  3738. def decode_packed_codes(code):
  3739. mobj = re.search(PACKED_CODES_RE, code)
  3740. obfuscated_code, base, count, symbols = mobj.groups()
  3741. base = int(base)
  3742. count = int(count)
  3743. symbols = symbols.split('|')
  3744. symbol_table = {}
  3745. while count:
  3746. count -= 1
  3747. base_n_count = encode_base_n(count, base)
  3748. symbol_table[base_n_count] = symbols[count] or base_n_count
  3749. return re.sub(
  3750. r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
  3751. obfuscated_code)
  3752. def caesar(s, alphabet, shift):
  3753. if shift == 0:
  3754. return s
  3755. l = len(alphabet)
  3756. return ''.join(
  3757. alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
  3758. for c in s)
  3759. def rot47(s):
  3760. return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
  3761. def parse_m3u8_attributes(attrib):
  3762. info = {}
  3763. for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
  3764. if val.startswith('"'):
  3765. val = val[1:-1]
  3766. info[key] = val
  3767. return info
  3768. def urshift(val, n):
  3769. return val >> n if val >= 0 else (val + 0x100000000) >> n
def write_xattr(path, key, value):
    """Store `value` under the extended attribute `key` of the file at `path`.

    Three strategies are tried in order: an NTFS alternate data stream when
    os.name is 'nt', a Python-level setxattr function, and finally the
    "setfattr" or "xattr" command line tools.

    `value` is written as-is by the first two strategies and decoded to text
    before being handed to an executable, so it is expected to be bytes.

    Raises XAttrMetadataError when writing fails and XAttrUnavailableError
    when no usable tool is found.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if os.name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            # The stream is addressed as "<path>:<key>"
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        # With an older pyxattr, setxattr stays None and the executables
        # below are tried instead
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    # "setfattr" is preferred over "xattr" when both are available
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # Command line arguments are passed as text
    value = value.decode()
    try:
        # The two tools take their arguments in a different form
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    # A nonzero exit status is reported together with the tool's stderr output
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
  3815. def random_birthday(year_field, month_field, day_field):
  3816. start_date = dt.date(1950, 1, 1)
  3817. end_date = dt.date(1995, 12, 31)
  3818. offset = random.randint(0, (end_date - start_date).days)
  3819. random_date = start_date + dt.timedelta(offset)
  3820. return {
  3821. year_field: str(random_date.year),
  3822. month_field: str(random_date.month),
  3823. day_field: str(random_date.day),
  3824. }
  3825. def find_available_port(interface=''):
  3826. try:
  3827. with socket.socket() as sock:
  3828. sock.bind((interface, 0))
  3829. return sock.getsockname()[1]
  3830. except OSError:
  3831. return None
  3832. # Templates for internet shortcut files, which are plain text files.
  3833. DOT_URL_LINK_TEMPLATE = '''\
  3834. [InternetShortcut]
  3835. URL=%(url)s
  3836. '''
  3838. <?xml version="1.0" encoding="UTF-8"?>
  3839. <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
  3840. <plist version="1.0">
  3841. <dict>
  3842. \t<key>URL</key>
  3843. \t<string>%(url)s</string>
  3844. </dict>
  3845. </plist>
  3846. '''
  3848. [Desktop Entry]
  3849. Encoding=UTF-8
  3850. Name=%(filename)s
  3851. Type=Link
  3852. URL=%(url)s
  3853. Icon=text-html
  3854. '''
  3855. LINK_TEMPLATES = {
  3856. 'url': DOT_URL_LINK_TEMPLATE,
  3857. 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
  3858. 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
  3859. }
  3860. def iri_to_uri(iri):
  3861. """
  3862. Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
  3863. The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
  3864. """
  3865. iri_parts = urllib.parse.urlparse(iri)
  3866. if '[' in iri_parts.netloc:
  3867. raise ValueError('IPv6 URIs are not, yet, supported.')
  3868. # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
  3869. # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
  3870. net_location = ''
  3871. if iri_parts.username:
  3872. net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
  3873. if iri_parts.password is not None:
  3874. net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
  3875. net_location += '@'
  3876. net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
  3877. # The 'idna' encoding produces ASCII text.
  3878. if iri_parts.port is not None and iri_parts.port != 80:
  3879. net_location += ':' + str(iri_parts.port)
  3880. return urllib.parse.urlunparse(
  3881. (iri_parts.scheme,
  3882. net_location,
  3883. urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
  3884. # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
  3885. urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
  3886. # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
  3887. urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
  3888. urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
  3889. # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
  3890. def to_high_limit_path(path):
  3891. if sys.platform in ['win32', 'cygwin']:
  3892. # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
  3893. return '\\\\?\\' + os.path.abspath(path)
  3894. return path
  3895. @partial_application
  3896. def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
  3897. val = traversal.traverse_obj(obj, *variadic(field))
  3898. if not val if ignore is NO_DEFAULT else val in variadic(ignore):
  3899. return default
  3900. return template % func(val)
  3901. def clean_podcast_url(url):
  3902. url = re.sub(r'''(?x)
  3903. (?:
  3904. (?:
  3905. chtbl\.com/track|
  3906. media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
  3907. play\.podtrac\.com|
  3908. chrt\.fm/track|
  3909. mgln\.ai/e
  3910. )(?:/[^/.]+)?|
  3911. (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
  3912. flex\.acast\.com|
  3913. pd(?:
  3914. cn\.co| # https://podcorn.com/analytics-prefix/
  3915. st\.fm # https://podsights.com/docs/
  3916. )/e|
  3917. [0-9]\.gum\.fm|
  3918. pscrb\.fm/rss/p
  3919. )/''', '', url)
  3920. return re.sub(r'^\w+://(\w+://)', r'\1', url)
  3921. _HEX_TABLE = '0123456789abcdef'
  3922. def random_uuidv4():
  3923. return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
  3924. def make_dir(path, to_screen=None):
  3925. try:
  3926. dn = os.path.dirname(path)
  3927. if dn:
  3928. os.makedirs(dn, exist_ok=True)
  3929. return True
  3930. except OSError as err:
  3931. if callable(to_screen) is not None:
  3932. to_screen(f'unable to create directory {err}')
  3933. return False
  3934. def get_executable_path():
  3935. from ..update import _get_variant_and_executable_path
  3936. return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
  3937. def get_user_config_dirs(package_name):
  3938. # .config (e.g. ~/.config/package_name)
  3939. xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
  3940. yield os.path.join(xdg_config_home, package_name)
  3941. # appdata (%APPDATA%/package_name)
  3942. appdata_dir = os.getenv('appdata')
  3943. if appdata_dir:
  3944. yield os.path.join(appdata_dir, package_name)
  3945. # home (~/.package_name)
  3946. yield os.path.join(compat_expanduser('~'), f'.{package_name}')
  3947. def get_system_config_dirs(package_name):
  3948. # /etc/package_name
  3949. yield os.path.join('/etc', package_name)
  3950. def time_seconds(**kwargs):
  3951. """
  3952. Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
  3953. """
  3954. return time.time() + dt.timedelta(**kwargs).total_seconds()
  3955. # create a JSON Web Signature (jws) with HS256 algorithm
  3956. # the resulting format is in JWS Compact Serialization
  3957. # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
  3958. # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
  3959. def jwt_encode_hs256(payload_data, key, headers={}):
  3960. header_data = {
  3961. 'alg': 'HS256',
  3962. 'typ': 'JWT',
  3963. }
  3964. if headers:
  3965. header_data.update(headers)
  3966. header_b64 = base64.b64encode(json.dumps(header_data).encode())
  3967. payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
  3968. h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
  3969. signature_b64 = base64.b64encode(h.digest())
  3970. return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
  3971. # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
  3972. def jwt_decode_hs256(jwt):
  3973. header_b64, payload_b64, signature_b64 = jwt.split('.')
  3974. # add trailing ='s that may have been stripped, superfluous ='s are ignored
  3975. return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
  3976. WINDOWS_VT_MODE = False if os.name == 'nt' else None
  3977. @functools.cache
  3978. def supports_terminal_sequences(stream):
  3979. if os.name == 'nt':
  3980. if not WINDOWS_VT_MODE:
  3981. return False
  3982. elif not os.getenv('TERM'):
  3983. return False
  3984. try:
  3985. return stream.isatty()
  3986. except BaseException:
  3987. return False
  3988. def windows_enable_vt_mode():
  3989. """Ref: https://bugs.python.org/issue30075 """
  3990. if get_windows_version() < (10, 0, 10586):
  3991. return
  3992. import ctypes
  3993. import ctypes.wintypes
  3994. import msvcrt
  3996. dll = ctypes.WinDLL('kernel32', use_last_error=False)
  3997. handle = os.open('CONOUT$', os.O_RDWR)
  3998. try:
  3999. h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
  4000. dw_original_mode = ctypes.wintypes.DWORD()
  4001. success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
  4002. if not success:
  4003. raise Exception('GetConsoleMode failed')
  4004. success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
  4005. dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
  4006. if not success:
  4007. raise Exception('SetConsoleMode failed')
  4008. finally:
  4009. os.close(handle)
  4010. global WINDOWS_VT_MODE
  4011. WINDOWS_VT_MODE = True
  4012. supports_terminal_sequences.cache_clear()
  4013. _terminal_sequences_re = re.compile('\033\\[[^m]+m')
  4014. def remove_terminal_sequences(string):
  4015. return _terminal_sequences_re.sub('', string)
  4016. def number_of_digits(number):
  4017. return len('%d' % number)
  4018. def join_nonempty(*values, delim='-', from_dict=None):
  4019. if from_dict is not None:
  4020. values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
  4021. return delim.join(map(str, filter(None, values)))
  4022. def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
  4023. """
  4024. Find the largest format dimensions in terms of video width and, for each thumbnail:
  4025. * Modify the URL: Match the width with the provided regex and replace with the former width
  4026. * Update dimensions
  4027. This function is useful with video services that scale the provided thumbnails on demand
  4028. """
  4029. _keys = ('width', 'height')
  4030. max_dimensions = max(
  4031. (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
  4032. default=(0, 0))
  4033. if not max_dimensions[0]:
  4034. return thumbnails
  4035. return [
  4036. merge_dicts(
  4037. {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
  4038. dict(zip(_keys, max_dimensions)), thumbnail)
  4039. for thumbnail in thumbnails
  4040. ]
  4041. def parse_http_range(range):
  4042. """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
  4043. if not range:
  4044. return None, None, None
  4045. crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
  4046. if not crg:
  4047. return None, None, None
  4048. return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
  4049. def read_stdin(what):
  4050. if what:
  4051. eof = 'Ctrl+Z' if os.name == 'nt' else 'Ctrl+D'
  4052. write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
  4053. return sys.stdin
  4054. def determine_file_encoding(data):
  4055. """
  4056. Detect the text encoding used
  4057. @returns (encoding, bytes to skip)
  4058. """
  4059. # BOM marks are given priority over declarations
  4060. for bom, enc in BOMS:
  4061. if data.startswith(bom):
  4062. return enc, len(bom)
  4063. # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
  4064. # We ignore the endianness to get a good enough match
  4065. data = data.replace(b'\0', b'')
  4066. mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
  4067. return mobj.group(1).decode() if mobj else None, 0
  4068. class Config:
  4069. own_args = None
  4070. parsed_args = None
  4071. filename = None
  4072. __initialized = False
  4073. # Internal only, do not use! Hack to enable --plugin-dirs
  4074. # TODO(coletdjnz): remove when plugin globals system is implemented
  4075. _plugin_dirs = None
  4076. def __init__(self, parser, label=None):
  4077. self.parser, self.label = parser, label
  4078. self._loaded_paths, self.configs = set(), []
  4079. def init(self, args=None, filename=None):
  4080. assert not self.__initialized
  4081. self.own_args, self.filename = args, filename
  4082. return self.load_configs()
  4083. def load_configs(self):
  4084. directory = ''
  4085. if self.filename:
  4086. location = os.path.realpath(self.filename)
  4087. directory = os.path.dirname(location)
  4088. if location in self._loaded_paths:
  4089. return False
  4090. self._loaded_paths.add(location)
  4091. self.__initialized = True
  4092. opts, _ = self.parser.parse_known_args(self.own_args)
  4093. self.parsed_args = self.own_args
  4094. for location in opts.config_locations or []:
  4095. if location == '-':
  4096. if location in self._loaded_paths:
  4097. continue
  4098. self._loaded_paths.add(location)
  4099. self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
  4100. continue
  4101. location = os.path.join(directory, expand_path(location))
  4102. if os.path.isdir(location):
  4103. location = os.path.join(location, 'yt-dlp.conf')
  4104. if not os.path.exists(location):
  4105. self.parser.error(f'config location {location} does not exist')
  4106. self.append_config(self.read_file(location), location)
  4107. return True
  4108. def __str__(self):
  4109. label = join_nonempty(
  4110. self.label, 'config', f'"{self.filename}"' if self.filename else '',
  4111. delim=' ')
  4112. return join_nonempty(
  4113. self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
  4114. *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
  4115. delim='\n')
  4116. @staticmethod
  4117. def read_file(filename, default=[]):
  4118. try:
  4119. optionf = open(filename, 'rb')
  4120. except OSError:
  4121. return default # silently skip if file is not present
  4122. try:
  4123. enc, skip = determine_file_encoding(optionf.read(512))
  4124. optionf.seek(skip, io.SEEK_SET)
  4125. except OSError:
  4126. enc = None # silently skip read errors
  4127. try:
  4128. # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
  4129. contents = optionf.read().decode(enc or preferredencoding())
  4130. res = shlex.split(contents, comments=True)
  4131. except Exception as err:
  4132. raise ValueError(f'Unable to parse "{filename}": {err}')
  4133. finally:
  4134. optionf.close()
  4135. return res
  4136. @staticmethod
  4137. def hide_login_info(opts):
  4138. PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
  4139. eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
  4140. def _scrub_eq(o):
  4141. m = eqre.match(o)
  4142. if m:
  4143. return m.group('key') + '=PRIVATE'
  4144. else:
  4145. return o
  4146. opts = list(map(_scrub_eq, opts))
  4147. for idx, opt in enumerate(opts):
  4148. if opt in PRIVATE_OPTS and idx + 1 < len(opts):
  4149. opts[idx + 1] = 'PRIVATE'
  4150. return opts
  4151. def append_config(self, *args, label=None):
  4152. config = type(self)(self.parser, label)
  4153. config._loaded_paths = self._loaded_paths
  4154. if config.init(*args):
  4155. self.configs.append(config)
  4156. @property
  4157. def all_args(self):
  4158. for config in reversed(self.configs):
  4159. yield from config.all_args
  4160. yield from self.parsed_args or []
  4161. def parse_known_args(self, **kwargs):
  4162. return self.parser.parse_known_args(self.all_args, **kwargs)
  4163. def parse_args(self):
  4164. return self.parser.parse_args(self.all_args)
  4165. def merge_headers(*dicts):
  4166. """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
  4167. return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
  4168. def cached_method(f):
  4169. """Cache a method"""
  4170. signature = inspect.signature(f)
  4171. @functools.wraps(f)
  4172. def wrapper(self, *args, **kwargs):
  4173. bound_args = signature.bind(self, *args, **kwargs)
  4174. bound_args.apply_defaults()
  4175. key = tuple(bound_args.arguments.values())[1:]
  4176. cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
  4177. if key not in cache:
  4178. cache[key] = f(self, *args, **kwargs)
  4179. return cache[key]
  4180. return wrapper
  4181. class classproperty:
  4182. """property access for class methods with optional caching"""
  4183. def __new__(cls, func=None, *args, **kwargs):
  4184. if not func:
  4185. return functools.partial(cls, *args, **kwargs)
  4186. return super().__new__(cls)
  4187. def __init__(self, func, *, cache=False):
  4188. functools.update_wrapper(self, func)
  4189. self.func = func
  4190. self._cache = {} if cache else None
  4191. def __get__(self, _, cls):
  4192. if self._cache is None:
  4193. return self.func(cls)
  4194. elif cls not in self._cache:
  4195. self._cache[cls] = self.func(cls)
  4196. return self._cache[cls]
  4197. class function_with_repr:
  4198. def __init__(self, func, repr_=None):
  4199. functools.update_wrapper(self, func)
  4200. self.func, self.__repr = func, repr_
  4201. def __call__(self, *args, **kwargs):
  4202. return self.func(*args, **kwargs)
  4203. @classmethod
  4204. def set_repr(cls, repr_):
  4205. return functools.partial(cls, repr_=repr_)
  4206. def __repr__(self):
  4207. if self.__repr:
  4208. return self.__repr
  4209. return f'{self.func.__module__}.{self.func.__qualname__}'
  4210. class Namespace(types.SimpleNamespace):
  4211. """Immutable namespace"""
  4212. def __iter__(self):
  4213. return iter(self.__dict__.values())
  4214. @property
  4215. def items_(self):
  4216. return self.__dict__.items()
# Known media file extensions grouped by kind; every group is a tuple of
# extensions written without the leading dot
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# Rebind `video`/`audio` so that they also include the corresponding
# `common_*` extensions (appended after the ones listed above)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
  4230. class _UnsafeExtensionError(Exception):
  4231. """
  4232. Mitigation exception for uncommon/malicious file extensions
  4233. This should be caught in YoutubeDL.py alongside a warning
  4234. Ref: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j
  4235. """
  4236. ALLOWED_EXTENSIONS = frozenset([
  4237. # internal
  4238. 'description',
  4239. 'json',
  4240. 'meta',
  4241. 'orig',
  4242. 'part',
  4243. 'temp',
  4244. 'uncut',
  4245. 'unknown_video',
  4246. 'ytdl',
  4247. # video
  4248. *MEDIA_EXTENSIONS.video,
  4249. 'asx',
  4250. 'ismv',
  4251. 'm2t',
  4252. 'm2ts',
  4253. 'm2v',
  4254. 'm4s',
  4255. 'mng',
  4256. 'mp2v',
  4257. 'mp4v',
  4258. 'mpe',
  4259. 'mpeg',
  4260. 'mpeg1',
  4261. 'mpeg2',
  4262. 'mpeg4',
  4263. 'mxf',
  4264. 'ogm',
  4265. 'qt',
  4266. 'rm',
  4267. 'swf',
  4268. 'ts',
  4269. 'vid',
  4270. 'vob',
  4271. 'vp9',
  4272. # audio
  4273. *MEDIA_EXTENSIONS.audio,
  4274. '3ga',
  4275. 'ac3',
  4276. 'adts',
  4277. 'aif',
  4278. 'au',
  4279. 'dts',
  4280. 'isma',
  4281. 'it',
  4282. 'mid',
  4283. 'mod',
  4284. 'mpga',
  4285. 'mp1',
  4286. 'mp2',
  4287. 'mp4a',
  4288. 'mpa',
  4289. 'ra',
  4290. 'shn',
  4291. 'xm',
  4292. # image
  4293. *MEDIA_EXTENSIONS.thumbnails,
  4294. 'avif',
  4295. 'bmp',
  4296. 'gif',
  4297. 'heic',
  4298. 'ico',
  4299. 'image',
  4300. 'jfif',
  4301. 'jng',
  4302. 'jpe',
  4303. 'jpeg',
  4304. 'jxl',
  4305. 'svg',
  4306. 'tif',
  4307. 'tiff',
  4308. 'wbmp',
  4309. # subtitle
  4310. *MEDIA_EXTENSIONS.subtitles,
  4311. 'dfxp',
  4312. 'fs',
  4313. 'ismt',
  4314. 'json3',
  4315. 'sami',
  4316. 'scc',
  4317. 'srv1',
  4318. 'srv2',
  4319. 'srv3',
  4320. 'ssa',
  4321. 'tt',
  4322. 'ttml',
  4323. 'xml',
  4324. # others
  4325. *MEDIA_EXTENSIONS.manifests,
  4326. *MEDIA_EXTENSIONS.storyboards,
  4327. 'desktop',
  4328. 'ism',
  4329. 'm3u',
  4330. 'sbv',
  4331. 'url',
  4332. 'webloc',
  4333. ])
  4334. def __init__(self, extension, /):
  4335. super().__init__(f'unsafe file extension: {extension!r}')
  4336. self.extension = extension
  4337. @classmethod
  4338. def sanitize_extension(cls, extension, /, *, prepend=False):
  4339. if extension is None:
  4340. return None
  4341. if '/' in extension or '\\' in extension:
  4342. raise cls(extension)
  4343. if not prepend:
  4344. _, _, last = extension.rpartition('.')
  4345. if last == 'bin':
  4346. extension = last = 'unknown_video'
  4347. if last.lower() not in cls.ALLOWED_EXTENSIONS:
  4348. raise cls(extension)
  4349. return extension
  4350. class RetryManager:
  4351. """Usage:
  4352. for retry in RetryManager(...):
  4353. try:
  4354. ...
  4355. except SomeException as err:
  4356. retry.error = err
  4357. continue
  4358. """
  4359. attempt, _error = 0, None
  4360. def __init__(self, _retries, _error_callback, **kwargs):
  4361. self.retries = _retries or 0
  4362. self.error_callback = functools.partial(_error_callback, **kwargs)
  4363. def _should_retry(self):
  4364. return self._error is not NO_DEFAULT and self.attempt <= self.retries
  4365. @property
  4366. def error(self):
  4367. if self._error is NO_DEFAULT:
  4368. return None
  4369. return self._error
  4370. @error.setter
  4371. def error(self, value):
  4372. self._error = value
  4373. def __iter__(self):
  4374. while self._should_retry():
  4375. self.error = NO_DEFAULT
  4376. self.attempt += 1
  4377. yield self
  4378. if self.error:
  4379. self.error_callback(self.error, self.attempt, self.retries)
  4380. @staticmethod
  4381. def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
  4382. """Utility function for reporting retries"""
  4383. if count > retries:
  4384. if error:
  4385. return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
  4386. raise e
  4387. if not count:
  4388. return warn(e)
  4389. elif isinstance(e, ExtractorError):
  4390. e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
  4391. warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
  4392. delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
  4393. if delay:
  4394. info(f'Sleeping {delay:.2f} seconds ...')
  4395. time.sleep(delay)
  4396. @partial_application
  4397. def make_archive_id(ie, video_id):
  4398. ie_key = ie if isinstance(ie, str) else ie.ie_key()
  4399. return f'{ie_key.lower()} {video_id}'
  4400. @partial_application
  4401. def truncate_string(s, left, right=0):
  4402. assert left > 3 and right >= 0
  4403. if s is None or len(s) <= left + right:
  4404. return s
  4405. return f'{s[:left - 3]}...{s[-right:] if right else ""}'
  4406. def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
  4407. assert 'all' in alias_dict, '"all" alias is required'
  4408. requested = list(start or [])
  4409. for val in options:
  4410. discard = val.startswith('-')
  4411. if discard:
  4412. val = val[1:]
  4413. if val in alias_dict:
  4414. val = alias_dict[val] if not discard else [
  4415. i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
  4416. # NB: Do not allow regex in aliases for performance
  4417. requested = orderedSet_from_options(val, alias_dict, start=requested)
  4418. continue
  4419. current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
  4420. else [val] if val in alias_dict['all'] else None)
  4421. if current is None:
  4422. raise ValueError(val)
  4423. if discard:
  4424. for item in current:
  4425. while item in requested:
  4426. requested.remove(item)
  4427. else:
  4428. requested.extend(current)
  4429. return orderedSet(requested)
  4430. # TODO: Rewrite
class FormatSorter:
    """
    Turns a list of sort fields into per-format sort keys.

    The effective field order is assembled in `evaluate_params` and stored in
    `self._order`; `calculate_preference` then maps a format dict to a tuple
    with one entry per field in that order.

    NB: `settings` is defined on the class and is written to by
    `_get_field_setting`, `_resolve_field_value` and `evaluate_params`, so
    those writes are shared by every instance using the same dict.
    """

    # Grammar of a single sort item: optional '+' (reverse), the field name and
    # optionally a ':' or '~' separator followed by a limit
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    # Sort items that are always appended after the user's and the extractor's
    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Same items as `default`, except for 'vcodec:vp9.2' in place of 'vcodec'
    _prefer_vp9_sort = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                        'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
                        'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')
    # Alternative sort order; not referenced within this class
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration. Keys that are absent are filled in lazily with
    # defaults by `_get_field_setting`
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', r'vp0?9\.0?2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: min(filter(None, it), default=0)},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """
        @param ydl               Object providing `params`, `deprecated_feature` and `write_debug`
        @param field_preference  Sort items given by the extractor
        """
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """
        Return setting `key` of `field`, computing and storing a default if it is missing.

        A field that is not in `settings` is registered with an empty configuration
        after a deprecation notice - except when only 'forced' or 'priority' is
        queried, in which case False is returned without registering it.
        """
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        prop_obj = self.settings[field]
        if key not in prop_obj:
            type_ = prop_obj.get('type')
            if key == 'field':
                default = 'preference' if type_ == 'extractor' else (field,) if type_ in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type_ == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key)
            # The computed default is cached in the settings dict
            prop_obj[key] = default
        return prop_obj[key]

    def _resolve_field_value(self, field, value, convert_none=False):
        """
        Convert a raw (string) value of `field` according to the field's 'convert' setting.

        `None` is returned as-is unless `convert_none` is set; any other value is
        lower-cased first. For the 'order' conversion the result is a rank where
        earlier entries of the order list yield larger numbers, and values not in
        the list get the rank of the '' entry.
        """
        if value is None:
            if not convert_none:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            # 'order_free' is only consulted when free formats are preferred
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    # Falsy entries ('' and None) are placeholders, not patterns
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or  value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # Non-numeric value: treat this field as a string from now on
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """
        Build `self._order` and store each field's reverse/closest/limit settings.

        Items are processed in this order: forced default fields, priority default
        fields (skipped if `format_sort_force` is set), the user's `format_sort`,
        the extractor's items and finally all defaults. Only the first occurrence
        of a field takes effect.

        Raises ExtractorError if an item does not match `self.regex`.
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                # "closest" is meaningless without a limit
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
                else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError(f'Invalid format sort string "{item}" given by extractor')
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A 'combined' field expands to its component fields; its limit may
            # hold one ':'-separated limit per component
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Report the user's and extractor's sort items and the resulting visible field order via `write_debug`"""
        if self._sort_user:
            write_debug('Sort order given by user: {}'.format(', '.join(self._sort_user)))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: {}'.format(', '.join(self._sort_extractor)))
        write_debug('Formats sorted by: {}'.format(', '.join(['{}{}{}'.format(
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '{}{}({})'.format('~' if self._get_field_setting(field, 'closest') else ':',
                              self._get_field_setting(field, 'limit_text'),
                              self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')])))

    def _calculate_field_preference_from_value(self, format_, field, type_, value):
        """
        Map the raw `value` of `field` to a tuple used as sort key.

        `type_` selects the pre-processing ('extractor', 'boolean', 'ordered');
        other types use the value unchanged. `format_` is not used here.
        """
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type_ == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type_ == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type_ == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        # The first element separates the groups: missing values (-10),
        # non-numeric values (1), numeric values (0, or -1 in the final fallback case)
        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format_, field):
        """Read the value(s) of `field` from `format_` and return the corresponding sort-key tuple"""
        type_ = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format_.get(self._get_field_setting(f, 'field'))
        if type_ == 'multiple':
            type_ = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            # The field's 'function' reduces the component values to a single one
            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format_, field, type_, value)

    @staticmethod
    def _fill_sorting_fields(format):
        """Fill in, in place, the keys of `format` that sorting relies on (protocol, ext, video_ext, audio_ext, preference, bitrates)"""
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url']).lower()
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        # Derive whichever of vbr/abr/tbr is missing from the other two;
        # a result of 0 (or a failed computation) is stored as None
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

    def calculate_preference(self, format):
        """Return the sort key of `format`: one tuple per field in `self._order`. Modifies `format` in place"""
        self._fill_sorting_fields(format)
        return tuple(self._calculate_field_preference(format, field) for field in self._order)
  4705. def filesize_from_tbr(tbr, duration):
  4706. """
  4707. @param tbr: Total bitrate in kbps (1000 bits/sec)
  4708. @param duration: Duration in seconds
  4709. @returns Filesize in bytes
  4710. """
  4711. if tbr is None or duration is None:
  4712. return None
  4713. return int(duration * tbr * (1000 / 8))
  4714. def _request_dump_filename(url, video_id, data=None, trim_length=None):
  4715. if data is not None:
  4716. data = hashlib.md5(data).hexdigest()
  4717. basen = join_nonempty(video_id, data, url, delim='_')
  4718. trim_length = trim_length or 240
  4719. if len(basen) > trim_length:
  4720. h = '___' + hashlib.md5(basen.encode()).hexdigest()
  4721. basen = basen[:trim_length - len(h)] + h
  4722. filename = sanitize_filename(f'{basen}.dump', restricted=True)
  4723. # Working around MAX_PATH limitation on Windows (see
  4724. # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
  4725. if os.name == 'nt':
  4726. absfilepath = os.path.abspath(filename)
  4727. if len(absfilepath) > 259:
  4728. filename = fR'\\?\{absfilepath}'
  4729. return filename
  4730. # XXX: Temporary
  4731. class _YDLLogger:
  4732. def __init__(self, ydl=None):
  4733. self._ydl = ydl
  4734. def debug(self, message):
  4735. if self._ydl:
  4736. self._ydl.write_debug(message)
  4737. def info(self, message):
  4738. if self._ydl:
  4739. self._ydl.to_screen(message)
  4740. def warning(self, message, *, once=False):
  4741. if self._ydl:
  4742. self._ydl.report_warning(message, once)
  4743. def error(self, message, *, is_error=True):
  4744. if self._ydl:
  4745. self._ydl.report_error(message, is_error=is_error)
  4746. def stdout(self, message):
  4747. if self._ydl:
  4748. self._ydl.to_stdout(message)
  4749. def stderr(self, message):
  4750. if self._ydl:
  4751. self._ydl.to_stderr(message)