util.py 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808
import collections
from datetime import datetime
import gzip
import json
import os
import re
import time
import traceback
import urllib.error
import urllib.parse
import urllib.request

import gevent
import gevent.lock
import gevent.queue
import socks
import sockshandler
import stem
import stem.control
try:
    import brotli
    have_brotli = True
except ImportError:
    have_brotli = False

import settings
# The trouble with the requests library: It ships its own certificate bundle via certifi
# instead of using the system certificate store, meaning self-signed certificates
# configured by the user will not work. Some draconian networks block TLS unless a corporate
# certificate is installed on the system. Additionally, some users install a self-signed cert
# in order to use programs to modify or monitor requests made by programs on the system.
# Finally, certificates expire and need to be updated, or are sometimes revoked. Sometimes
# certificate authorities go rogue and need to be untrusted. Since we are going through Tor exit nodes,
# this becomes all the more important. A rogue CA could issue a fake certificate for accounts.google.com, and a
# malicious exit node could use this to decrypt traffic when logging in and retrieve passwords. Examples:
# https://www.engadget.com/2015/10/29/google-warns-symantec-over-certificates/
# https://nakedsecurity.sophos.com/2013/12/09/serious-security-google-finds-fake-but-trusted-ssl-certificates-for-its-domains-made-in-france/
# In the requests documentation it says:
# "Before version 2.16, Requests bundled a set of root CAs that it trusted, sourced from the Mozilla trust store.
# The certificates were only updated once for each Requests version. When certifi was not installed,
# this led to extremely out-of-date certificate bundles when using significantly older versions of Requests.
# For the sake of security we recommend upgrading certifi frequently!"
# (http://docs.python-requests.org/en/master/user/advanced/#ca-certificates)
# Expecting users to remember to manually update certifi on Linux isn't reasonable in my view.
# On Windows, this is even worse since I am distributing all dependencies. This program is not
# updated frequently, and using requests would lead to outdated certificates. Certificates
# should be updated with OS updates, instead of thousands of developers of different programs
# being expected to do this correctly 100% of the time.
# There is hope that this might be fixed eventually:
# https://github.com/kennethreitz/requests/issues/2966
# Until then, I will use a mix of urllib3 and urllib.
  48. import urllib3
  49. import urllib3.contrib.socks
# Prefix placed in front of the site-relative links built in this module
# (see add_extra_html_info).
URL_ORIGIN = "/https://www.youtube.com"

# Shared urllib3 pool for direct (non-Tor) requests; certificate
# verification is required.
connection_pool = urllib3.PoolManager(cert_reqs='CERT_REQUIRED')
  52. class TorManager:
  53. MAX_TRIES = 3
  54. # Remember the 7-sec wait times, so make cooldown be two of those
  55. # (otherwise it will retry forever if 429s never end)
  56. COOLDOWN_TIME = 14
  57. def __init__(self):
  58. self.old_tor_connection_pool = None
  59. self.tor_connection_pool = urllib3.contrib.socks.SOCKSProxyManager(
  60. 'socks5h://127.0.0.1:' + str(settings.tor_port) + '/',
  61. cert_reqs='CERT_REQUIRED')
  62. self.tor_pool_refresh_time = time.monotonic()
  63. settings.add_setting_changed_hook(
  64. 'tor_port',
  65. lambda old_val, new_val: self.refresh_tor_connection_pool(),
  66. )
  67. self.new_identity_lock = gevent.lock.BoundedSemaphore(1)
  68. self.last_new_identity_time = time.monotonic() - 20
  69. self.try_num = 1
  70. def refresh_tor_connection_pool(self):
  71. self.tor_connection_pool.clear()
  72. # Keep a reference for 5 min to avoid it getting garbage collected
  73. # while sockets still in use
  74. self.old_tor_connection_pool = self.tor_connection_pool
  75. self.tor_connection_pool = urllib3.contrib.socks.SOCKSProxyManager(
  76. 'socks5h://127.0.0.1:' + str(settings.tor_port) + '/',
  77. cert_reqs='CERT_REQUIRED')
  78. self.tor_pool_refresh_time = time.monotonic()
  79. def get_tor_connection_pool(self):
  80. # Tor changes circuits after 10 minutes:
  81. # https://tor.stackexchange.com/questions/262/for-how-long-does-a-circuit-stay-alive
  82. current_time = time.monotonic()
  83. # close pool after 5 minutes
  84. if current_time - self.tor_pool_refresh_time > 300:
  85. self.refresh_tor_connection_pool()
  86. return self.tor_connection_pool
  87. def new_identity(self, time_failed_request_started):
  88. '''return error, or None if no error and the identity is fresh'''
  89. # The overall pattern at maximum (always returning 429) will be
  90. # R N (0) R N (6) R N (6) R | (12) R N (0) R N (6) ...
  91. # where R is a request, N is a new identity, (x) is a wait time of
  92. # x sec, and | is where we give up and display an error to the user.
  93. print('new_identity: new_identity called')
  94. # blocks if another greenlet currently has the lock
  95. self.new_identity_lock.acquire()
  96. print('new_identity: New identity lock acquired')
  97. try:
  98. # This was caused by a request that failed within a previous,
  99. # stale identity
  100. if time_failed_request_started <= self.last_new_identity_time:
  101. print('new_identity: Cancelling; request was from stale identity')
  102. return None
  103. delta = time.monotonic() - self.last_new_identity_time
  104. if delta < self.COOLDOWN_TIME and self.try_num == 1:
  105. err = ('Retried with new circuit %d times (max) within last '
  106. '%d seconds.' % (self.MAX_TRIES, self.COOLDOWN_TIME))
  107. print('new_identity:', err)
  108. return err
  109. elif delta >= self.COOLDOWN_TIME:
  110. self.try_num = 1
  111. try:
  112. port = settings.tor_control_port
  113. with stem.control.Controller.from_port(port=port) as controller:
  114. controller.authenticate('')
  115. print('new_identity: Getting new identity')
  116. controller.signal(stem.Signal.NEWNYM)
  117. print('new_identity: NEWNYM signal sent')
  118. self.last_new_identity_time = time.monotonic()
  119. self.refresh_tor_connection_pool()
  120. except stem.SocketError:
  121. traceback.print_exc()
  122. return 'Failed to connect to Tor control port.'
  123. finally:
  124. original_try_num = self.try_num
  125. self.try_num += 1
  126. if self.try_num > self.MAX_TRIES:
  127. self.try_num = 1
  128. # If we do the request right after second new identity it won't
  129. # be a new IP, based on experiments.
  130. # Not necessary after first new identity
  131. if original_try_num > 1:
  132. print('Sleeping for 7 seconds before retrying request')
  133. time.sleep(7) # experimentally determined minimum
  134. return None
  135. finally:
  136. self.new_identity_lock.release()
  137. tor_manager = TorManager()
  138. def get_pool(use_tor):
  139. if not use_tor:
  140. return connection_pool
  141. return tor_manager.get_tor_connection_pool()
  142. class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
  143. '''Separate cookiejars for receiving and sending'''
  144. def __init__(self, cookiejar_send=None, cookiejar_receive=None):
  145. import http.cookiejar
  146. self.cookiejar_send = cookiejar_send
  147. self.cookiejar_receive = cookiejar_receive
  148. def http_request(self, request):
  149. if self.cookiejar_send is not None:
  150. self.cookiejar_send.add_cookie_header(request)
  151. return request
  152. def http_response(self, request, response):
  153. if self.cookiejar_receive is not None:
  154. self.cookiejar_receive.extract_cookies(response, request)
  155. return response
  156. https_request = http_request
  157. https_response = http_response
  158. class FetchError(Exception):
  159. def __init__(self, code, reason='', ip=None, error_message=None):
  160. if error_message:
  161. string = code + ' ' + reason + ': ' + error_message
  162. else:
  163. string = 'HTTP error during request: ' + code + ' ' + reason
  164. Exception.__init__(self, string)
  165. self.code = code
  166. self.reason = reason
  167. self.ip = ip
  168. self.error_message = error_message
  169. def decode_content(content, encoding_header):
  170. encodings = encoding_header.replace(' ', '').split(',')
  171. for encoding in reversed(encodings):
  172. if encoding == 'identity':
  173. continue
  174. if encoding == 'br':
  175. content = brotli.decompress(content)
  176. elif encoding == 'gzip':
  177. content = gzip.decompress(content)
  178. return content
  179. def fetch_url_response(url, headers=(), timeout=15, data=None,
  180. cookiejar_send=None, cookiejar_receive=None,
  181. use_tor=True, max_redirects=None):
  182. '''
  183. returns response, cleanup_function
  184. When cookiejar_send is set to a CookieJar object,
  185. those cookies will be sent in the request (but cookies in response will not be merged into it)
  186. When cookiejar_receive is set to a CookieJar object,
  187. cookies received in the response will be merged into the object (nothing will be sent from it)
  188. When both are set to the same object, cookies will be sent from the object,
  189. and response cookies will be merged into it.
  190. '''
  191. headers = dict(headers) # Note: Calling dict() on a dict will make a copy
  192. if have_brotli:
  193. headers['Accept-Encoding'] = 'gzip, br'
  194. else:
  195. headers['Accept-Encoding'] = 'gzip'
  196. # prevent python version being leaked by urllib if User-Agent isn't provided
  197. # (urllib will use ex. Python-urllib/3.6 otherwise)
  198. if 'User-Agent' not in headers and 'user-agent' not in headers and 'User-agent' not in headers:
  199. headers['User-Agent'] = 'Python-urllib'
  200. method = "GET"
  201. if data is not None:
  202. method = "POST"
  203. if isinstance(data, str):
  204. data = data.encode('utf-8')
  205. elif not isinstance(data, bytes):
  206. data = urllib.parse.urlencode(data).encode('utf-8')
  207. if cookiejar_send is not None or cookiejar_receive is not None: # Use urllib
  208. req = urllib.request.Request(url, data=data, headers=headers)
  209. cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)
  210. if use_tor and settings.route_tor:
  211. opener = urllib.request.build_opener(sockshandler.SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", settings.tor_port), cookie_processor)
  212. else:
  213. opener = urllib.request.build_opener(cookie_processor)
  214. response = opener.open(req, timeout=timeout)
  215. cleanup_func = (lambda r: None)
  216. else: # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them.
  217. # default: Retry.DEFAULT = Retry(3)
  218. # (in connectionpool.py in urllib3)
  219. # According to the documentation for urlopen, a redirect counts as a
  220. # retry. So there are 3 redirects max by default.
  221. if max_redirects:
  222. retries = urllib3.Retry(3+max_redirects, redirect=max_redirects, raise_on_redirect=False)
  223. else:
  224. retries = urllib3.Retry(3, raise_on_redirect=False)
  225. pool = get_pool(use_tor and settings.route_tor)
  226. try:
  227. response = pool.request(method, url, headers=headers, body=data,
  228. timeout=timeout, preload_content=False,
  229. decode_content=False, retries=retries)
  230. response.retries = retries
  231. except urllib3.exceptions.MaxRetryError as e:
  232. exception_cause = e.__context__.__context__
  233. if (isinstance(exception_cause, socks.ProxyConnectionError)
  234. and settings.route_tor):
  235. msg = ('Failed to connect to Tor. Check that Tor is open and '
  236. 'that your internet connection is working.\n\n'
  237. + str(e))
  238. raise FetchError('502', reason='Bad Gateway',
  239. error_message=msg)
  240. elif isinstance(e.__context__,
  241. urllib3.exceptions.NewConnectionError):
  242. msg = 'Failed to establish a connection.\n\n' + str(e)
  243. raise FetchError(
  244. '502', reason='Bad Gateway',
  245. error_message=msg)
  246. else:
  247. raise
  248. cleanup_func = (lambda r: r.release_conn())
  249. return response, cleanup_func
  250. def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
  251. cookiejar_send=None, cookiejar_receive=None, use_tor=True,
  252. debug_name=None):
  253. while True:
  254. start_time = time.monotonic()
  255. response, cleanup_func = fetch_url_response(
  256. url, headers, timeout=timeout, data=data,
  257. cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive,
  258. use_tor=use_tor)
  259. response_time = time.monotonic()
  260. content = response.read()
  261. read_finish = time.monotonic()
  262. cleanup_func(response) # release_connection for urllib3
  263. content = decode_content(
  264. content,
  265. response.getheader('Content-Encoding', default='identity'))
  266. if (settings.debugging_save_responses
  267. and debug_name is not None and content):
  268. save_dir = os.path.join(settings.data_dir, 'debug')
  269. if not os.path.exists(save_dir):
  270. os.makedirs(save_dir)
  271. with open(os.path.join(save_dir, debug_name), 'wb') as f:
  272. f.write(content)
  273. if response.status == 429 or (
  274. response.status == 302 and (response.getheader('Location') == url
  275. or response.getheader('Location').startswith(
  276. 'https://www.google.com/sorry/index'
  277. )
  278. )
  279. ):
  280. print(response.status, response.reason, response.headers)
  281. ip = re.search(
  282. br'IP address: ((?:[\da-f]*:)+[\da-f]+|(?:\d+\.)+\d+)',
  283. content)
  284. ip = ip.group(1).decode('ascii') if ip else None
  285. if not ip:
  286. ip = re.search(r'IP=((?:\d+\.)+\d+)',
  287. response.getheader('Set-Cookie') or '')
  288. ip = ip.group(1) if ip else None
  289. # don't get new identity if we're not using Tor
  290. if not use_tor:
  291. raise FetchError('429', reason=response.reason, ip=ip)
  292. print('Error: YouTube blocked the request because the Tor exit node is overutilized. Exit node IP address: %s' % ip)
  293. # get new identity
  294. error = tor_manager.new_identity(start_time)
  295. if error:
  296. raise FetchError(
  297. '429', reason=response.reason, ip=ip,
  298. error_message='Automatic circuit change: ' + error)
  299. else:
  300. continue # retry now that we have new identity
  301. elif response.status >= 400:
  302. raise FetchError(str(response.status), reason=response.reason,
  303. ip=None)
  304. break
  305. if report_text:
  306. print(report_text, ' Latency:', round(response_time - start_time, 3), ' Read time:', round(read_finish - response_time,3))
  307. return content
  308. def head(url, use_tor=False, report_text=None, max_redirects=10):
  309. pool = get_pool(use_tor and settings.route_tor)
  310. start_time = time.monotonic()
  311. # default: Retry.DEFAULT = Retry(3)
  312. # (in connectionpool.py in urllib3)
  313. # According to the documentation for urlopen, a redirect counts as a retry
  314. # So there are 3 redirects max by default. Let's change that
  315. # to 10 since googlevideo redirects a lot.
  316. retries = urllib3.Retry(
  317. 3+max_redirects,
  318. redirect=max_redirects,
  319. raise_on_redirect=False)
  320. headers = {'User-Agent': 'Python-urllib'}
  321. response = pool.request('HEAD', url, headers=headers, retries=retries)
  322. if report_text:
  323. print(
  324. report_text,
  325. ' Latency:',
  326. round(time.monotonic() - start_time, 3))
  327. return response
  328. mobile_user_agent = 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.80 Mobile Safari/537.36'
  329. mobile_ua = (('User-Agent', mobile_user_agent),)
  330. desktop_user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:124.0) Gecko/20100101 Firefox/124.0'
  331. desktop_ua = (('User-Agent', desktop_user_agent),)
  332. json_header = (('Content-Type', 'application/json'),)
  333. desktop_xhr_headers = (
  334. ('Accept', '*/*'),
  335. ('Accept-Language', 'en-US,en;q=0.5'),
  336. ('X-YouTube-Client-Name', '1'),
  337. ('X-YouTube-Client-Version', '2.20240327.00.00'),
  338. ) + desktop_ua
  339. mobile_xhr_headers = (
  340. ('Accept', '*/*'),
  341. ('Accept-Language', 'en-US,en;q=0.5'),
  342. ('X-YouTube-Client-Name', '1'),
  343. ('X-YouTube-Client-Version', '2.20240328.08.00'),
  344. ) + mobile_ua
  345. class RateLimitedQueue(gevent.queue.Queue):
  346. ''' Does initial_burst (def. 30) at first, then alternates between waiting waiting_period (def. 5) seconds and doing subsequent_bursts (def. 10) queries. After 5 seconds with nothing left in the queue, resets rate limiting. '''
  347. def __init__(self, initial_burst=30, waiting_period=5, subsequent_bursts=10):
  348. self.initial_burst = initial_burst
  349. self.waiting_period = waiting_period
  350. self.subsequent_bursts = subsequent_bursts
  351. self.count_since_last_wait = 0
  352. self.surpassed_initial = False
  353. self.lock = gevent.lock.BoundedSemaphore(1)
  354. self.currently_empty = False
  355. self.empty_start = 0
  356. gevent.queue.Queue.__init__(self)
  357. def get(self):
  358. with self.lock: # blocks if another greenlet currently has the lock
  359. if ((self.count_since_last_wait >= self.subsequent_bursts and self.surpassed_initial) or
  360. (self.count_since_last_wait >= self.initial_burst and not self.surpassed_initial)):
  361. self.surpassed_initial = True
  362. gevent.sleep(self.waiting_period)
  363. self.count_since_last_wait = 0
  364. self.count_since_last_wait += 1
  365. if not self.currently_empty and self.empty():
  366. self.currently_empty = True
  367. self.empty_start = time.monotonic()
  368. item = gevent.queue.Queue.get(self) # blocks when nothing left
  369. if self.currently_empty:
  370. if time.monotonic() - self.empty_start >= self.waiting_period:
  371. self.count_since_last_wait = 0
  372. self.surpassed_initial = False
  373. self.currently_empty = False
  374. return item
  375. def download_thumbnail(save_directory, video_id):
  376. url = f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"
  377. save_location = os.path.join(save_directory, video_id + ".jpg")
  378. try:
  379. thumbnail = fetch_url(url, report_text="Saved thumbnail: " + video_id)
  380. except urllib.error.HTTPError as e:
  381. print("Failed to download thumbnail for " + video_id + ": " + str(e))
  382. return False
  383. try:
  384. f = open(save_location, 'wb')
  385. except FileNotFoundError:
  386. os.makedirs(save_directory, exist_ok=True)
  387. f = open(save_location, 'wb')
  388. f.write(thumbnail)
  389. f.close()
  390. return True
  391. def download_thumbnails(save_directory, ids):
  392. if not isinstance(ids, (list, tuple)):
  393. ids = list(ids)
  394. # only do 5 at a time
  395. # do the n where n is divisible by 5
  396. i = -1
  397. for i in range(0, int(len(ids)/5) - 1 ):
  398. gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i*5, i*5 + 5)])
  399. # do the remainders (< 5)
  400. gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i*5 + 5, len(ids))])
  401. def dict_add(*dicts):
  402. for dictionary in dicts[1:]:
  403. dicts[0].update(dictionary)
  404. return dicts[0]
  405. def video_id(url):
  406. url_parts = urllib.parse.urlparse(url)
  407. return urllib.parse.parse_qs(url_parts.query)['v'][0]
  408. # default, sddefault, mqdefault, hqdefault, hq720
  409. def get_thumbnail_url(video_id):
  410. return f"{settings.img_prefix}https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"
  411. def seconds_to_timestamp(seconds):
  412. seconds = int(seconds)
  413. hours, seconds = divmod(seconds, 3600)
  414. minutes, seconds = divmod(seconds, 60)
  415. if hours != 0:
  416. timestamp = str(hours) + ":"
  417. timestamp += str(minutes).zfill(2) # zfill pads with zeros
  418. else:
  419. timestamp = str(minutes)
  420. timestamp += ":" + str(seconds).zfill(2)
  421. return timestamp
  422. def update_query_string(query_string, items):
  423. parameters = urllib.parse.parse_qs(query_string)
  424. parameters.update(items)
  425. return urllib.parse.urlencode(parameters, doseq=True)
  426. YOUTUBE_DOMAINS = ('youtube.com', 'youtu.be', 'youtube-nocookie.com')
  427. YOUTUBE_URL_RE_STR = r'https?://(?:[a-zA-Z0-9_-]*\.)?(?:'
  428. YOUTUBE_URL_RE_STR += r'|'.join(map(re.escape, YOUTUBE_DOMAINS))
  429. YOUTUBE_URL_RE_STR += r')(?:/[^"]*)?'
  430. YOUTUBE_URL_RE = re.compile(YOUTUBE_URL_RE_STR)
  431. def prefix_url(url):
  432. if url is None:
  433. return None
  434. url = url.lstrip('/') # some urls have // before them, which has a special meaning
  435. return '/' + url
  436. def left_remove(string, substring):
  437. '''removes substring from the start of string, if present'''
  438. if string.startswith(substring):
  439. return string[len(substring):]
  440. return string
  441. def concat_or_none(*strings):
  442. '''Concatenates strings. Returns None if any of the arguments are None'''
  443. result = ''
  444. for string in strings:
  445. if string is None:
  446. return None
  447. result += string
  448. return result
  449. def prefix_urls(item):
  450. if settings.proxy_images:
  451. try:
  452. item['thumbnail'] = prefix_url(item['thumbnail'])
  453. except KeyError:
  454. pass
  455. try:
  456. item['author_url'] = prefix_url(item['author_url'])
  457. except KeyError:
  458. pass
  459. def add_extra_html_info(item):
  460. if item['type'] == 'video':
  461. item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None
  462. video_info = {}
  463. for key in ('id', 'title', 'author', 'duration', 'author_id'):
  464. try:
  465. video_info[key] = item[key]
  466. except KeyError:
  467. video_info[key] = None
  468. item['video_info'] = json.dumps(video_info)
  469. elif item['type'] == 'playlist' and item['playlist_type'] == 'radio':
  470. item['url'] = concat_or_none(
  471. URL_ORIGIN,
  472. '/watch?v=', item['first_video_id'],
  473. '&list=', item['id']
  474. )
  475. elif item['type'] == 'playlist':
  476. item['url'] = concat_or_none(URL_ORIGIN, '/playlist?list=', item['id'])
  477. elif item['type'] == 'channel':
  478. item['url'] = concat_or_none(URL_ORIGIN, "/channel/", item['id'])
  479. if item.get('author_id') and 'author_url' not in item:
  480. item['author_url'] = URL_ORIGIN + '/channel/' + item['author_id']
  481. def check_gevent_exceptions(*tasks):
  482. for task in tasks:
  483. if task.exception:
  484. raise task.exception
  485. # https://stackoverflow.com/a/62888
  486. replacement_map = collections.OrderedDict([
  487. ('<', '_'),
  488. ('>', '_'),
  489. (': ', ' - '),
  490. (':', '-'),
  491. ('"', "'"),
  492. ('/', '_'),
  493. ('\\', '_'),
  494. ('|', '-'),
  495. ('?', ''),
  496. ('*', '_'),
  497. ('\t', ' '),
  498. ])
  499. DOS_names = {'con', 'prn', 'aux', 'nul', 'com0', 'com1', 'com2', 'com3',
  500. 'com4', 'com5', 'com6', 'com7', 'com8', 'com9', 'lpt0',
  501. 'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7',
  502. 'lpt8', 'lpt9'}
  503. def to_valid_filename(name):
  504. '''Changes the name so it's valid on Windows, Linux, and Mac'''
  505. # See https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file
  506. # for Windows specs
  507. # Additional recommendations for Linux:
  508. # https://dwheeler.com/essays/fixing-unix-linux-filenames.html#standards
  509. # remove control characters
  510. name = re.sub(r'[\x00-\x1f]', '_', name)
  511. # reserved characters
  512. for reserved_char, replacement in replacement_map.items():
  513. name = name.replace(reserved_char, replacement)
  514. # check for all periods/spaces
  515. if all(c == '.' or c == ' ' for c in name):
  516. name = '_'*len(name)
  517. # remove trailing periods and spaces
  518. name = name.rstrip('. ')
  519. # check for reserved DOS names, such as nul or nul.txt
  520. base_ext_parts = name.rsplit('.', maxsplit=1)
  521. if base_ext_parts[0].lower() in DOS_names:
  522. base_ext_parts[0] += '_'
  523. name = '.'.join(base_ext_parts)
  524. # check for blank name
  525. if name == '':
  526. name = '_'
  527. # check if name begins with a hyphen, period, or space
  528. if name[0] in ('-', '.', ' '):
  529. name = '_' + name
  530. return name
# https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/youtube.py#L72
# Per-client request parameters, keyed by the client name that is passed to
# call_youtube_api. call_youtube_api reads INNERTUBE_API_KEY,
# INNERTUBE_CONTEXT (sent as the request's 'context') and, when present,
# INNERTUBE_HOST and the context's client 'userAgent'.
# NOTE(review): INNERTUBE_CONTEXT_CLIENT_NAME and REQUIRE_JS_PLAYER are not
# read anywhere in this part of the file - confirm where they are used.
INNERTUBE_CLIENTS = {
    'android-test-suite': {
        'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
        'INNERTUBE_CONTEXT': {
            'client': {
                'hl': 'en',
                'gl': 'US',
                'clientName': 'ANDROID_TESTSUITE',
                'clientVersion': '1.9',
                'osName': 'Android',
                'osVersion': '12',
                'androidSdkVersion': 31,
                'platform': 'MOBILE',
                'userAgent': 'com.google.android.youtube/1.9 (Linux; U; Android 12; US) gzip'
            },
            # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287
            #'thirdParty': {
            #    'embedUrl': 'https://google.com',  # Can be any valid URL
            #}
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
        'REQUIRE_JS_PLAYER': False,
    },

    'ios': {
        'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc',
        'INNERTUBE_CONTEXT': {
            'client': {
                'hl': 'en',
                'gl': 'US',
                'clientName': 'IOS',
                'clientVersion': '19.12.3',
                'deviceModel': 'iPhone14,3',
                'userAgent': 'com.google.ios.youtube/19.12.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)'
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
        'REQUIRE_JS_PLAYER': False
    },

    'android': {
        'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
        'INNERTUBE_CONTEXT': {
            'client': {
                'hl': 'en',
                'gl': 'US',
                'clientName': 'ANDROID',
                'clientVersion': '19.15.35',
                'osName': 'Android',
                'osVersion': '14',
                'androidSdkVersion': 34,
                'platform': 'MOBILE',
                'userAgent': 'com.google.android.youtube/19.15.35 (Linux; U; Android 14; en_US; Google Pixel 6 Pro) gzip'
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
        'REQUIRE_JS_PLAYER': False,
    },

    'android_music': {
        'INNERTUBE_API_KEY': 'AIzaSyAOghZGza2MQSZkY_zfZ370N-PUdXEo8AI',
        'INNERTUBE_CONTEXT': {
            'client': {
                'hl': 'en',
                'gl': 'US',
                'clientName': 'ANDROID_MUSIC',
                'clientVersion': '6.48.51',
                'osName': 'Android',
                'osVersion': '14',
                'androidSdkVersion': 34,
                'platform': 'MOBILE',
                'userAgent': 'com.google.android.apps.youtube.music/6.48.51 (Linux; U; Android 14; US) gzip'
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
        'REQUIRE_JS_PLAYER': False
    },

    # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option)
    # See: https://github.com/zerodytrash/YouTube-Internal-Clients
    'tv_embedded': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'hl': 'en',
                'gl': 'US',
                'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER',
                'clientVersion': '2.0',
                'clientScreen': 'EMBED',
            },
            # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287
            'thirdParty': {
                'embedUrl': 'https://google.com',  # Can be any valid URL
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 85,
        'REQUIRE_JS_PLAYER': True,
    },

    # No 'hl'/'gl' here; the user agent is the module's desktop browser one.
    'web': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20240327.00.00',
                'userAgent': desktop_user_agent,
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 1
    },
}
  638. def call_youtube_api(client, api, data):
  639. client_params = INNERTUBE_CLIENTS[client]
  640. context = client_params['INNERTUBE_CONTEXT']
  641. key = client_params['INNERTUBE_API_KEY']
  642. host = client_params.get('INNERTUBE_HOST') or 'www.youtube.com'
  643. user_agent = context['client'].get('userAgent') or mobile_user_agent
  644. url = 'https://' + host + '/youtubei/v1/' + api + '?key=' + key
  645. data['context'] = context
  646. data = json.dumps(data)
  647. headers = (('Content-Type', 'application/json'),('User-Agent', user_agent))
  648. response = fetch_url(
  649. url, data=data, headers=headers,
  650. debug_name='youtubei_' + api + '_' + client,
  651. report_text='Fetched ' + client + ' youtubei ' + api
  652. ).decode('utf-8')
  653. return response
  654. def strip_non_ascii(string):
  655. ''' Returns the string without non ASCII characters'''
  656. stripped = (c for c in string if 0 < ord(c) < 127)
  657. return ''.join(stripped)
  658. def time_utc_isoformat(string):
  659. t = datetime.strptime(string, '%Y-%m-%d')
  660. t = t.astimezone().isoformat()
  661. return t