util.py 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784
  1. from datetime import datetime
  2. import settings
  3. import socks
  4. import sockshandler
  5. import gzip
  6. try:
  7. import brotli
  8. have_brotli = True
  9. except ImportError:
  10. have_brotli = False
  11. import urllib.parse
  12. import re
  13. import time
  14. import os
  15. import json
  16. import gevent
  17. import gevent.queue
  18. import gevent.lock
  19. import collections
  20. import stem
  21. import stem.control
  22. import traceback
  23. # The trouble with the requests library: It ships its own certificate bundle via certifi
  24. # instead of using the system certificate store, meaning self-signed certificates
  25. # configured by the user will not work. Some draconian networks block TLS unless a corporate
  26. # certificate is installed on the system. Additionally, some users install a self signed cert
  27. # in order to use programs to modify or monitor requests made by programs on the system.
  28. # Finally, certificates expire and need to be updated, or are sometimes revoked. Sometimes
  29. # certificate authorities go rogue and need to be untrusted. Since we are going through Tor exit nodes,
  30. # this becomes all the more important. A rogue CA could issue a fake certificate for accounts.google.com, and a
  31. # malicious exit node could use this to decrypt traffic when logging in and retrieve passwords. Examples:
  32. # https://www.engadget.com/2015/10/29/google-warns-symantec-over-certificates/
  33. # https://nakedsecurity.sophos.com/2013/12/09/serious-security-google-finds-fake-but-trusted-ssl-certificates-for-its-domains-made-in-france/
  34. # In the requests documentation it says:
  35. # "Before version 2.16, Requests bundled a set of root CAs that it trusted, sourced from the Mozilla trust store.
  36. # The certificates were only updated once for each Requests version. When certifi was not installed,
  37. # this led to extremely out-of-date certificate bundles when using significantly older versions of Requests.
  38. # For the sake of security we recommend upgrading certifi frequently!"
  39. # (http://docs.python-requests.org/en/master/user/advanced/#ca-certificates)
  40. # Expecting users to remember to manually update certifi on Linux isn't reasonable in my view.
  41. # On windows, this is even worse since I am distributing all dependencies. This program is not
  42. # updated frequently, and using requests would lead to outdated certificates. Certificates
  43. # should be updated with OS updates, instead of thousands of developers of different programs
  44. # being expected to do this correctly 100% of the time.
  45. # There is hope that this might be fixed eventually:
  46. # https://github.com/kennethreitz/requests/issues/2966
  47. # Until then, I will use a mix of urllib3 and urllib.
  48. import urllib3
  49. import urllib3.contrib.socks
  50. URL_ORIGIN = "/https://www.youtube.com"
  51. connection_pool = urllib3.PoolManager(cert_reqs='CERT_REQUIRED')
  52. class TorManager:
  53. MAX_TRIES = 3
  54. # Remember the 7-sec wait times, so make cooldown be two of those
  55. # (otherwise it will retry forever if 429s never end)
  56. COOLDOWN_TIME = 14
  57. def __init__(self):
  58. self.old_tor_connection_pool = None
  59. self.tor_connection_pool = urllib3.contrib.socks.SOCKSProxyManager(
  60. 'socks5h://127.0.0.1:' + str(settings.tor_port) + '/',
  61. cert_reqs='CERT_REQUIRED')
  62. self.tor_pool_refresh_time = time.monotonic()
  63. settings.add_setting_changed_hook(
  64. 'tor_port',
  65. lambda old_val, new_val: self.refresh_tor_connection_pool(),
  66. )
  67. self.new_identity_lock = gevent.lock.BoundedSemaphore(1)
  68. self.last_new_identity_time = time.monotonic() - 20
  69. self.try_num = 1
  70. def refresh_tor_connection_pool(self):
  71. self.tor_connection_pool.clear()
  72. # Keep a reference for 5 min to avoid it getting garbage collected
  73. # while sockets still in use
  74. self.old_tor_connection_pool = self.tor_connection_pool
  75. self.tor_connection_pool = urllib3.contrib.socks.SOCKSProxyManager(
  76. 'socks5h://127.0.0.1:' + str(settings.tor_port) + '/',
  77. cert_reqs='CERT_REQUIRED')
  78. self.tor_pool_refresh_time = time.monotonic()
  79. def get_tor_connection_pool(self):
  80. # Tor changes circuits after 10 minutes:
  81. # https://tor.stackexchange.com/questions/262/for-how-long-does-a-circuit-stay-alive
  82. current_time = time.monotonic()
  83. # close pool after 5 minutes
  84. if current_time - self.tor_pool_refresh_time > 300:
  85. self.refresh_tor_connection_pool()
  86. return self.tor_connection_pool
  87. def new_identity(self, time_failed_request_started):
  88. '''return error, or None if no error and the identity is fresh'''
  89. # The overall pattern at maximum (always returning 429) will be
  90. # R N (0) R N (6) R N (6) R | (12) R N (0) R N (6) ...
  91. # where R is a request, N is a new identity, (x) is a wait time of
  92. # x sec, and | is where we give up and display an error to the user.
  93. print('new_identity: new_identity called')
  94. # blocks if another greenlet currently has the lock
  95. self.new_identity_lock.acquire()
  96. print('new_identity: New identity lock acquired')
  97. try:
  98. # This was caused by a request that failed within a previous,
  99. # stale identity
  100. if time_failed_request_started <= self.last_new_identity_time:
  101. print('new_identity: Cancelling; request was from stale identity')
  102. return None
  103. delta = time.monotonic() - self.last_new_identity_time
  104. if delta < self.COOLDOWN_TIME and self.try_num == 1:
  105. err = ('Retried with new circuit %d times (max) within last '
  106. '%d seconds.' % (self.MAX_TRIES, self.COOLDOWN_TIME))
  107. print('new_identity:', err)
  108. return err
  109. elif delta >= self.COOLDOWN_TIME:
  110. self.try_num = 1
  111. try:
  112. port = settings.tor_control_port
  113. with stem.control.Controller.from_port(port=port) as controller:
  114. controller.authenticate('')
  115. print('new_identity: Getting new identity')
  116. controller.signal(stem.Signal.NEWNYM)
  117. print('new_identity: NEWNYM signal sent')
  118. self.last_new_identity_time = time.monotonic()
  119. self.refresh_tor_connection_pool()
  120. except stem.SocketError:
  121. traceback.print_exc()
  122. return 'Failed to connect to Tor control port.'
  123. finally:
  124. original_try_num = self.try_num
  125. self.try_num += 1
  126. if self.try_num > self.MAX_TRIES:
  127. self.try_num = 1
  128. # If we do the request right after second new identity it won't
  129. # be a new IP, based on experiments.
  130. # Not necessary after first new identity
  131. if original_try_num > 1:
  132. print('Sleeping for 7 seconds before retrying request')
  133. time.sleep(7) # experimentally determined minimum
  134. return None
  135. finally:
  136. self.new_identity_lock.release()
  137. tor_manager = TorManager()
  138. def get_pool(use_tor):
  139. if not use_tor:
  140. return connection_pool
  141. return tor_manager.get_tor_connection_pool()
  142. class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
  143. '''Separate cookiejars for receiving and sending'''
  144. def __init__(self, cookiejar_send=None, cookiejar_receive=None):
  145. import http.cookiejar
  146. self.cookiejar_send = cookiejar_send
  147. self.cookiejar_receive = cookiejar_receive
  148. def http_request(self, request):
  149. if self.cookiejar_send is not None:
  150. self.cookiejar_send.add_cookie_header(request)
  151. return request
  152. def http_response(self, request, response):
  153. if self.cookiejar_receive is not None:
  154. self.cookiejar_receive.extract_cookies(response, request)
  155. return response
  156. https_request = http_request
  157. https_response = http_response
  158. class FetchError(Exception):
  159. def __init__(self, code, reason='', ip=None, error_message=None):
  160. if error_message:
  161. string = code + ' ' + reason + ': ' + error_message
  162. else:
  163. string = 'HTTP error during request: ' + code + ' ' + reason
  164. Exception.__init__(self, string)
  165. self.code = code
  166. self.reason = reason
  167. self.ip = ip
  168. self.error_message = error_message
  169. def decode_content(content, encoding_header):
  170. encodings = encoding_header.replace(' ', '').split(',')
  171. for encoding in reversed(encodings):
  172. if encoding == 'identity':
  173. continue
  174. if encoding == 'br':
  175. content = brotli.decompress(content)
  176. elif encoding == 'gzip':
  177. content = gzip.decompress(content)
  178. return content
  179. def fetch_url_response(url, headers=(), timeout=15, data=None,
  180. cookiejar_send=None, cookiejar_receive=None,
  181. use_tor=True, max_redirects=None):
  182. '''
  183. returns response, cleanup_function
  184. When cookiejar_send is set to a CookieJar object,
  185. those cookies will be sent in the request (but cookies in response will not be merged into it)
  186. When cookiejar_receive is set to a CookieJar object,
  187. cookies received in the response will be merged into the object (nothing will be sent from it)
  188. When both are set to the same object, cookies will be sent from the object,
  189. and response cookies will be merged into it.
  190. '''
  191. headers = dict(headers) # Note: Calling dict() on a dict will make a copy
  192. if have_brotli:
  193. headers['Accept-Encoding'] = 'gzip, br'
  194. else:
  195. headers['Accept-Encoding'] = 'gzip'
  196. # prevent python version being leaked by urllib if User-Agent isn't provided
  197. # (urllib will use ex. Python-urllib/3.6 otherwise)
  198. if 'User-Agent' not in headers and 'user-agent' not in headers and 'User-agent' not in headers:
  199. headers['User-Agent'] = 'Python-urllib'
  200. method = "GET"
  201. if data is not None:
  202. method = "POST"
  203. if isinstance(data, str):
  204. data = data.encode('utf-8')
  205. elif not isinstance(data, bytes):
  206. data = urllib.parse.urlencode(data).encode('utf-8')
  207. if cookiejar_send is not None or cookiejar_receive is not None: # Use urllib
  208. req = urllib.request.Request(url, data=data, headers=headers)
  209. cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)
  210. if use_tor and settings.route_tor:
  211. opener = urllib.request.build_opener(sockshandler.SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", settings.tor_port), cookie_processor)
  212. else:
  213. opener = urllib.request.build_opener(cookie_processor)
  214. response = opener.open(req, timeout=timeout)
  215. cleanup_func = (lambda r: None)
  216. else: # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them.
  217. # default: Retry.DEFAULT = Retry(3)
  218. # (in connectionpool.py in urllib3)
  219. # According to the documentation for urlopen, a redirect counts as a
  220. # retry. So there are 3 redirects max by default.
  221. if max_redirects:
  222. retries = urllib3.Retry(3+max_redirects, redirect=max_redirects, raise_on_redirect=False)
  223. else:
  224. retries = urllib3.Retry(3, raise_on_redirect=False)
  225. pool = get_pool(use_tor and settings.route_tor)
  226. try:
  227. response = pool.request(method, url, headers=headers, body=data,
  228. timeout=timeout, preload_content=False,
  229. decode_content=False, retries=retries)
  230. response.retries = retries
  231. except urllib3.exceptions.MaxRetryError as e:
  232. exception_cause = e.__context__.__context__
  233. if (isinstance(exception_cause, socks.ProxyConnectionError)
  234. and settings.route_tor):
  235. msg = ('Failed to connect to Tor. Check that Tor is open and '
  236. 'that your internet connection is working.\n\n'
  237. + str(e))
  238. raise FetchError('502', reason='Bad Gateway',
  239. error_message=msg)
  240. elif isinstance(e.__context__,
  241. urllib3.exceptions.NewConnectionError):
  242. msg = 'Failed to establish a connection.\n\n' + str(e)
  243. raise FetchError(
  244. '502', reason='Bad Gateway',
  245. error_message=msg)
  246. else:
  247. raise
  248. cleanup_func = (lambda r: r.release_conn())
  249. return response, cleanup_func
  250. def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
  251. cookiejar_send=None, cookiejar_receive=None, use_tor=True,
  252. debug_name=None):
  253. while True:
  254. start_time = time.monotonic()
  255. response, cleanup_func = fetch_url_response(
  256. url, headers, timeout=timeout, data=data,
  257. cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive,
  258. use_tor=use_tor)
  259. response_time = time.monotonic()
  260. content = response.read()
  261. read_finish = time.monotonic()
  262. cleanup_func(response) # release_connection for urllib3
  263. content = decode_content(
  264. content,
  265. response.getheader('Content-Encoding', default='identity'))
  266. if (settings.debugging_save_responses
  267. and debug_name is not None and content):
  268. save_dir = os.path.join(settings.data_dir, 'debug')
  269. if not os.path.exists(save_dir):
  270. os.makedirs(save_dir)
  271. with open(os.path.join(save_dir, debug_name), 'wb') as f:
  272. f.write(content)
  273. if response.status == 429 or (
  274. response.status == 302 and (response.getheader('Location') == url
  275. or response.getheader('Location').startswith(
  276. 'https://www.google.com/sorry/index'
  277. )
  278. )
  279. ):
  280. print(response.status, response.reason, response.headers)
  281. ip = re.search(
  282. br'IP address: ((?:[\da-f]*:)+[\da-f]+|(?:\d+\.)+\d+)',
  283. content)
  284. ip = ip.group(1).decode('ascii') if ip else None
  285. if not ip:
  286. ip = re.search(r'IP=((?:\d+\.)+\d+)',
  287. response.getheader('Set-Cookie') or '')
  288. ip = ip.group(1) if ip else None
  289. # don't get new identity if we're not using Tor
  290. if not use_tor:
  291. raise FetchError('429', reason=response.reason, ip=ip)
  292. print('Error: YouTube blocked the request because the Tor exit node is overutilized. Exit node IP address: %s' % ip)
  293. # get new identity
  294. error = tor_manager.new_identity(start_time)
  295. if error:
  296. raise FetchError(
  297. '429', reason=response.reason, ip=ip,
  298. error_message='Automatic circuit change: ' + error)
  299. else:
  300. continue # retry now that we have new identity
  301. elif response.status >= 400:
  302. raise FetchError(str(response.status), reason=response.reason,
  303. ip=None)
  304. break
  305. if report_text:
  306. print(report_text, ' Latency:', round(response_time - start_time, 3), ' Read time:', round(read_finish - response_time,3))
  307. return content
  308. def head(url, use_tor=False, report_text=None, max_redirects=10):
  309. pool = get_pool(use_tor and settings.route_tor)
  310. start_time = time.monotonic()
  311. # default: Retry.DEFAULT = Retry(3)
  312. # (in connectionpool.py in urllib3)
  313. # According to the documentation for urlopen, a redirect counts as a retry
  314. # So there are 3 redirects max by default. Let's change that
  315. # to 10 since googlevideo redirects a lot.
  316. retries = urllib3.Retry(
  317. 3+max_redirects,
  318. redirect=max_redirects,
  319. raise_on_redirect=False)
  320. headers = {'User-Agent': 'Python-urllib'}
  321. response = pool.request('HEAD', url, headers=headers, retries=retries)
  322. if report_text:
  323. print(
  324. report_text,
  325. ' Latency:',
  326. round(time.monotonic() - start_time, 3))
  327. return response
  328. mobile_user_agent = 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.80 Mobile Safari/537.36'
  329. mobile_ua = (('User-Agent', mobile_user_agent),)
  330. desktop_user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:124.0) Gecko/20100101 Firefox/124.0'
  331. desktop_ua = (('User-Agent', desktop_user_agent),)
  332. json_header = (('Content-Type', 'application/json'),)
  333. desktop_xhr_headers = (
  334. ('Accept', '*/*'),
  335. ('Accept-Language', 'en-US,en;q=0.5'),
  336. ('X-YouTube-Client-Name', '1'),
  337. ('X-YouTube-Client-Version', '2.20240327.00.00'),
  338. ) + desktop_ua
  339. mobile_xhr_headers = (
  340. ('Accept', '*/*'),
  341. ('Accept-Language', 'en-US,en;q=0.5'),
  342. ('X-YouTube-Client-Name', '1'),
  343. ('X-YouTube-Client-Version', '2.20240328.08.00'),
  344. ) + mobile_ua
  345. class RateLimitedQueue(gevent.queue.Queue):
  346. ''' Does initial_burst (def. 30) at first, then alternates between waiting waiting_period (def. 5) seconds and doing subsequent_bursts (def. 10) queries. After 5 seconds with nothing left in the queue, resets rate limiting. '''
  347. def __init__(self, initial_burst=30, waiting_period=5, subsequent_bursts=10):
  348. self.initial_burst = initial_burst
  349. self.waiting_period = waiting_period
  350. self.subsequent_bursts = subsequent_bursts
  351. self.count_since_last_wait = 0
  352. self.surpassed_initial = False
  353. self.lock = gevent.lock.BoundedSemaphore(1)
  354. self.currently_empty = False
  355. self.empty_start = 0
  356. gevent.queue.Queue.__init__(self)
  357. def get(self):
  358. self.lock.acquire() # blocks if another greenlet currently has the lock
  359. if self.count_since_last_wait >= self.subsequent_bursts and self.surpassed_initial:
  360. gevent.sleep(self.waiting_period)
  361. self.count_since_last_wait = 0
  362. elif self.count_since_last_wait >= self.initial_burst and not self.surpassed_initial:
  363. self.surpassed_initial = True
  364. gevent.sleep(self.waiting_period)
  365. self.count_since_last_wait = 0
  366. self.count_since_last_wait += 1
  367. if not self.currently_empty and self.empty():
  368. self.currently_empty = True
  369. self.empty_start = time.monotonic()
  370. item = gevent.queue.Queue.get(self) # blocks when nothing left
  371. if self.currently_empty:
  372. if time.monotonic() - self.empty_start >= self.waiting_period:
  373. self.count_since_last_wait = 0
  374. self.surpassed_initial = False
  375. self.currently_empty = False
  376. self.lock.release()
  377. return item
  378. def download_thumbnail(save_directory, video_id):
  379. url = f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"
  380. save_location = os.path.join(save_directory, video_id + ".jpg")
  381. try:
  382. thumbnail = fetch_url(url, report_text="Saved thumbnail: " + video_id)
  383. except urllib.error.HTTPError as e:
  384. print("Failed to download thumbnail for " + video_id + ": " + str(e))
  385. return False
  386. try:
  387. f = open(save_location, 'wb')
  388. except FileNotFoundError:
  389. os.makedirs(save_directory, exist_ok=True)
  390. f = open(save_location, 'wb')
  391. f.write(thumbnail)
  392. f.close()
  393. return True
  394. def download_thumbnails(save_directory, ids):
  395. if not isinstance(ids, (list, tuple)):
  396. ids = list(ids)
  397. # only do 5 at a time
  398. # do the n where n is divisible by 5
  399. i = -1
  400. for i in range(0, int(len(ids)/5) - 1 ):
  401. gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i*5, i*5 + 5)])
  402. # do the remainders (< 5)
  403. gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i*5 + 5, len(ids))])
  404. def dict_add(*dicts):
  405. for dictionary in dicts[1:]:
  406. dicts[0].update(dictionary)
  407. return dicts[0]
  408. def video_id(url):
  409. url_parts = urllib.parse.urlparse(url)
  410. return urllib.parse.parse_qs(url_parts.query)['v'][0]
  411. # default, sddefault, mqdefault, hqdefault, hq720
  412. def get_thumbnail_url(video_id):
  413. return f"{settings.img_prefix}https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"
  414. def seconds_to_timestamp(seconds):
  415. seconds = int(seconds)
  416. hours, seconds = divmod(seconds, 3600)
  417. minutes, seconds = divmod(seconds, 60)
  418. if hours != 0:
  419. timestamp = str(hours) + ":"
  420. timestamp += str(minutes).zfill(2) # zfill pads with zeros
  421. else:
  422. timestamp = str(minutes)
  423. timestamp += ":" + str(seconds).zfill(2)
  424. return timestamp
  425. def update_query_string(query_string, items):
  426. parameters = urllib.parse.parse_qs(query_string)
  427. parameters.update(items)
  428. return urllib.parse.urlencode(parameters, doseq=True)
  429. YOUTUBE_DOMAINS = ('youtube.com', 'youtu.be', 'youtube-nocookie.com')
  430. YOUTUBE_URL_RE_STR = r'https?://(?:[a-zA-Z0-9_-]*\.)?(?:'
  431. YOUTUBE_URL_RE_STR += r'|'.join(map(re.escape, YOUTUBE_DOMAINS))
  432. YOUTUBE_URL_RE_STR += r')(?:/[^"]*)?'
  433. YOUTUBE_URL_RE = re.compile(YOUTUBE_URL_RE_STR)
  434. def prefix_url(url):
  435. if url is None:
  436. return None
  437. url = url.lstrip('/') # some urls have // before them, which has a special meaning
  438. return '/' + url
  439. def left_remove(string, substring):
  440. '''removes substring from the start of string, if present'''
  441. if string.startswith(substring):
  442. return string[len(substring):]
  443. return string
  444. def concat_or_none(*strings):
  445. '''Concatenates strings. Returns None if any of the arguments are None'''
  446. result = ''
  447. for string in strings:
  448. if string is None:
  449. return None
  450. result += string
  451. return result
  452. def prefix_urls(item):
  453. if settings.proxy_images:
  454. try:
  455. item['thumbnail'] = prefix_url(item['thumbnail'])
  456. except KeyError:
  457. pass
  458. try:
  459. item['author_url'] = prefix_url(item['author_url'])
  460. except KeyError:
  461. pass
  462. def add_extra_html_info(item):
  463. if item['type'] == 'video':
  464. item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None
  465. video_info = {}
  466. for key in ('id', 'title', 'author', 'duration', 'author_id'):
  467. try:
  468. video_info[key] = item[key]
  469. except KeyError:
  470. video_info[key] = None
  471. item['video_info'] = json.dumps(video_info)
  472. elif item['type'] == 'playlist' and item['playlist_type'] == 'radio':
  473. item['url'] = concat_or_none(
  474. URL_ORIGIN,
  475. '/watch?v=', item['first_video_id'],
  476. '&list=', item['id']
  477. )
  478. elif item['type'] == 'playlist':
  479. item['url'] = concat_or_none(URL_ORIGIN, '/playlist?list=', item['id'])
  480. elif item['type'] == 'channel':
  481. item['url'] = concat_or_none(URL_ORIGIN, "/channel/", item['id'])
  482. if item.get('author_id') and 'author_url' not in item:
  483. item['author_url'] = URL_ORIGIN + '/channel/' + item['author_id']
  484. def check_gevent_exceptions(*tasks):
  485. for task in tasks:
  486. if task.exception:
  487. raise task.exception
  488. # https://stackoverflow.com/a/62888
  489. replacement_map = collections.OrderedDict([
  490. ('<', '_'),
  491. ('>', '_'),
  492. (': ', ' - '),
  493. (':', '-'),
  494. ('"', "'"),
  495. ('/', '_'),
  496. ('\\', '_'),
  497. ('|', '-'),
  498. ('?', ''),
  499. ('*', '_'),
  500. ('\t', ' '),
  501. ])
  502. DOS_names = {'con', 'prn', 'aux', 'nul', 'com0', 'com1', 'com2', 'com3',
  503. 'com4', 'com5', 'com6', 'com7', 'com8', 'com9', 'lpt0',
  504. 'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7',
  505. 'lpt8', 'lpt9'}
  506. def to_valid_filename(name):
  507. '''Changes the name so it's valid on Windows, Linux, and Mac'''
  508. # See https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file
  509. # for Windows specs
  510. # Additional recommendations for Linux:
  511. # https://dwheeler.com/essays/fixing-unix-linux-filenames.html#standards
  512. # remove control characters
  513. name = re.sub(r'[\x00-\x1f]', '_', name)
  514. # reserved characters
  515. for reserved_char, replacement in replacement_map.items():
  516. name = name.replace(reserved_char, replacement)
  517. # check for all periods/spaces
  518. if all(c == '.' or c == ' ' for c in name):
  519. name = '_'*len(name)
  520. # remove trailing periods and spaces
  521. name = name.rstrip('. ')
  522. # check for reserved DOS names, such as nul or nul.txt
  523. base_ext_parts = name.rsplit('.', maxsplit=1)
  524. if base_ext_parts[0].lower() in DOS_names:
  525. base_ext_parts[0] += '_'
  526. name = '.'.join(base_ext_parts)
  527. # check for blank name
  528. if name == '':
  529. name = '_'
  530. # check if name begins with a hyphen, period, or space
  531. if name[0] in ('-', '.', ' '):
  532. name = '_' + name
  533. return name
  534. # https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/youtube.py#L72
  535. INNERTUBE_CLIENTS = {
  536. 'web_creator': {
  537. 'INNERTUBE_API_KEY': 'AIzaSyBUPetSUmoZL-OhlxA7wSac5XinrygCqMo',
  538. 'INNERTUBE_CONTEXT': {
  539. 'client': {
  540. 'clientName': 'WEB_CREATOR',
  541. 'clientVersion': '1.20240723.03.00',
  542. },
  543. },
  544. 'INNERTUBE_CONTEXT_CLIENT_NAME': 62,
  545. },
  546. 'android': {
  547. 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
  548. 'INNERTUBE_CONTEXT': {
  549. 'client': {
  550. 'hl': 'en',
  551. 'gl': 'US',
  552. 'clientName': 'ANDROID',
  553. 'clientVersion': '19.15.35',
  554. 'osName': 'Android',
  555. 'osVersion': '14',
  556. 'androidSdkVersion': 34,
  557. 'platform': 'MOBILE',
  558. 'userAgent': 'com.google.android.youtube/19.15.35 (Linux; U; Android 14; en_US; Google Pixel 6 Pro) gzip'
  559. }
  560. },
  561. 'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
  562. 'REQUIRE_JS_PLAYER': False,
  563. },
  564. 'ios': {
  565. 'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc',
  566. 'INNERTUBE_CONTEXT': {
  567. 'client': {
  568. 'hl': 'en',
  569. 'gl': 'US',
  570. 'clientName': 'IOS',
  571. 'clientVersion': '19.29.1',
  572. 'deviceMake': 'Apple',
  573. 'deviceModel': 'iPhone16,2',
  574. 'userAgent': 'com.google.ios.youtube/19.29.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X;)',
  575. 'osName': 'iPhone',
  576. 'osVersion': '17.5.1.21F90',
  577. }
  578. },
  579. 'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
  580. 'REQUIRE_JS_PLAYER': False
  581. },
  582. # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option)
  583. # See: https://github.com/zerodytrash/YouTube-Internal-Clients
  584. 'tv_embedded': {
  585. 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
  586. 'INNERTUBE_CONTEXT': {
  587. 'client': {
  588. 'hl': 'en',
  589. 'gl': 'US',
  590. 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER',
  591. 'clientVersion': '2.0',
  592. 'clientScreen': 'EMBED',
  593. },
  594. # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287
  595. 'thirdParty': {
  596. 'embedUrl': 'https://google.com', # Can be any valid URL
  597. }
  598. },
  599. 'INNERTUBE_CONTEXT_CLIENT_NAME': 85,
  600. 'REQUIRE_JS_PLAYER': True,
  601. },
  602. 'web': {
  603. 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
  604. 'INNERTUBE_CONTEXT': {
  605. 'client': {
  606. 'clientName': 'WEB',
  607. 'clientVersion': '2.20240327.00.00',
  608. 'userAgent': desktop_user_agent,
  609. }
  610. },
  611. 'INNERTUBE_CONTEXT_CLIENT_NAME': 1
  612. },
  613. }
  614. def call_youtube_api(client, api, data):
  615. client_params = INNERTUBE_CLIENTS[client]
  616. context = client_params['INNERTUBE_CONTEXT']
  617. key = client_params['INNERTUBE_API_KEY']
  618. host = client_params.get('INNERTUBE_HOST') or 'www.youtube.com'
  619. user_agent = context['client'].get('userAgent') or mobile_user_agent
  620. url = 'https://' + host + '/youtubei/v1/' + api + '?key=' + key
  621. data['context'] = context
  622. data = json.dumps(data)
  623. headers = (('Content-Type', 'application/json'),('User-Agent', user_agent))
  624. response = fetch_url(
  625. url, data=data, headers=headers,
  626. debug_name='youtubei_' + api + '_' + client,
  627. report_text='Fetched ' + client + ' youtubei ' + api
  628. ).decode('utf-8')
  629. return response
  630. def strip_non_ascii(string):
  631. ''' Returns the string without non ASCII characters'''
  632. stripped = (c for c in string if 0 < ord(c) < 127)
  633. return ''.join(stripped)
  634. def time_utc_isoformat(string):
  635. t = datetime.strptime(string, '%Y-%m-%d')
  636. t = t.astimezone().isoformat()
  637. return t