util.py 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851
  1. from datetime import datetime
  2. import settings
  3. import socks
  4. import sockshandler
  5. import gzip
  6. try:
  7. import brotli
  8. have_brotli = True
  9. except ImportError:
  10. have_brotli = False
  11. import urllib.parse
  12. import re
  13. import time
  14. import os
  15. import json
  16. import gevent
  17. import gevent.queue
  18. import gevent.lock
  19. import collections
  20. import stem
  21. import stem.control
  22. import traceback
  23. # The trouble with the requests library: It ships its own certificate bundle via certifi
  24. # instead of using the system certificate store, meaning self-signed certificates
  25. # configured by the user will not work. Some draconian networks block TLS unless a corporate
  26. # certificate is installed on the system. Additionally, some users install a self signed cert
  27. # in order to use programs to modify or monitor requests made by programs on the system.
  28. # Finally, certificates expire and need to be updated, or are sometimes revoked. Sometimes
  29. # certificate authorities go rogue and need to be untrusted. Since we are going through Tor exit nodes,
  30. # this becomes all the more important. A rogue CA could issue a fake certificate for accounts.google.com, and a
  31. # malicious exit node could use this to decrypt traffic when logging in and retrieve passwords. Examples:
  32. # https://www.engadget.com/2015/10/29/google-warns-symantec-over-certificates/
  33. # https://nakedsecurity.sophos.com/2013/12/09/serious-security-google-finds-fake-but-trusted-ssl-certificates-for-its-domains-made-in-france/
  34. # In the requests documentation it says:
  35. # "Before version 2.16, Requests bundled a set of root CAs that it trusted, sourced from the Mozilla trust store.
  36. # The certificates were only updated once for each Requests version. When certifi was not installed,
  37. # this led to extremely out-of-date certificate bundles when using significantly older versions of Requests.
  38. # For the sake of security we recommend upgrading certifi frequently!"
  39. # (http://docs.python-requests.org/en/master/user/advanced/#ca-certificates)
  40. # Expecting users to remember to manually update certifi on Linux isn't reasonable in my view.
  41. # On windows, this is even worse since I am distributing all dependencies. This program is not
  42. # updated frequently, and using requests would lead to outdated certificates. Certificates
  43. # should be updated with OS updates, instead of thousands of developers of different programs
  44. # being expected to do this correctly 100% of the time.
  45. # There is hope that this might be fixed eventually:
  46. # https://github.com/kennethreitz/requests/issues/2966
  47. # Until then, I will use a mix of urllib3 and urllib.
  48. import urllib3
  49. import urllib3.contrib.socks
# Prefix used when building links to YouTube pages (see add_extra_html_info).
URL_ORIGIN = "/https://www.youtube.com"
# urllib3 pool returned by get_pool() for requests that do not go through Tor.
# TLS certificates are always verified.
connection_pool = urllib3.PoolManager(cert_reqs='CERT_REQUIRED')
class TorManager:
    '''Owns the SOCKS connection pool used for Tor requests and asks the
    Tor control port for a new identity when requests get blocked.'''
    # Number of new-identity attempts before try_num wraps back to 1
    MAX_TRIES = 3
    # Remember the 7-sec wait times, so make cooldown be two of those
    # (otherwise it will retry forever if 429s never end)
    COOLDOWN_TIME = 14

    def __init__(self):
        '''Create the initial Tor pool and register a hook that rebuilds it
        whenever the 'tor_port' setting changes.'''
        self.old_tor_connection_pool = None
        self.tor_connection_pool = urllib3.contrib.socks.SOCKSProxyManager(
            'socks5h://127.0.0.1:' + str(settings.tor_port) + '/',
            cert_reqs='CERT_REQUIRED')
        self.tor_pool_refresh_time = time.monotonic()
        settings.add_setting_changed_hook(
            'tor_port',
            lambda old_val, new_val: self.refresh_tor_connection_pool(),
        )
        # Serializes new_identity() calls between greenlets
        self.new_identity_lock = gevent.lock.BoundedSemaphore(1)
        # Start 20 s in the past, i.e. already outside COOLDOWN_TIME
        self.last_new_identity_time = time.monotonic() - 20
        self.try_num = 1

    def refresh_tor_connection_pool(self):
        '''Replace the Tor pool with a fresh one pointing at the current
        settings.tor_port and record the refresh time.'''
        self.tor_connection_pool.clear()
        # Keep a reference for 5 min to avoid it getting garbage collected
        # while sockets still in use
        self.old_tor_connection_pool = self.tor_connection_pool
        self.tor_connection_pool = urllib3.contrib.socks.SOCKSProxyManager(
            'socks5h://127.0.0.1:' + str(settings.tor_port) + '/',
            cert_reqs='CERT_REQUIRED')
        self.tor_pool_refresh_time = time.monotonic()

    def get_tor_connection_pool(self):
        '''Return the current Tor pool, rebuilding it first if it is more
        than 300 seconds old.'''
        # Tor changes circuits after 10 minutes:
        # https://tor.stackexchange.com/questions/262/for-how-long-does-a-circuit-stay-alive
        current_time = time.monotonic()
        # close pool after 5 minutes
        if current_time - self.tor_pool_refresh_time > 300:
            self.refresh_tor_connection_pool()
        return self.tor_connection_pool

    def new_identity(self, time_failed_request_started):
        '''Request a new Tor identity after a blocked request.

        time_failed_request_started is the time.monotonic() value taken when
        the failed request began; it is compared against the time of the
        last identity change.

        Returns an error string, or None if no error and the identity is
        fresh (the caller may then retry its request).'''
        # The overall pattern at maximum (always returning 429) will be
        # R N (0) R N (6) R N (6) R | (12) R N (0) R N (6) ...
        # where R is a request, N is a new identity, (x) is a wait time of
        # x sec, and | is where we give up and display an error to the user.
        print('new_identity: new_identity called')
        # blocks if another greenlet currently has the lock
        self.new_identity_lock.acquire()
        print('new_identity: New identity lock acquired')
        try:
            # This was caused by a request that failed within a previous,
            # stale identity
            if time_failed_request_started <= self.last_new_identity_time:
                print('new_identity: Cancelling; request was from stale identity')
                return None
            delta = time.monotonic() - self.last_new_identity_time
            # try_num == 1 inside the cooldown window means the previous
            # cycle already used up MAX_TRIES, so give up
            if delta < self.COOLDOWN_TIME and self.try_num == 1:
                err = ('Retried with new circuit %d times (max) within last '
                       '%d seconds.' % (self.MAX_TRIES, self.COOLDOWN_TIME))
                print('new_identity:', err)
                return err
            elif delta >= self.COOLDOWN_TIME:
                self.try_num = 1
            try:
                port = settings.tor_control_port
                with stem.control.Controller.from_port(port=port) as controller:
                    controller.authenticate('')
                    print('new_identity: Getting new identity')
                    controller.signal(stem.Signal.NEWNYM)
                    print('new_identity: NEWNYM signal sent')
                    self.last_new_identity_time = time.monotonic()
                self.refresh_tor_connection_pool()
            except stem.SocketError:
                traceback.print_exc()
                return 'Failed to connect to Tor control port.'
            finally:
                # Runs on success and on failure: advance the attempt
                # counter, wrapping back to 1 after MAX_TRIES
                original_try_num = self.try_num
                self.try_num += 1
                if self.try_num > self.MAX_TRIES:
                    self.try_num = 1
            # If we do the request right after second new identity it won't
            # be a new IP, based on experiments.
            # Not necessary after first new identity
            if original_try_num > 1:
                print('Sleeping for 7 seconds before retrying request')
                time.sleep(7) # experimentally determined minimum
            return None
        finally:
            self.new_identity_lock.release()
# Module-wide TorManager instance used by get_pool() and fetch_url()
tor_manager = TorManager()
  138. def get_pool(use_tor):
  139. if not use_tor:
  140. return connection_pool
  141. return tor_manager.get_tor_connection_pool()
  142. class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
  143. '''Separate cookiejars for receiving and sending'''
  144. def __init__(self, cookiejar_send=None, cookiejar_receive=None):
  145. import http.cookiejar
  146. self.cookiejar_send = cookiejar_send
  147. self.cookiejar_receive = cookiejar_receive
  148. def http_request(self, request):
  149. if self.cookiejar_send is not None:
  150. self.cookiejar_send.add_cookie_header(request)
  151. return request
  152. def http_response(self, request, response):
  153. if self.cookiejar_receive is not None:
  154. self.cookiejar_receive.extract_cookies(response, request)
  155. return response
  156. https_request = http_request
  157. https_response = http_response
  158. class FetchError(Exception):
  159. def __init__(self, code, reason='', ip=None, error_message=None):
  160. if error_message:
  161. string = code + ' ' + reason + ': ' + error_message
  162. else:
  163. string = 'HTTP error during request: ' + code + ' ' + reason
  164. Exception.__init__(self, string)
  165. self.code = code
  166. self.reason = reason
  167. self.ip = ip
  168. self.error_message = error_message
  169. def decode_content(content, encoding_header):
  170. encodings = encoding_header.replace(' ', '').split(',')
  171. for encoding in reversed(encodings):
  172. if encoding == 'identity':
  173. continue
  174. if encoding == 'br':
  175. content = brotli.decompress(content)
  176. elif encoding == 'gzip':
  177. content = gzip.decompress(content)
  178. return content
  179. def fetch_url_response(url, headers=(), timeout=15, data=None,
  180. cookiejar_send=None, cookiejar_receive=None,
  181. use_tor=True, max_redirects=None):
  182. '''
  183. returns response, cleanup_function
  184. When cookiejar_send is set to a CookieJar object,
  185. those cookies will be sent in the request (but cookies in response will not be merged into it)
  186. When cookiejar_receive is set to a CookieJar object,
  187. cookies received in the response will be merged into the object (nothing will be sent from it)
  188. When both are set to the same object, cookies will be sent from the object,
  189. and response cookies will be merged into it.
  190. '''
  191. headers = dict(headers) # Note: Calling dict() on a dict will make a copy
  192. if have_brotli:
  193. headers['Accept-Encoding'] = 'gzip, br'
  194. else:
  195. headers['Accept-Encoding'] = 'gzip'
  196. # prevent python version being leaked by urllib if User-Agent isn't provided
  197. # (urllib will use ex. Python-urllib/3.6 otherwise)
  198. if 'User-Agent' not in headers and 'user-agent' not in headers and 'User-agent' not in headers:
  199. headers['User-Agent'] = 'Python-urllib'
  200. method = "GET"
  201. if data is not None:
  202. method = "POST"
  203. if isinstance(data, str):
  204. data = data.encode('utf-8')
  205. elif not isinstance(data, bytes):
  206. data = urllib.parse.urlencode(data).encode('utf-8')
  207. if cookiejar_send is not None or cookiejar_receive is not None: # Use urllib
  208. req = urllib.request.Request(url, data=data, headers=headers)
  209. cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)
  210. if use_tor and settings.route_tor:
  211. opener = urllib.request.build_opener(sockshandler.SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", settings.tor_port), cookie_processor)
  212. else:
  213. opener = urllib.request.build_opener(cookie_processor)
  214. response = opener.open(req, timeout=timeout)
  215. cleanup_func = (lambda r: None)
  216. else: # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them.
  217. # default: Retry.DEFAULT = Retry(3)
  218. # (in connectionpool.py in urllib3)
  219. # According to the documentation for urlopen, a redirect counts as a
  220. # retry. So there are 3 redirects max by default.
  221. if max_redirects:
  222. retries = urllib3.Retry(3+max_redirects, redirect=max_redirects, raise_on_redirect=False)
  223. else:
  224. retries = urllib3.Retry(3, raise_on_redirect=False)
  225. pool = get_pool(use_tor and settings.route_tor)
  226. try:
  227. response = pool.request(method, url, headers=headers, body=data,
  228. timeout=timeout, preload_content=False,
  229. decode_content=False, retries=retries)
  230. response.retries = retries
  231. except urllib3.exceptions.MaxRetryError as e:
  232. exception_cause = e.__context__.__context__
  233. if (isinstance(exception_cause, socks.ProxyConnectionError)
  234. and settings.route_tor):
  235. msg = ('Failed to connect to Tor. Check that Tor is open and '
  236. 'that your internet connection is working.\n\n'
  237. + str(e))
  238. raise FetchError('502', reason='Bad Gateway',
  239. error_message=msg)
  240. elif isinstance(e.__context__,
  241. urllib3.exceptions.NewConnectionError):
  242. msg = 'Failed to establish a connection.\n\n' + str(e)
  243. raise FetchError(
  244. '502', reason='Bad Gateway',
  245. error_message=msg)
  246. else:
  247. raise
  248. cleanup_func = (lambda r: r.release_conn())
  249. return response, cleanup_func
  250. def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
  251. cookiejar_send=None, cookiejar_receive=None, use_tor=True,
  252. debug_name=None):
  253. while True:
  254. start_time = time.monotonic()
  255. response, cleanup_func = fetch_url_response(
  256. url, headers, timeout=timeout, data=data,
  257. cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive,
  258. use_tor=use_tor)
  259. response_time = time.monotonic()
  260. content = response.read()
  261. read_finish = time.monotonic()
  262. cleanup_func(response) # release_connection for urllib3
  263. content = decode_content(
  264. content,
  265. response.headers.get('Content-Encoding', default='identity'))
  266. if (settings.debugging_save_responses
  267. and debug_name is not None
  268. and content):
  269. save_dir = os.path.join(settings.data_dir, 'debug')
  270. if not os.path.exists(save_dir):
  271. os.makedirs(save_dir)
  272. with open(os.path.join(save_dir, debug_name), 'wb') as f:
  273. f.write(content)
  274. if response.status == 429 or (
  275. response.status == 302 and (response.getheader('Location') == url
  276. or response.getheader('Location').startswith(
  277. 'https://www.google.com/sorry/index'
  278. )
  279. )
  280. ):
  281. print(response.status, response.reason, response.headers)
  282. ip = re.search(
  283. br'IP address: ((?:[\da-f]*:)+[\da-f]+|(?:\d+\.)+\d+)',
  284. content)
  285. ip = ip.group(1).decode('ascii') if ip else None
  286. if not ip:
  287. ip = re.search(r'IP=((?:\d+\.)+\d+)',
  288. response.getheader('Set-Cookie') or '')
  289. ip = ip.group(1) if ip else None
  290. # don't get new identity if we're not using Tor
  291. if not use_tor:
  292. raise FetchError('429', reason=response.reason, ip=ip)
  293. print('Error: YouTube blocked the request because the Tor exit node is overutilized. Exit node IP address: %s' % ip)
  294. # get new identity
  295. error = tor_manager.new_identity(start_time)
  296. if error:
  297. raise FetchError(
  298. '429', reason=response.reason, ip=ip,
  299. error_message='Automatic circuit change: ' + error)
  300. else:
  301. continue # retry now that we have new identity
  302. elif response.status >= 400:
  303. raise FetchError(str(response.status), reason=response.reason,
  304. ip=None)
  305. break
  306. if report_text:
  307. print(report_text, ' Latency:', round(response_time - start_time, 3), ' Read time:', round(read_finish - response_time,3))
  308. return content
  309. def head(url, use_tor=False, report_text=None, max_redirects=10):
  310. pool = get_pool(use_tor and settings.route_tor)
  311. start_time = time.monotonic()
  312. # default: Retry.DEFAULT = Retry(3)
  313. # (in connectionpool.py in urllib3)
  314. # According to the documentation for urlopen, a redirect counts as a retry
  315. # So there are 3 redirects max by default. Let's change that
  316. # to 10 since googlevideo redirects a lot.
  317. retries = urllib3.Retry(
  318. 3+max_redirects,
  319. redirect=max_redirects,
  320. raise_on_redirect=False)
  321. headers = {'User-Agent': 'Python-urllib'}
  322. response = pool.request('HEAD', url, headers=headers, retries=retries)
  323. if report_text:
  324. print(
  325. report_text,
  326. ' Latency:',
  327. round(time.monotonic() - start_time, 3))
  328. return response
# User-Agent strings and ready-made header tuples. Each *_ua / *_headers
# value is a tuple of (name, value) pairs.
mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'
mobile_ua = (('User-Agent', mobile_user_agent),)
desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
desktop_ua = (('User-Agent', desktop_user_agent),)
json_header = (('Content-Type', 'application/json'),)
# Headers for XHR-style requests using the desktop User-Agent
desktop_xhr_headers = (
    ('Accept', '*/*'),
    ('Accept-Language', 'en-US,en;q=0.5'),
    ('X-YouTube-Client-Name', '1'),
    ('X-YouTube-Client-Version', '2.20240304.00.00'),
) + desktop_ua
# Headers for XHR-style requests using the mobile User-Agent
mobile_xhr_headers = (
    ('Accept', '*/*'),
    ('Accept-Language', 'en-US,en;q=0.5'),
    ('X-YouTube-Client-Name', '2'),
    ('X-YouTube-Client-Version', '2.20240304.08.00'),
) + mobile_ua
  346. class RateLimitedQueue(gevent.queue.Queue):
  347. ''' Does initial_burst (def. 30) at first, then alternates between waiting waiting_period (def. 5) seconds and doing subsequent_bursts (def. 10) queries. After 5 seconds with nothing left in the queue, resets rate limiting. '''
  348. def __init__(self, initial_burst=30, waiting_period=5, subsequent_bursts=10):
  349. self.initial_burst = initial_burst
  350. self.waiting_period = waiting_period
  351. self.subsequent_bursts = subsequent_bursts
  352. self.count_since_last_wait = 0
  353. self.surpassed_initial = False
  354. self.lock = gevent.lock.BoundedSemaphore(1)
  355. self.currently_empty = False
  356. self.empty_start = 0
  357. gevent.queue.Queue.__init__(self)
  358. def get(self):
  359. self.lock.acquire() # blocks if another greenlet currently has the lock
  360. if self.count_since_last_wait >= self.subsequent_bursts and self.surpassed_initial:
  361. gevent.sleep(self.waiting_period)
  362. self.count_since_last_wait = 0
  363. elif self.count_since_last_wait >= self.initial_burst and not self.surpassed_initial:
  364. self.surpassed_initial = True
  365. gevent.sleep(self.waiting_period)
  366. self.count_since_last_wait = 0
  367. self.count_since_last_wait += 1
  368. if not self.currently_empty and self.empty():
  369. self.currently_empty = True
  370. self.empty_start = time.monotonic()
  371. item = gevent.queue.Queue.get(self) # blocks when nothing left
  372. if self.currently_empty:
  373. if time.monotonic() - self.empty_start >= self.waiting_period:
  374. self.count_since_last_wait = 0
  375. self.surpassed_initial = False
  376. self.currently_empty = False
  377. self.lock.release()
  378. return item
  379. def download_thumbnail(save_directory, video_id):
  380. url = f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"
  381. save_location = os.path.join(save_directory, video_id + ".jpg")
  382. try:
  383. thumbnail = fetch_url(url, report_text="Saved thumbnail: " + video_id)
  384. except urllib.error.HTTPError as e:
  385. print("Failed to download thumbnail for " + video_id + ": " + str(e))
  386. return False
  387. try:
  388. f = open(save_location, 'wb')
  389. except FileNotFoundError:
  390. os.makedirs(save_directory, exist_ok=True)
  391. f = open(save_location, 'wb')
  392. f.write(thumbnail)
  393. f.close()
  394. return True
  395. def download_thumbnails(save_directory, ids):
  396. if not isinstance(ids, (list, tuple)):
  397. ids = list(ids)
  398. # only do 5 at a time
  399. # do the n where n is divisible by 5
  400. i = -1
  401. for i in range(0, int(len(ids)/5) - 1 ):
  402. gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i*5, i*5 + 5)])
  403. # do the remainders (< 5)
  404. gevent.joinall([gevent.spawn(download_thumbnail, save_directory, ids[j]) for j in range(i*5 + 5, len(ids))])
  405. def dict_add(*dicts):
  406. for dictionary in dicts[1:]:
  407. dicts[0].update(dictionary)
  408. return dicts[0]
  409. def video_id(url):
  410. url_parts = urllib.parse.urlparse(url)
  411. return urllib.parse.parse_qs(url_parts.query)['v'][0]
  412. # default, sddefault, mqdefault, hqdefault, hq720
  413. def get_thumbnail_url(video_id):
  414. return f"{settings.img_prefix}https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"
  415. def seconds_to_timestamp(seconds):
  416. seconds = int(seconds)
  417. hours, seconds = divmod(seconds, 3600)
  418. minutes, seconds = divmod(seconds, 60)
  419. if hours != 0:
  420. timestamp = str(hours) + ":"
  421. timestamp += str(minutes).zfill(2) # zfill pads with zeros
  422. else:
  423. timestamp = str(minutes)
  424. timestamp += ":" + str(seconds).zfill(2)
  425. return timestamp
  426. def update_query_string(query_string, items):
  427. parameters = urllib.parse.parse_qs(query_string)
  428. parameters.update(items)
  429. return urllib.parse.urlencode(parameters, doseq=True)
# Domains recognized as YouTube by YOUTUBE_URL_RE
YOUTUBE_DOMAINS = ('youtube.com', 'youtu.be', 'youtube-nocookie.com')
# Pattern: http(s) scheme, an optional single subdomain label, one of
# YOUTUBE_DOMAINS, then an optional path running up to the next double quote
YOUTUBE_URL_RE_STR = r'https?://(?:[a-zA-Z0-9_-]*\.)?(?:'
YOUTUBE_URL_RE_STR += r'|'.join(map(re.escape, YOUTUBE_DOMAINS))
YOUTUBE_URL_RE_STR += r')(?:/[^"]*)?'
YOUTUBE_URL_RE = re.compile(YOUTUBE_URL_RE_STR)
  435. def prefix_url(url):
  436. if url is None:
  437. return None
  438. url = url.lstrip('/') # some urls have // before them, which has a special meaning
  439. return '/' + url
  440. def left_remove(string, substring):
  441. '''removes substring from the start of string, if present'''
  442. if string.startswith(substring):
  443. return string[len(substring):]
  444. return string
  445. def concat_or_none(*strings):
  446. '''Concatenates strings. Returns None if any of the arguments are None'''
  447. result = ''
  448. for string in strings:
  449. if string is None:
  450. return None
  451. result += string
  452. return result
  453. def prefix_urls(item):
  454. if settings.proxy_images:
  455. try:
  456. item['thumbnail'] = prefix_url(item['thumbnail'])
  457. except KeyError:
  458. pass
  459. try:
  460. item['author_url'] = prefix_url(item['author_url'])
  461. except KeyError:
  462. pass
  463. def add_extra_html_info(item):
  464. if item['type'] == 'video':
  465. item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None
  466. video_info = {}
  467. for key in ('id', 'title', 'author', 'duration', 'author_id'):
  468. try:
  469. video_info[key] = item[key]
  470. except KeyError:
  471. video_info[key] = None
  472. item['video_info'] = json.dumps(video_info)
  473. elif item['type'] == 'playlist' and item['playlist_type'] == 'radio':
  474. item['url'] = concat_or_none(
  475. URL_ORIGIN,
  476. '/watch?v=', item['first_video_id'],
  477. '&list=', item['id']
  478. )
  479. elif item['type'] == 'playlist':
  480. item['url'] = concat_or_none(URL_ORIGIN, '/playlist?list=', item['id'])
  481. elif item['type'] == 'channel':
  482. item['url'] = concat_or_none(URL_ORIGIN, "/channel/", item['id'])
  483. if item.get('author_id') and 'author_url' not in item:
  484. item['author_url'] = URL_ORIGIN + '/channel/' + item['author_id']
  485. def check_gevent_exceptions(*tasks):
  486. for task in tasks:
  487. if task.exception:
  488. raise task.exception
  489. # https://stackoverflow.com/a/62888
  490. replacement_map = collections.OrderedDict([
  491. ('<', '_'),
  492. ('>', '_'),
  493. (': ', ' - '),
  494. (':', '-'),
  495. ('"', "'"),
  496. ('/', '_'),
  497. ('\\', '_'),
  498. ('|', '-'),
  499. ('?', ''),
  500. ('*', '_'),
  501. ('\t', ' '),
  502. ])
  503. DOS_names = {'con', 'prn', 'aux', 'nul', 'com0', 'com1', 'com2', 'com3',
  504. 'com4', 'com5', 'com6', 'com7', 'com8', 'com9', 'lpt0',
  505. 'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7',
  506. 'lpt8', 'lpt9'}
  507. def to_valid_filename(name):
  508. '''Changes the name so it's valid on Windows, Linux, and Mac'''
  509. # See https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file
  510. # for Windows specs
  511. # Additional recommendations for Linux:
  512. # https://dwheeler.com/essays/fixing-unix-linux-filenames.html#standards
  513. # remove control characters
  514. name = re.sub(r'[\x00-\x1f]', '_', name)
  515. # reserved characters
  516. for reserved_char, replacement in replacement_map.items():
  517. name = name.replace(reserved_char, replacement)
  518. # check for all periods/spaces
  519. if all(c == '.' or c == ' ' for c in name):
  520. name = '_'*len(name)
  521. # remove trailing periods and spaces
  522. name = name.rstrip('. ')
  523. # check for reserved DOS names, such as nul or nul.txt
  524. base_ext_parts = name.rsplit('.', maxsplit=1)
  525. if base_ext_parts[0].lower() in DOS_names:
  526. base_ext_parts[0] += '_'
  527. name = '.'.join(base_ext_parts)
  528. # check for blank name
  529. if name == '':
  530. name = '_'
  531. # check if name begins with a hyphen, period, or space
  532. if name[0] in ('-', '.', ' '):
  533. name = '_' + name
  534. return name
# https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/youtube.py#L72
# Per-client parameters for the youtubei API, keyed by client name.
# call_youtube_api reads INNERTUBE_API_KEY, INNERTUBE_CONTEXT and the
# optional INNERTUBE_HOST of the selected entry.
INNERTUBE_CLIENTS = {
    'android': {
        'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
        'INNERTUBE_CONTEXT': {
            'client': {
                'hl': 'en',
                'gl': 'US',
                'clientName': 'ANDROID',
                'clientVersion': '19.09.36',
                'osName': 'Android',
                'osVersion': '12',
                'androidSdkVersion': 31,
                'platform': 'MOBILE',
                'userAgent': 'com.google.android.youtube/19.09.36 (Linux; U; Android 12; US) gzip'
            },
            # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287
            #'thirdParty': {
            #    'embedUrl': 'https://google.com',  # Can be any valid URL
            #}
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
        'REQUIRE_JS_PLAYER': False,
    },

    'android-test-suite': {
        'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
        'INNERTUBE_CONTEXT': {
            'client': {
                'hl': 'en',
                'gl': 'US',
                'clientName': 'ANDROID_TESTSUITE',
                'clientVersion': '1.9',
                'osName': 'Android',
                'osVersion': '12',
                'androidSdkVersion': 31,
                'platform': 'MOBILE',
                'userAgent': 'com.google.android.youtube/1.9 (Linux; U; Android 12; US) gzip'
            },
            # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287
            #'thirdParty': {
            #    'embedUrl': 'https://google.com',  # Can be any valid URL
            #}
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
        'REQUIRE_JS_PLAYER': False,
    },

    'ios': {
        'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc',
        'INNERTUBE_CONTEXT': {
            'client': {
                'hl': 'en',
                'gl': 'US',
                'clientName': 'IOS',
                'clientVersion': '19.09.3',
                'deviceModel': 'iPhone14,3',
                'userAgent': 'com.google.ios.youtube/19.09.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)'
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
        'REQUIRE_JS_PLAYER': False
    },

    # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option)
    # See: https://github.com/zerodytrash/YouTube-Internal-Clients
    'tv_embedded': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'hl': 'en',
                'gl': 'US',
                'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER',
                'clientVersion': '2.0',
                'clientScreen': 'EMBED',
            },
            # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287
            'thirdParty': {
                'embedUrl': 'https://google.com',  # Can be any valid URL
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 85,
        'REQUIRE_JS_PLAYER': True,
    },

    'web': {
        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20220801.00.00',
                'userAgent': desktop_user_agent,
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 1
    },

    'android_vr': {
        'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
        'INNERTUBE_CONTEXT': {
            'client': {
                'clientName': 'ANDROID_VR',
                'clientVersion': '1.60.19',
                'deviceMake': 'Oculus',
                'deviceModel': 'Quest 3',
                'androidSdkVersion': 32,
                'userAgent': 'com.google.android.apps.youtube.vr.oculus/1.60.19 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip',
                'osName': 'Android',
                'osVersion': '12L',
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 28,
        'REQUIRE_JS_PLAYER': False,
    },
}
  645. def get_visitor_data():
  646. visitor_data = None
  647. visitor_data_cache = os.path.join(settings.data_dir, 'visitorData.txt')
  648. if not os.path.exists(settings.data_dir):
  649. os.makedirs(settings.data_dir)
  650. if os.path.isfile(visitor_data_cache):
  651. with open(visitor_data_cache, 'r') as file:
  652. print('Getting visitor_data from cache')
  653. visitor_data = file.read()
  654. max_age = 12*3600
  655. file_age = time.time() - os.path.getmtime(visitor_data_cache)
  656. if file_age > max_age:
  657. print('visitor_data cache is too old. Removing file...')
  658. os.remove(visitor_data_cache)
  659. return visitor_data
  660. print('Fetching youtube homepage to get visitor_data')
  661. yt_homepage = 'https://www.youtube.com'
  662. yt_resp = fetch_url(yt_homepage, headers={'User-Agent': mobile_user_agent}, report_text='Getting youtube homepage')
  663. visitor_data_re = r'''"visitorData":\s*?"(.+?)"'''
  664. visitor_data_match = re.search(visitor_data_re, yt_resp.decode())
  665. if visitor_data_match:
  666. visitor_data = visitor_data_match.group(1)
  667. print(f'Got visitor_data: {len(visitor_data)}')
  668. with open(visitor_data_cache, 'w') as file:
  669. print('Saving visitor_data cache...')
  670. file.write(visitor_data)
  671. return visitor_data
  672. else:
  673. print('Unable to get visitor_data value')
  674. return visitor_data
  675. def call_youtube_api(client, api, data):
  676. client_params = INNERTUBE_CLIENTS[client]
  677. context = client_params['INNERTUBE_CONTEXT']
  678. key = client_params['INNERTUBE_API_KEY']
  679. host = client_params.get('INNERTUBE_HOST') or 'www.youtube.com'
  680. user_agent = context['client'].get('userAgent') or mobile_user_agent
  681. visitor_data = get_visitor_data()
  682. url = 'https://' + host + '/youtubei/v1/' + api + '?key=' + key
  683. if visitor_data:
  684. context['client'].update({'visitorData': visitor_data})
  685. data['context'] = context
  686. data = json.dumps(data)
  687. headers = (('Content-Type', 'application/json'),('User-Agent', user_agent))
  688. if visitor_data:
  689. headers = ( *headers, ('X-Goog-Visitor-Id', visitor_data ))
  690. response = fetch_url(
  691. url, data=data, headers=headers,
  692. debug_name='youtubei_' + api + '_' + client,
  693. report_text='Fetched ' + client + ' youtubei ' + api
  694. ).decode('utf-8')
  695. return response
  696. def strip_non_ascii(string):
  697. ''' Returns the string without non ASCII characters'''
  698. stripped = (c for c in string if 0 < ord(c) < 127)
  699. return ''.join(stripped)
  700. def time_utc_isoformat(string):
  701. t = datetime.strptime(string, '%Y-%m-%d')
  702. t = t.astimezone().isoformat()
  703. return t