watch_extraction.py 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949
  1. from .common import (get, multi_get, deep_get, multi_deep_get,
  2. liberal_update, conservative_update, remove_redirect, normalize_url,
  3. extract_str, extract_formatted_text, extract_int, extract_approx_int,
  4. extract_date, check_missing_keys, extract_item_info, extract_items,
  5. extract_response, concat_or_none, liberal_dict_update,
  6. conservative_dict_update)
  7. import json
  8. import urllib.parse
  9. import traceback
  10. import re
  11. # from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py
# Hardcoded per-itag format information (resolution, codecs, bitrates),
# used to fill in fields YouTube's response omits. Keys are itag strings.
_formats = {
    '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'},
    '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'},
    '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
    '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'mp4v'},
    '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 96, 'vcodec': 'h264'},
    '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
    '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), audio_bitrate varies as well
    '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
    '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
    '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
    '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
    '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
    '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
    '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
    '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    # 3D videos
    '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
    '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'},
    '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'},
    '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
    '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'},
    # Apple HTTP Live Streaming
    '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
    '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
    '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'},
    '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'},
    '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'},
    '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'},
    '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'h264'},
    # DASH mp4 video
    '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
    '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
    '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
    '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
    '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
    # Dash mp4 audio
    '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 48, 'container': 'm4a_dash'},
    '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 128, 'container': 'm4a_dash'},
    '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 256, 'container': 'm4a_dash'},
    '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
    '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
    '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
    '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
    # Dash webm
    '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
    '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
    '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
    '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
    '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
    '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
    '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
    '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
    # Dash webm audio
    '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 128},
    '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 256},
    # Dash webm audio with opus inside
    '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 50},
    '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 70},
    '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 160},
    # RTMP (unnamed)
    '_rtmp': {'protocol': 'rtmp'},
    # av01 video only formats sometimes served with "unknown" codecs
    '394': {'vcodec': 'av01.0.05M.08'},
    '395': {'vcodec': 'av01.0.05M.08'},
    '396': {'vcodec': 'av01.0.05M.08'},
    '397': {'vcodec': 'av01.0.05M.08'},
}
  107. def _extract_from_video_information_renderer(renderer_content):
  108. subtitle = extract_str(renderer_content.get('expandedSubtitle'),
  109. default='')
  110. info = {
  111. 'title': extract_str(renderer_content.get('title')),
  112. 'view_count': extract_int(subtitle),
  113. 'unlisted': False,
  114. 'live': 'watching' in subtitle,
  115. }
  116. for badge in renderer_content.get('badges', []):
  117. if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
  118. info['unlisted'] = True
  119. return info
  120. def _extract_likes_dislikes(renderer_content):
  121. def extract_button_count(toggle_button_renderer):
  122. # all the digits can be found in the accessibility data
  123. count = extract_int(multi_deep_get(
  124. toggle_button_renderer,
  125. ['defaultText', 'accessibility', 'accessibilityData', 'label'],
  126. ['accessibility', 'label'],
  127. ['accessibilityData', 'accessibilityData', 'label'],
  128. ['accessibilityText'],
  129. ))
  130. # this count doesn't have all the digits, it's like 53K for instance
  131. dumb_count = extract_int(extract_str(multi_get(
  132. toggle_button_renderer, ['defaultText', 'title'])))
  133. # The accessibility text will be "No likes" or "No dislikes" or
  134. # something like that, but dumb count will be 0
  135. if dumb_count == 0:
  136. count = 0
  137. return count
  138. info = {
  139. 'like_count': None,
  140. 'dislike_count': None,
  141. }
  142. for button in renderer_content.get('buttons', ()):
  143. if 'slimMetadataToggleButtonRenderer' in button:
  144. button_renderer = button['slimMetadataToggleButtonRenderer']
  145. count = extract_button_count(deep_get(button_renderer,
  146. 'button',
  147. 'toggleButtonRenderer'))
  148. if 'isLike' in button_renderer:
  149. info['like_count'] = count
  150. elif 'isDislike' in button_renderer:
  151. info['dislike_count'] = count
  152. elif 'slimMetadataButtonRenderer' in button:
  153. button_renderer = button['slimMetadataButtonRenderer']
  154. liberal_update(info, 'like_count', extract_button_count(
  155. multi_deep_get(button_renderer,
  156. ['button', 'segmentedLikeDislikeButtonRenderer',
  157. 'likeButton', 'toggleButtonRenderer'],
  158. ['button', 'segmentedLikeDislikeButtonViewModel',
  159. 'likeButtonViewModel', 'likeButtonViewModel',
  160. 'toggleButtonViewModel', 'toggleButtonViewModel',
  161. 'defaultButtonViewModel', 'buttonViewModel']
  162. )
  163. ))
  164. '''liberal_update(info, 'dislike_count', extract_button_count(
  165. deep_get(
  166. button_renderer, 'button',
  167. 'segmentedLikeDislikeButtonRenderer',
  168. 'dislikeButton', 'toggleButtonRenderer'
  169. )
  170. ))'''
  171. return info
  172. def _extract_from_owner_renderer(renderer_content):
  173. return {
  174. 'author': extract_str(renderer_content.get('title')),
  175. 'author_id': deep_get(
  176. renderer_content,
  177. 'navigationEndpoint', 'browseEndpoint', 'browseId'),
  178. }
  179. def _extract_from_video_header_renderer(renderer_content):
  180. return {
  181. 'title': extract_str(renderer_content.get('title')),
  182. 'time_published': extract_date(extract_str(
  183. renderer_content.get('publishDate'))),
  184. }
  185. def _extract_from_description_renderer(renderer_content):
  186. return {
  187. 'description': extract_str(
  188. renderer_content.get('descriptionBodyText'), recover_urls=True),
  189. }
  190. def _extract_metadata_row_info(renderer_content):
  191. # extract category and music list
  192. info = {
  193. 'category': None,
  194. 'music_list': [],
  195. }
  196. current_song = {}
  197. for row in deep_get(renderer_content, 'rows', default=[]):
  198. row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='')
  199. row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0))
  200. if row_title == 'Category':
  201. info['category'] = row_content
  202. elif row_title in ('Song', 'Music'):
  203. if current_song:
  204. info['music_list'].append(current_song)
  205. current_song = {'title': row_content}
  206. elif row_title == 'Artist':
  207. current_song['artist'] = row_content
  208. elif row_title == 'Album':
  209. current_song['album'] = row_content
  210. elif row_title == 'Writers':
  211. current_song['writers'] = row_content
  212. elif row_title.startswith('Licensed'):
  213. current_song['licensor'] = row_content
  214. if current_song:
  215. info['music_list'].append(current_song)
  216. return info
  217. def _extract_from_music_renderer(renderer_content):
  218. # latest format for the music list
  219. info = {
  220. 'music_list': [],
  221. }
  222. for carousel in renderer_content.get('carouselLockups', []):
  223. song = {}
  224. carousel = carousel.get('carouselLockupRenderer', {})
  225. video_renderer = carousel.get('videoLockup', {})
  226. video_renderer_info = extract_item_info(video_renderer)
  227. video_id = video_renderer_info.get('id')
  228. song['url'] = concat_or_none('https://www.youtube.com/watch?v=',
  229. video_id)
  230. song['title'] = video_renderer_info.get('title')
  231. for row in carousel.get('infoRows', []):
  232. row = row.get('infoRowRenderer', {})
  233. title = extract_str(row.get('title'))
  234. data = extract_str(row.get('defaultMetadata'))
  235. if title == 'SONG':
  236. song['title'] = data
  237. elif title == 'ARTIST':
  238. song['artist'] = data
  239. elif title == 'ALBUM':
  240. song['album'] = data
  241. elif title == 'WRITERS':
  242. song['writers'] = data
  243. info['music_list'].append(song)
  244. return info
  245. def _extract_from_video_metadata(renderer_content):
  246. info = _extract_from_video_information_renderer(renderer_content)
  247. liberal_dict_update(info, _extract_likes_dislikes(renderer_content))
  248. liberal_dict_update(info, _extract_from_owner_renderer(renderer_content))
  249. liberal_dict_update(info, _extract_metadata_row_info(deep_get(
  250. renderer_content, 'metadataRowContainer',
  251. 'metadataRowContainerRenderer', default={}
  252. )))
  253. liberal_update(info, 'title', extract_str(renderer_content.get('title')))
  254. liberal_update(
  255. info, 'description',
  256. extract_str(renderer_content.get('description'), recover_urls=True)
  257. )
  258. liberal_update(info, 'time_published',
  259. extract_date(renderer_content.get('dateText')))
  260. return info
# Dispatch table: maps renderer type names found in the watch-page response
# to the function that extracts information from that renderer.
visible_extraction_dispatch = {
    # Either these ones spread around in various places
    'slimVideoInformationRenderer': _extract_from_video_information_renderer,
    'slimVideoActionBarRenderer': _extract_likes_dislikes,
    'slimOwnerRenderer': _extract_from_owner_renderer,
    'videoDescriptionHeaderRenderer': _extract_from_video_header_renderer,
    'videoDescriptionMusicSectionRenderer': _extract_from_music_renderer,
    'expandableVideoDescriptionRenderer': _extract_from_description_renderer,
    'metadataRowContainerRenderer': _extract_metadata_row_info,
    # OR just this one, which contains SOME of the above inside it
    'slimVideoMetadataRenderer': _extract_from_video_metadata,
}
def _extract_watch_info_mobile(top_level):
    '''Scrapes information from the visible page (mobile layout).

    Returns a dict with playlist info, the fields produced by the
    visible_extraction_dispatch extractors, comment-section info, the
    limited-state flag, and related videos.'''
    info = {}
    response = top_level.get('response', {})

    # this renderer has the stuff visible on the page
    # check for playlist
    items, _ = extract_items(response,
                             item_types={'singleColumnWatchNextResults'})
    if items:
        watch_next_results = items[0]['singleColumnWatchNextResults']
        playlist = deep_get(watch_next_results, 'playlist', 'playlist')
        if playlist is None:
            info['playlist'] = None
        else:
            info['playlist'] = {}
            info['playlist']['title'] = playlist.get('title')
            info['playlist']['author'] = extract_str(multi_get(playlist,
                'ownerName', 'longBylineText', 'shortBylineText',
                'ownerText'))
            author_id = deep_get(playlist, 'longBylineText', 'runs', 0,
                'navigationEndpoint', 'browseEndpoint', 'browseId')
            info['playlist']['author_id'] = author_id
            info['playlist']['author_url'] = concat_or_none(
                'https://www.youtube.com/channel/', author_id)
            info['playlist']['id'] = playlist.get('playlistId')
            info['playlist']['url'] = concat_or_none(
                'https://www.youtube.com/playlist?list=',
                info['playlist']['id'])
            info['playlist']['video_count'] = playlist.get('totalVideos')
            info['playlist']['current_index'] = playlist.get('currentIndex')
            info['playlist']['items'] = [
                extract_item_info(i) for i in playlist.get('contents', ())]
    else:
        info['playlist'] = None

    # use dispatch table to get information scattered in various renderers
    items, _ = extract_items(
        response,
        item_types=visible_extraction_dispatch.keys(),
        search_engagement_panels=True
    )
    found = set()
    for renderer in items:
        name, renderer_content = list(renderer.items())[0]
        found.add(name)
        liberal_dict_update(
            info,
            visible_extraction_dispatch[name](renderer_content)
        )
    # Call the function on blank dict for any that weren't found
    # so that the empty keys get added
    for name in visible_extraction_dispatch.keys() - found:
        liberal_dict_update(info, visible_extraction_dispatch[name]({}))

    # comment section info
    items, _ = extract_items(response, item_types={
        'commentSectionRenderer', 'commentsEntryPointHeaderRenderer'})
    if items:
        header_type = list(items[0])[0]
        comment_info = items[0][header_type]
        # This seems to be some kind of A/B test being done on mobile, where
        # this is present instead of the normal commentSectionRenderer. It can
        # be seen here:
        # https://www.androidpolice.com/2019/10/31/google-youtube-app-comment-section-below-videos/
        # https://www.youtube.com/watch?v=bR5Q-wD-6qo
        if header_type == 'commentsEntryPointHeaderRenderer':
            comment_count_text = extract_str(multi_get(
                comment_info, 'commentCount', 'headerText'))
        else:
            comment_count_text = extract_str(deep_get(comment_info,
                'header', 'commentSectionHeaderRenderer', 'countText'))
        # just "Comments" with no number means 0 comments
        if comment_count_text == 'Comments':
            info['comment_count'] = '0'
        else:
            info['comment_count'] = extract_approx_int(comment_count_text)
        info['comments_disabled'] = False
    else:  # no comment section present means comments are disabled
        info['comment_count'] = '0'
        info['comments_disabled'] = True

    # check for limited state
    items, _ = extract_items(response,
                             item_types={'limitedStateMessageRenderer'})
    if items:
        info['limited_state'] = True
    else:
        info['limited_state'] = False

    # related videos
    related, _ = extract_items(response)
    info['related_videos'] = [
        extract_item_info(renderer) for renderer in related]
    return info
  359. def _extract_watch_info_desktop(top_level):
  360. info = {
  361. 'comment_count': None,
  362. 'comments_disabled': None,
  363. 'limited_state': None,
  364. 'playlist': None,
  365. }
  366. video_info = {}
  367. for renderer in deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()):
  368. if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'):
  369. video_info.update(list(renderer.values())[0])
  370. info.update(_extract_metadata_row_info(video_info))
  371. info['description'] = extract_str(video_info.get('description', None), recover_urls=True)
  372. info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
  373. likes_dislikes = deep_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/')
  374. if len(likes_dislikes) == 2:
  375. info['like_count'] = extract_int(likes_dislikes[0])
  376. info['dislike_count'] = extract_int(likes_dislikes[1])
  377. else:
  378. info['like_count'] = None
  379. info['dislike_count'] = None
  380. info['title'] = extract_str(video_info.get('title', None))
  381. info['author'] = extract_str(deep_get(video_info, 'owner', 'videoOwnerRenderer', 'title'))
  382. info['author_id'] = deep_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
  383. info['view_count'] = extract_int(extract_str(deep_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount')))
  384. related = deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[])
  385. info['related_videos'] = [extract_item_info(renderer) for renderer in related]
  386. return info
  387. def update_format_with_codec_info(fmt, codec):
  388. if any(codec.startswith(c) for c in ('av', 'vp', 'h263', 'h264', 'mp4v')):
  389. if codec == 'vp8.0':
  390. codec = 'vp8'
  391. conservative_update(fmt, 'vcodec', codec)
  392. elif (codec.startswith('mp4a')
  393. or codec in ('opus', 'mp3', 'aac', 'dtse', 'ec-3', 'vorbis',
  394. 'ac-3')):
  395. conservative_update(fmt, 'acodec', codec)
  396. else:
  397. print('Warning: unrecognized codec: ' + codec)
  398. fmt_type_re = re.compile(
  399. r'(text|audio|video)/([\w0-9]+); codecs="([^"]+)"')
  400. def update_format_with_type_info(fmt, yt_fmt):
  401. # 'type' for invidious api format
  402. mime_type = multi_get(yt_fmt, 'mimeType', 'type')
  403. if mime_type is None:
  404. return
  405. match = re.fullmatch(fmt_type_re, mime_type)
  406. if match is None:
  407. print('Warning: Could not read mimetype', mime_type)
  408. return
  409. type, fmt['ext'], codecs = match.groups()
  410. codecs = codecs.split(', ')
  411. for codec in codecs:
  412. update_format_with_codec_info(fmt, codec)
  413. if type == 'audio':
  414. assert len(codecs) == 1
  415. def _extract_formats(info, player_response):
  416. streaming_data = player_response.get('streamingData', {})
  417. yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])
  418. info['formats'] = []
  419. # because we may retry the extract_formats with a different player_response
  420. # so keep what we have
  421. conservative_update(info, 'hls_manifest_url',
  422. streaming_data.get('hlsManifestUrl'))
  423. conservative_update(info, 'dash_manifest_url',
  424. streaming_data.get('dash_manifest_url'))
  425. for yt_fmt in yt_formats:
  426. itag = yt_fmt.get('itag')
  427. # Translated audio track
  428. # Example: https://www.youtube.com/watch?v=gF9kkB0UWYQ
  429. # Only get the original language for now so a foreign
  430. # translation will not be picked just because it comes first
  431. if deep_get(yt_fmt, 'audioTrack', 'audioIsDefault') is False:
  432. continue
  433. fmt = {}
  434. fmt['itag'] = itag
  435. fmt['ext'] = None
  436. fmt['audio_bitrate'] = None
  437. fmt['bitrate'] = yt_fmt.get('bitrate')
  438. fmt['acodec'] = None
  439. fmt['vcodec'] = None
  440. fmt['width'] = yt_fmt.get('width')
  441. fmt['height'] = yt_fmt.get('height')
  442. fmt['file_size'] = extract_int(yt_fmt.get('contentLength'))
  443. fmt['audio_sample_rate'] = extract_int(yt_fmt.get('audioSampleRate'))
  444. fmt['duration_ms'] = yt_fmt.get('approxDurationMs')
  445. fmt['fps'] = yt_fmt.get('fps')
  446. fmt['init_range'] = yt_fmt.get('initRange')
  447. fmt['index_range'] = yt_fmt.get('indexRange')
  448. for key in ('init_range', 'index_range'):
  449. if fmt[key]:
  450. fmt[key]['start'] = int(fmt[key]['start'])
  451. fmt[key]['end'] = int(fmt[key]['end'])
  452. update_format_with_type_info(fmt, yt_fmt)
  453. cipher = dict(urllib.parse.parse_qsl(multi_get(yt_fmt,
  454. 'cipher', 'signatureCipher', default='')))
  455. if cipher:
  456. fmt['url'] = cipher.get('url')
  457. else:
  458. fmt['url'] = yt_fmt.get('url')
  459. fmt['s'] = cipher.get('s')
  460. fmt['sp'] = cipher.get('sp')
  461. # update with information from big table
  462. hardcoded_itag_info = _formats.get(str(itag), {})
  463. for key, value in hardcoded_itag_info.items():
  464. conservative_update(fmt, key, value) # prefer info from YouTube
  465. fmt['quality'] = hardcoded_itag_info.get('height')
  466. conservative_update(
  467. fmt, 'quality',
  468. extract_int(yt_fmt.get('quality'), whole_word=False)
  469. )
  470. conservative_update(
  471. fmt, 'quality',
  472. extract_int(yt_fmt.get('qualityLabel'), whole_word=False)
  473. )
  474. info['formats'].append(fmt)
  475. # get ip address
  476. if info['formats']:
  477. query_string = (info['formats'][0].get('url') or '?').split('?')[1]
  478. info['ip_address'] = deep_get(
  479. urllib.parse.parse_qs(query_string), 'ip', 0)
  480. else:
  481. info['ip_address'] = None
  482. hls_regex = re.compile(r'[\w_-]+=(?:"[^"]+"|[^",]+),')
  483. def extract_hls_formats(hls_manifest):
  484. '''returns hls_formats, err'''
  485. hls_formats = []
  486. try:
  487. lines = hls_manifest.splitlines()
  488. i = 0
  489. while i < len(lines):
  490. if lines[i].startswith('#EXT-X-STREAM-INF'):
  491. fmt = {'acodec': None, 'vcodec': None, 'height': None,
  492. 'width': None, 'fps': None, 'audio_bitrate': None,
  493. 'itag': None, 'file_size': None, 'duration_ms': None,
  494. 'audio_sample_rate': None, 'url': None}
  495. properties = lines[i].split(':')[1]
  496. properties += ',' # make regex work for last key-value pair
  497. for pair in hls_regex.findall(properties):
  498. key, value = pair.rstrip(',').split('=')
  499. if key == 'CODECS':
  500. for codec in value.strip('"').split(','):
  501. update_format_with_codec_info(fmt, codec)
  502. elif key == 'RESOLUTION':
  503. fmt['width'], fmt['height'] = map(int, value.split('x'))
  504. fmt['resolution'] = value
  505. elif key == 'FRAME-RATE':
  506. fmt['fps'] = int(value)
  507. i += 1
  508. fmt['url'] = lines[i]
  509. assert fmt['url'].startswith('http')
  510. fmt['ext'] = 'm3u8'
  511. hls_formats.append(fmt)
  512. i += 1
  513. except Exception as e:
  514. traceback.print_exc()
  515. return [], str(e)
  516. return hls_formats, None
  517. def _extract_playability_error(info, player_response, error_prefix=''):
  518. if info['formats']:
  519. info['playability_status'] = None
  520. info['playability_error'] = None
  521. return
  522. playability_status = deep_get(player_response, 'playabilityStatus', 'status', default=None)
  523. info['playability_status'] = playability_status
  524. playability_reason = extract_str(multi_deep_get(player_response,
  525. ['playabilityStatus', 'reason'],
  526. ['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'],
  527. default='Could not find playability error')
  528. )
  529. if playability_status not in (None, 'OK'):
  530. info['playability_error'] = error_prefix + playability_reason
  531. elif not info['playability_error']: # do not override
  532. info['playability_error'] = error_prefix + 'Unknown playability error'
# Subtitle file formats that can be requested from YouTube's timedtext
# endpoint (values for its fmt= parameter)
SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
def extract_watch_info(polymer_json):
    '''Extract a video info dict from the watch page's polymer json.

    polymer_json is either the top-level dict or a list/tuple of page-part
    dicts which get merged into one. Returns {'error': ...} on structurally
    invalid input; otherwise a dict holding playability status/error, caption
    languages and base url, stream formats, player javascript url/name, and
    video metadata merged from videoDetails and the microformat renderer.
    '''
    info = {'playability_error': None, 'error': None,
            'player_response_missing': None}

    # Normalize both known page shapes into a single top-level dict
    if isinstance(polymer_json, dict):
        top_level = polymer_json
    elif isinstance(polymer_json, (list, tuple)):
        top_level = {}
        for page_part in polymer_json:
            if not isinstance(page_part, dict):
                return {'error': 'Invalid page part'}
            top_level.update(page_part)
    else:
        return {'error': 'Invalid top level polymer data'}

    error = check_missing_keys(top_level,
        ['player', 'args'],
        ['player', 'assets', 'js'],
        ['playerResponse'],
    )
    if error:
        info['playability_error'] = error

    player_response = top_level.get('playerResponse', {})

    # usually, only the embedded one has the urls
    player_args = deep_get(top_level, 'player', 'args', default={})
    if 'player_response' in player_args:
        embedded_player_response = json.loads(player_args['player_response'])
    else:
        embedded_player_response = {}

    # captions
    info['automatic_caption_languages'] = []
    info['manual_caption_languages'] = []
    info['_manual_caption_language_names'] = {}  # language name written in that language, needed in some cases to create the url
    info['translation_languages'] = []
    captions_info = player_response.get('captions', {})
    info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
    # Sometimes the above playerCaptionsRender is randomly missing
    # Extract base_url from one of the captions by removing lang specifiers
    if not info['_captions_base_url']:
        base_url = normalize_url(deep_get(
            captions_info,
            'playerCaptionsTracklistRenderer',
            'captionTracks',
            0,
            'baseUrl'
        ))
        if base_url:
            # Strip the language/format-specific query parameters so the
            # remainder can serve as a generic base url (see get_caption_url)
            url_parts = urllib.parse.urlparse(base_url)
            qs = urllib.parse.parse_qs(url_parts.query)
            for key in ('tlang', 'lang', 'name', 'kind', 'fmt'):
                if key in qs:
                    del qs[key]
            base_url = urllib.parse.urlunparse(url_parts._replace(
                query=urllib.parse.urlencode(qs, doseq=True)))
            info['_captions_base_url'] = base_url

    # kind == 'asr' marks auto-generated (speech recognition) captions
    for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()):
        lang_code = caption_track.get('languageCode')
        if not lang_code:
            continue
        if caption_track.get('kind') == 'asr':
            info['automatic_caption_languages'].append(lang_code)
        else:
            info['manual_caption_languages'].append(lang_code)
        base_url = caption_track.get('baseUrl', '')
        lang_name = deep_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0)
        if lang_name:
            info['_manual_caption_language_names'][lang_code] = lang_name

    for translation_lang_info in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()):
        lang_code = translation_lang_info.get('languageCode')
        if lang_code:
            info['translation_languages'].append(lang_code)
        # explicit False (as opposed to the key merely being absent)
        if translation_lang_info.get('isTranslatable') == False:
            print('WARNING: Found non-translatable caption language')

    # formats: prefer the embedded player response, fall back to the page one
    _extract_formats(info, embedded_player_response)
    if not info['formats']:
        _extract_formats(info, player_response)

    # see https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
    info['player_urls_missing'] = (
        not info['formats'] and not embedded_player_response)

    # playability errors
    _extract_playability_error(info, player_response)

    # check age-restriction
    info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error'])

    # base_js (for decryption of signatures)
    info['base_js'] = deep_get(top_level, 'player', 'assets', 'js')
    if info['base_js']:
        info['base_js'] = normalize_url(info['base_js'])
        # must uniquely identify url
        info['player_name'] = urllib.parse.urlparse(info['base_js']).path
    else:
        info['player_name'] = None

    # extract stuff from visible parts of page
    mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={})
    if mobile:
        info.update(_extract_watch_info_mobile(top_level))
    else:
        info.update(_extract_watch_info_desktop(top_level))

    # stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info
    vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={})
    liberal_update(info, 'title', extract_str(vd.get('title')))
    liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds')))
    liberal_update(info, 'view_count', extract_int(vd.get('viewCount')))
    # videos with no description have a blank string
    liberal_update(info, 'description', vd.get('shortDescription'))
    liberal_update(info, 'id', vd.get('videoId'))
    liberal_update(info, 'author', vd.get('author'))
    liberal_update(info, 'author_id', vd.get('channelId'))
    info['was_live'] = vd.get('isLiveContent')
    conservative_update(info, 'unlisted', not vd.get('isCrawlable', True))  # isCrawlable is false on limited state videos even if they aren't unlisted
    liberal_update(info, 'tags', vd.get('keywords', []))

    # fallback stuff from microformat (conservative_update only fills gaps)
    mf = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
    conservative_update(info, 'title', extract_str(mf.get('title')))
    conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds')))
    # this gives the view count for limited state videos
    conservative_update(info, 'view_count', extract_int(mf.get('viewCount')))
    conservative_update(info, 'description', extract_str(mf.get('description'), recover_urls=True))
    conservative_update(info, 'author', mf.get('ownerChannelName'))
    conservative_update(info, 'author_id', mf.get('externalChannelId'))
    conservative_update(info, 'live', deep_get(mf, 'liveBroadcastDetails',
                                               'isLiveNow'))
    liberal_update(info, 'unlisted', mf.get('isUnlisted'))
    liberal_update(info, 'category', mf.get('category'))
    liberal_update(info, 'time_published', mf.get('publishDate'))
    liberal_update(info, 'time_uploaded', mf.get('uploadDate'))
    family_safe = mf.get('isFamilySafe')
    if family_safe is None:
        conservative_update(info, 'age_restricted', None)
    else:
        conservative_update(info, 'age_restricted', not family_safe)
    info['allowed_countries'] = mf.get('availableCountries', [])

    # other stuff
    info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
    info['storyboard_spec_url'] = deep_get(player_response, 'storyboards', 'playerStoryboardSpecRenderer', 'spec')
    return info
  668. single_char_codes = {
  669. 'n': '\n',
  670. '\\': '\\',
  671. '"': '"',
  672. "'": "'",
  673. 'b': '\b',
  674. 'f': '\f',
  675. 'n': '\n',
  676. 'r': '\r',
  677. 't': '\t',
  678. 'v': '\x0b',
  679. '0': '\x00',
  680. '\n': '', # backslash followed by literal newline joins lines
  681. }
  682. def js_escape_replace(match):
  683. r'''Resolves javascript string escape sequences such as \x..'''
  684. # some js-strings in the watch page html include them for no reason
  685. # https://mathiasbynens.be/notes/javascript-escapes
  686. escaped_sequence = match.group(1)
  687. if escaped_sequence[0] in ('x', 'u'):
  688. return chr(int(escaped_sequence[1:], base=16))
  689. # In javascript, if it's not one of those escape codes, it's just the
  690. # literal character. e.g., "\a" = "a"
  691. return single_char_codes.get(escaped_sequence, escaped_sequence)
# works but complicated and unsafe:
#PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>[^<]*?var ytInitialPlayerResponse = ({(?:"(?:[^"\\]|\\.)*?"|[^"])+?});')

# Because there are sometimes additional statements after the json object
# so we just capture all of those until end of script and tell json decoder
# to ignore extra stuff after the json object (raw_decode is used for this
# in extract_watch_info_from_html)
PLAYER_RESPONSE_RE = re.compile(r'<script[^>]*?>[^<]*?var ytInitialPlayerResponse = ({.*?)</script>')
# ytInitialData is embedded as a single-quoted js string literal;
# (.+?[^\\])' stops at the first unescaped closing quote
INITIAL_DATA_RE = re.compile(r"<script[^>]*?>var ytInitialData = '(.+?[^\\])';")
# relative url of the player javascript (needed for signature decryption)
BASE_JS_RE = re.compile(r'jsUrl":\s*"([\w\-\./]+?/base.js)"')
# one js escape sequence: a single escaped character, \xHH, or \uHHHH
JS_STRING_ESCAPE_RE = re.compile(r'\\([^xu]|x..|u....)')
  701. def extract_watch_info_from_html(watch_html):
  702. base_js_match = BASE_JS_RE.search(watch_html)
  703. player_response_match = PLAYER_RESPONSE_RE.search(watch_html)
  704. initial_data_match = INITIAL_DATA_RE.search(watch_html)
  705. if base_js_match is not None:
  706. base_js_url = base_js_match.group(1)
  707. else:
  708. base_js_url = None
  709. if player_response_match is not None:
  710. decoder = json.JSONDecoder()
  711. # this will make it ignore extra stuff after end of object
  712. player_response = decoder.raw_decode(player_response_match.group(1))[0]
  713. else:
  714. return {'error': 'Could not find ytInitialPlayerResponse'}
  715. player_response = None
  716. if initial_data_match is not None:
  717. initial_data = initial_data_match.group(1)
  718. initial_data = JS_STRING_ESCAPE_RE.sub(js_escape_replace, initial_data)
  719. initial_data = json.loads(initial_data)
  720. else:
  721. print('extract_watch_info_from_html: failed to find initialData')
  722. initial_data = None
  723. # imitate old format expected by extract_watch_info
  724. fake_polymer_json = {
  725. 'player': {
  726. 'args': {},
  727. 'assets': {
  728. 'js': base_js_url
  729. }
  730. },
  731. 'playerResponse': player_response,
  732. 'response': initial_data,
  733. }
  734. return extract_watch_info(fake_polymer_json)
  735. def captions_available(info):
  736. return bool(info['_captions_base_url'])
  737. def get_caption_url(info, language, format, automatic=False, translation_language=None):
  738. '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.'''
  739. url = info['_captions_base_url']
  740. if not url:
  741. return None
  742. url += '&lang=' + language
  743. url += '&fmt=' + format
  744. if automatic:
  745. url += '&kind=asr'
  746. elif language in info['_manual_caption_language_names']:
  747. url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='')
  748. if translation_language:
  749. url += '&tlang=' + translation_language
  750. return url
  751. def update_with_new_urls(info, player_response):
  752. '''Inserts urls from player_response json'''
  753. ERROR_PREFIX = 'Error getting missing player or bypassing age-restriction: '
  754. try:
  755. player_response = json.loads(player_response)
  756. except json.decoder.JSONDecodeError:
  757. traceback.print_exc()
  758. info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response'
  759. return
  760. _extract_formats(info, player_response)
  761. _extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX)
  762. def requires_decryption(info):
  763. return ('formats' in info) and info['formats'] and info['formats'][0]['s']
# adapted from youtube-dl and invidious:
# https://github.com/omarroth/invidious/blob/master/src/invidious/helpers/signatures.cr
# Matches the signature-scrambling function in base.js and captures its body,
# e.g. a=a.split("");rt.xK(a,5);rt.pQ(a,2);  ([^\}{]+ means the body itself
# must contain no braces)
decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}{]+)return a\.join\(""\)\}')
# gives us e.g. rt, .xK, 5 from rt.xK(a,5) or rt, ["xK"], 5 from rt["xK"](a,5)
# (var, operation, argument)
var_op_arg_re = re.compile(r'(\w+)(\.\w+|\["[^"]+"\])\(a,(\d+)\)')
def extract_decryption_function(info, base_js):
    '''Insert decryption function into info. Return error string if not successful.

    Decryption function is a list of list[2] of numbers: [operation, argument],
    where operation 0 = reverse, 1 = drop the first `argument` chars
    (a.splice(0,b)), 2 = swap char 0 with char (argument % length).
    It is advisable to cache the decryption function (uniquely identified by
    info['player_name']) so base.js (1 MB) doesn't need to be redownloaded
    each time.
    '''
    info['decryption_function'] = None
    decrypt_function_match = decrypt_function_re.search(base_js)
    if decrypt_function_match is None:
        return 'Could not find decryption function in base.js'
    # [1:-1] drops the leading a=a.split("") assignment and the empty string
    # after the final ';', leaving only the operation calls like rt.xK(a,5)
    function_body = decrypt_function_match.group(1).split(';')[1:-1]
    if not function_body:
        return 'Empty decryption function body'
    # The object holding the operation definitions, e.g. 'rt' in rt.xK(a,5)
    var_with_operation_match = var_op_arg_re.fullmatch(function_body[0])
    if var_with_operation_match is None:
        return 'Could not find var_name'
    var_name = var_with_operation_match.group(1)
    var_body_match = re.search(r'var ' + re.escape(var_name) + r'=\{(.*?)\};', base_js, flags=re.DOTALL)
    if var_body_match is None:
        return 'Could not find var_body'

    # Each entry looks like  xK:function(a){a.reverse()}
    operations = var_body_match.group(1).replace('\n', '').split('},')
    if not operations:
        return 'Did not find any definitions in var_body'
    operations[-1] = operations[-1][:-1]  # remove the trailing '}' since we split by '},' on the others
    operation_definitions = {}
    for op in operations:
        colon_index = op.find(':')
        opening_brace_index = op.find('{')

        if colon_index == -1 or opening_brace_index == -1:
            return 'Could not parse operation'
        op_name = op[:colon_index]
        op_body = op[opening_brace_index+1:]
        # Map each named js operation onto one of the three known numeric ops
        if op_body == 'a.reverse()':
            operation_definitions[op_name] = 0
        elif op_body == 'a.splice(0,b)':
            operation_definitions[op_name] = 1
        elif op_body.startswith('var c=a[0]'):
            operation_definitions[op_name] = 2
        else:
            return 'Unknown op_body: ' + op_body

    # Translate the call sequence into [operation, argument] pairs
    decryption_function = []
    for op_with_arg in function_body:
        match = var_op_arg_re.fullmatch(op_with_arg)
        if match is None:
            return 'Could not parse operation with arg'
        # group(2) is '.xK' or '["xK"]'; strip the punctuation to get the name
        op_name = match.group(2).strip('[].')
        if op_name not in operation_definitions:
            return 'Unknown op_name: ' + str(op_name)
        op_argument = match.group(3)
        decryption_function.append([operation_definitions[op_name], int(op_argument)])

    info['decryption_function'] = decryption_function
    return False
  820. def _operation_2(a, b):
  821. c = a[0]
  822. a[0] = a[b % len(a)]
  823. a[b % len(a)] = c
  824. def decrypt_signatures(info):
  825. '''Applies info['decryption_function'] to decrypt all the signatures. Return err.'''
  826. if not info.get('decryption_function'):
  827. return 'decryption_function not in info'
  828. for format in info['formats']:
  829. if not format['s'] or not format['sp'] or not format['url']:
  830. print('Warning: s, sp, or url not in format')
  831. continue
  832. a = list(format['s'])
  833. for op, argument in info['decryption_function']:
  834. if op == 0:
  835. a.reverse()
  836. elif op == 1:
  837. a = a[argument:]
  838. else:
  839. _operation_2(a, argument)
  840. signature = ''.join(a)
  841. format['url'] += '&' + format['sp'] + '=' + signature
  842. return False