modify_chapters.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. import copy
  2. import heapq
  3. import os
  4. from .common import PostProcessor
  5. from .ffmpeg import FFmpegPostProcessor, FFmpegSubtitlesConvertorPP
  6. from .sponsorblock import SponsorBlockPP
  7. from ..utils import PostProcessingError, orderedSet, prepend_extension
  # Chapters produced by cutting whose length (end_time - start_time) is below this
  # threshold are merged into a neighbouring chapter by _remove_tiny_rename_sponsors.
  # NOTE(review): presumably expressed in seconds, like the chapter times - confirm.
  _TINY_CHAPTER_DURATION = 1
  # Default value of the `sponsorblock_chapter_title` option; it is evaluated as an
  # output template against each sponsor chapter to build that chapter's title.
  DEFAULT_SPONSORBLOCK_CHAPTER_TITLE = '[SponsorBlock]: %(category_names)l'
  10. class ModifyChaptersPP(FFmpegPostProcessor):
  def __init__(self, downloader, remove_chapters_patterns=None, remove_sponsor_segments=None, remove_ranges=None,
               *, sponsorblock_chapter_title=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, force_keyframes=False):
      """Store the removal criteria used later by run().

      Args:
          downloader: passed through to FFmpegPostProcessor.__init__.
          remove_chapters_patterns: iterable of pattern objects; a chapter is
              removed when any pattern's ``search`` matches its title.
          remove_sponsor_segments: iterable of SponsorBlock category names to cut.
              Categories listed in SponsorBlockPP.NON_SKIPPABLE_CATEGORIES are dropped.
          remove_ranges: iterable of ``(start, end)`` pairs to cut unconditionally.
          sponsorblock_chapter_title: output template used to title sponsor chapters.
          force_keyframes: forwarded to remove_chapters() for the media file
              (never for subtitle files).
      """
      FFmpegPostProcessor.__init__(self, downloader)
      self._remove_chapters_patterns = set(remove_chapters_patterns or [])

      # Non-skippable categories can never be cut, even when explicitly requested.
      requested_categories = set(remove_sponsor_segments or [])
      non_skippable = set(SponsorBlockPP.NON_SKIPPABLE_CATEGORIES.keys())
      self._remove_sponsor_segments = requested_categories - non_skippable

      self._ranges_to_remove = set(remove_ranges or [])
      self._sponsorblock_chapter_title = sponsorblock_chapter_title
      self._force_keyframes = force_keyframes
  @PostProcessor._restrict_to(images=False)
  def run(self, info):
      """Cut the marked chapters out of the downloaded file and its subtitles.

      Updates ``info['chapters']`` (and, when cuts exist, ``info['duration']``)
      to describe the result, and returns ``(files_to_remove, info)`` where
      ``files_to_remove`` lists the ``.uncut`` backups of every processed file.

      Nothing is cut (an empty list is returned) when there are no chapters or
      sponsor chapters, when no cut results from the marking, when the cuts would
      remove everything, or when the file appears to have been cut already.

      Raises:
          PostProcessingError: the real duration matches neither the expected
              original nor the expected cut duration, and ``info['__real_download']``
              is falsy.
      """
      self._fixup_chapters(info)

      # Chapters must be preserved intact when downloading multiple formats of the
      # same video, hence the deep copies.
      own_chapters = copy.deepcopy(info.get('chapters')) or []
      own_sponsors = copy.deepcopy(info.get('sponsorblock_chapters')) or []
      chapters, sponsor_chapters = self._mark_chapters_to_remove(own_chapters, own_sponsors)
      if not (chapters or sponsor_chapters):
          return [], info

      real_duration = self._get_real_video_duration(info['filepath'])
      if not chapters:
          # No chapter information: treat the whole video as one chapter.
          whole_video = {
              'start_time': 0,
              'end_time': info.get('duration') or real_duration,
              'title': info['title'],
          }
          chapters = [whole_video]

      info['chapters'], cuts = self._remove_marked_arrange_sponsors(chapters + sponsor_chapters)
      if not cuts:
          return [], info
      if not info['chapters']:
          self.report_warning('You have requested to remove the entire video, which is not possible')
          return [], info

      original_duration = info.get('duration')
      info['duration'] = info['chapters'][-1]['end_time']
      if self._duration_mismatch(real_duration, original_duration, 1):
          if not self._duration_mismatch(real_duration, info['duration']):
              self.to_screen(f'Skipping {self.pp_key()} since the video appears to be already cut')
              return [], info
          if info.get('__real_download'):
              self.write_debug('Expected and actual durations mismatch')
          else:
              raise PostProcessingError('Cannot cut video since the real and expected durations mismatch. '
                                        'Different chapters may have already been removed')

      concat_opts = self._make_concat_opts(cuts, real_duration)
      concat_spec = ', '.join(
          f'{opt.get("inpoint", 0.0)}-{opt.get("outpoint", "inf")}' for opt in concat_opts)
      self.write_debug(f'Concat spec = {concat_spec}')

      def cut_file(path, is_sub):
          # Keyframes are only ever forced for the media file, not for subtitles.
          keyframes = self._force_keyframes and not is_sub
          return path, self.remove_chapters(path, cuts, concat_opts, keyframes)

      processed = [cut_file(info['filepath'], False)]
      for sub_path in self._get_supported_subs(info):
          processed.append(cut_file(sub_path, True))

      # Renaming should only happen after all files are processed
      files_to_remove = []
      for original_path, cut_path in processed:
          mtime = os.stat(original_path).st_mtime
          backup_path = prepend_extension(original_path, 'uncut')
          os.replace(original_path, backup_path)
          os.replace(cut_path, original_path)
          # Keep the modification time of the original on the cut file.
          self.try_utime(original_path, mtime, mtime)
          files_to_remove.append(backup_path)
      return files_to_remove, info
  63. def _mark_chapters_to_remove(self, chapters, sponsor_chapters):
  64. if self._remove_chapters_patterns:
  65. warn_no_chapter_to_remove = True
  66. if not chapters:
  67. self.to_screen('Chapter information is unavailable')
  68. warn_no_chapter_to_remove = False
  69. for c in chapters:
  70. if any(regex.search(c['title']) for regex in self._remove_chapters_patterns):
  71. c['remove'] = True
  72. warn_no_chapter_to_remove = False
  73. if warn_no_chapter_to_remove:
  74. self.to_screen('There are no chapters matching the regex')
  75. if self._remove_sponsor_segments:
  76. warn_no_chapter_to_remove = True
  77. if not sponsor_chapters:
  78. self.to_screen('SponsorBlock information is unavailable')
  79. warn_no_chapter_to_remove = False
  80. for c in sponsor_chapters:
  81. if c['category'] in self._remove_sponsor_segments:
  82. c['remove'] = True
  83. warn_no_chapter_to_remove = False
  84. if warn_no_chapter_to_remove:
  85. self.to_screen('There are no matching SponsorBlock chapters')
  86. sponsor_chapters.extend({
  87. 'start_time': start,
  88. 'end_time': end,
  89. 'category': 'manually_removed',
  90. '_categories': [('manually_removed', start, end, 'Manually removed')],
  91. 'remove': True,
  92. } for start, end in self._ranges_to_remove)
  93. return chapters, sponsor_chapters
  94. def _get_supported_subs(self, info):
  95. for sub in (info.get('requested_subtitles') or {}).values():
  96. sub_file = sub.get('filepath')
  97. # The file might have been removed by --embed-subs
  98. if not sub_file or not os.path.exists(sub_file):
  99. continue
  100. ext = sub['ext']
  101. if ext not in FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS:
  102. self.report_warning(f'Cannot remove chapters from external {ext} subtitles; "{sub_file}" is now out of sync')
  103. continue
  104. # TODO: create __real_download for subs?
  105. yield sub_file
  def _remove_marked_arrange_sponsors(self, chapters):
      """Separate cuts from the chapters that remain and lay the latter out back to back.

      ``chapters`` is a single list mixing three kinds of dicts, told apart by their keys:
      cuts carry ``'remove'``, sponsor chapters carry ``'_categories'`` (and no ``'remove'``),
      and everything else is a normal chapter. The dicts are mutated while processing.

      The list is swept in order of ``start_time`` using a heap. Overlaps are resolved
      by chopping or splitting chapters; the leftover parts are pushed back onto the heap.

      Returns:
          ``(new_chapters, cuts)``: ``new_chapters`` are the surviving chapters with their
          times rewritten to be consecutive starting at 0 (post-processed by
          ``_remove_tiny_rename_sponsors``), and ``cuts`` are the merged ranges to remove,
          still expressed in the original timeline.

      NOTE(review): the first ``heappop`` requires ``chapters`` to be non-empty; the
      visible caller (``run``) only calls this with a non-empty list.
      """
      # Store cuts separately, since adjacent and overlapping cuts must be merged.
      cuts = []

      def append_cut(c):
          # Append the cut, or merge it into the last one when they touch or overlap.
          # Returns the index of the last cut in `cuts`.
          assert 'remove' in c, 'Not a cut is appended to cuts'
          last_to_cut = cuts[-1] if cuts else None
          if last_to_cut and last_to_cut['end_time'] >= c['start_time']:
              last_to_cut['end_time'] = max(last_to_cut['end_time'], c['end_time'])
          else:
              cuts.append(c)
          return len(cuts) - 1

      def excess_duration(c):
          # Cuts that are completely within the chapter reduce chapters' duration.
          # Since cuts can overlap, excess duration may be less than the sum of cuts' durations.
          # To avoid that, chapter stores the index to the first cut within the chapter,
          # instead of storing excess duration. append_cut ensures that subsequent cuts (if any)
          # will be merged with previous ones (if necessary).
          cut_idx, excess = c.pop('cut_idx', len(cuts)), 0
          while cut_idx < len(cuts):
              cut = cuts[cut_idx]
              if cut['start_time'] >= c['end_time']:
                  break
              if cut['end_time'] > c['start_time']:
                  # Add the length of the intersection of the cut and the chapter.
                  excess += min(cut['end_time'], c['end_time'])
                  excess -= max(cut['start_time'], c['start_time'])
              cut_idx += 1
          return excess

      new_chapters = []

      def append_chapter(c):
          # Append the chapter right after the previous one, shortened by the cuts inside it.
          assert 'remove' not in c, 'Cut is appended to chapters'
          length = c['end_time'] - c['start_time'] - excess_duration(c)
          # Chapter is completely covered by cuts or sponsors.
          if length <= 0:
              return
          start = new_chapters[-1]['end_time'] if new_chapters else 0
          c.update(start_time=start, end_time=start + length)
          new_chapters.append(c)

      # Turn into a priority queue, index is a tie breaker.
      # Plain stack sorted by start_time is not enough: after splitting the chapter,
      # the part returned to the stack is not guaranteed to have start_time
      # less than or equal to that of the stack's head.
      chapters = [(c['start_time'], i, c) for i, c in enumerate(chapters)]
      heapq.heapify(chapters)

      # cur_chapter is the pending item; c is the next item by start_time.
      _, cur_i, cur_chapter = heapq.heappop(chapters)
      while chapters:
          _, i, c = heapq.heappop(chapters)
          # Non-overlapping chapters or cuts can be appended directly. However,
          # adjacent non-overlapping cuts must be merged, which is handled by append_cut.
          if cur_chapter['end_time'] <= c['start_time']:
              (append_chapter if 'remove' not in cur_chapter else append_cut)(cur_chapter)
              cur_i, cur_chapter = i, c
              continue

          # Eight possibilities for overlapping chapters: (cut, cut), (cut, sponsor),
          # (cut, normal), (sponsor, cut), (normal, cut), (sponsor, sponsor),
          # (sponsor, normal), and (normal, sponsor). There is no (normal, normal):
          # normal chapters are assumed not to overlap.
          if 'remove' in cur_chapter:
              # (cut, cut): adjust end_time.
              if 'remove' in c:
                  cur_chapter['end_time'] = max(cur_chapter['end_time'], c['end_time'])
              # (cut, sponsor/normal): chop the beginning of the later chapter
              # (if it's not completely hidden by the cut). Push to the priority queue
              # to restore sorting by start_time: with beginning chopped, c may actually
              # start later than the remaining chapters from the queue.
              elif cur_chapter['end_time'] < c['end_time']:
                  c['start_time'] = cur_chapter['end_time']
                  c['_was_cut'] = True
                  heapq.heappush(chapters, (c['start_time'], i, c))
          # (sponsor/normal, cut).
          elif 'remove' in c:
              cur_chapter['_was_cut'] = True
              # Chop the end of the current chapter if the cut is not contained within it.
              # Chopping the end doesn't break start_time sorting, no PQ push is necessary.
              if cur_chapter['end_time'] <= c['end_time']:
                  cur_chapter['end_time'] = c['start_time']
                  append_chapter(cur_chapter)
                  cur_i, cur_chapter = i, c
                  continue

              # Current chapter contains the cut within it. If the current chapter is
              # a sponsor chapter, check whether the categories before and after the cut differ.
              if '_categories' in cur_chapter:
                  after_c = dict(cur_chapter, start_time=c['end_time'], _categories=[])
                  cur_cats = []
                  # Each entry is (category, start, end, name): keep those starting before
                  # the cut for the current part and those ending after it for the rest.
                  for cat_start_end in cur_chapter['_categories']:
                      if cat_start_end[1] < c['start_time']:
                          cur_cats.append(cat_start_end)
                      if cat_start_end[2] > c['end_time']:
                          after_c['_categories'].append(cat_start_end)
                  cur_chapter['_categories'] = cur_cats
                  if cur_chapter['_categories'] != after_c['_categories']:
                      # Categories before and after the cut differ: push the after part to PQ.
                      heapq.heappush(chapters, (after_c['start_time'], cur_i, after_c))
                      cur_chapter['end_time'] = c['start_time']
                      append_chapter(cur_chapter)
                      cur_i, cur_chapter = i, c
                      continue
              # Either sponsor categories before and after the cut are the same or
              # we're dealing with a normal chapter. Just register an outstanding cut:
              # subsequent append_chapter will reduce the duration.
              cur_chapter.setdefault('cut_idx', append_cut(c))
          # (sponsor, normal): if a normal chapter is not completely overlapped,
          # chop the beginning of it and push it to PQ.
          elif '_categories' in cur_chapter and '_categories' not in c:
              if cur_chapter['end_time'] < c['end_time']:
                  c['start_time'] = cur_chapter['end_time']
                  c['_was_cut'] = True
                  heapq.heappush(chapters, (c['start_time'], i, c))
          # (normal, sponsor) and (sponsor, sponsor)
          else:
              assert '_categories' in c, 'Normal chapters overlap'
              cur_chapter['_was_cut'] = True
              c['_was_cut'] = True
              # Push the part after the sponsor to PQ.
              if cur_chapter['end_time'] > c['end_time']:
                  # deepcopy to make categories in after_c and cur_chapter/c refer to different lists.
                  after_c = dict(copy.deepcopy(cur_chapter), start_time=c['end_time'])
                  heapq.heappush(chapters, (after_c['start_time'], cur_i, after_c))
              # Push the part after the overlap to PQ.
              elif c['end_time'] > cur_chapter['end_time']:
                  after_cur = dict(copy.deepcopy(c), start_time=cur_chapter['end_time'])
                  heapq.heappush(chapters, (after_cur['start_time'], cur_i, after_cur))
                  c['end_time'] = cur_chapter['end_time']
              # (sponsor, sponsor): merge categories in the overlap.
              if '_categories' in cur_chapter:
                  c['_categories'] = cur_chapter['_categories'] + c['_categories']
              # Inherit the cuts that the current chapter has accumulated within it.
              if 'cut_idx' in cur_chapter:
                  c['cut_idx'] = cur_chapter['cut_idx']
              cur_chapter['end_time'] = c['start_time']
              append_chapter(cur_chapter)
              cur_i, cur_chapter = i, c
      # Flush the last pending item.
      (append_chapter if 'remove' not in cur_chapter else append_cut)(cur_chapter)
      return self._remove_tiny_rename_sponsors(new_chapters), cuts
  def _remove_tiny_rename_sponsors(self, chapters):
      """Merge tiny leftovers into their neighbours and give sponsor chapters their titles.

      A chapter is merged away only if it is shorter than ``_TINY_CHAPTER_DURATION``
      and is either marked ``'_was_cut'`` or is a sponsor chapter (``'_categories'``);
      chapters that were already tiny in the original list are preserved.

      For every kept sponsor chapter the internal ``'_categories'`` list is replaced
      by the public ``'category'``, ``'categories'``, ``'name'`` and
      ``'category_names'`` keys, and ``'title'`` is rendered from the configured
      template. Consecutive sponsor chapters with the same title are merged.

      The dicts in ``chapters`` are mutated; a new list is returned.
      """
      result = []
      last_index = len(chapters) - 1
      for idx, chapter in enumerate(chapters):
          is_sponsor = '_categories' in chapter
          is_tiny = (('_was_cut' in chapter or is_sponsor)
                     and chapter['end_time'] - chapter['start_time'] < _TINY_CHAPTER_DURATION)
          if is_tiny:
              has_next = idx < last_index
              if result:
                  previous = result[-1]
                  if has_next:
                      following = chapters[idx + 1]
                      # Not a typo: already processed chapters carry 'categories',
                      # not yet processed ones still carry '_categories'.
                      prev_is_sponsor = 'categories' in previous
                      next_is_sponsor = '_categories' in following
                      # Preferentially prepend tiny normals to normals and sponsors to sponsors.
                      if is_sponsor:
                          join_next = next_is_sponsor and not prev_is_sponsor
                      else:
                          join_next = prev_is_sponsor and not next_is_sponsor
                      if join_next:
                          following['start_time'] = chapter['start_time']
                          continue
                  # Otherwise extend the previous chapter over the tiny one.
                  previous['end_time'] = chapter['end_time']
                  continue
              if has_next:
                  # Nothing before it: prepend the tiny chapter to the next one.
                  chapters[idx + 1]['start_time'] = chapter['start_time']
                  continue
              # A tiny chapter with no neighbour to merge into is kept as is.

          chapter.pop('_was_cut', None)
          categories = chapter.pop('_categories', None)
          if categories:
              # The shortest category entry decides the chapter's primary category.
              shortest = min(categories, key=lambda entry: entry[2] - entry[1])
              category, _, _, category_name = shortest
              chapter.update(
                  category=category,
                  categories=orderedSet(entry[0] for entry in categories),
                  name=category_name,
                  category_names=orderedSet(entry[3] for entry in categories),
              )
              chapter['title'] = self._downloader.evaluate_outtmpl(self._sponsorblock_chapter_title, chapter.copy())
              # Merge identically named sponsors.
              if result and 'categories' in result[-1] and result[-1]['title'] == chapter['title']:
                  result[-1]['end_time'] = chapter['end_time']
                  continue
          result.append(chapter)
      return result
  def remove_chapters(self, filename, ranges_to_cut, concat_opts, force_keyframes=False):
      """Write a copy of ``filename`` with the given ranges removed.

      The output is produced by concatenating ``len(concat_opts)`` references to the
      input, each paired with one entry of ``concat_opts``. When ``force_keyframes``
      is true, the input is first passed through ``self.force_keyframes`` with every
      cut boundary, and that intermediate file is deleted afterwards.

      Returns:
          Path of the produced file (``filename`` with ``temp`` prepended to its extension).
      """
      out_file = prepend_extension(filename, 'temp')
      source = filename
      if force_keyframes:
          boundaries = (t for cut in ranges_to_cut for t in (cut['start_time'], cut['end_time']))
          source = self.force_keyframes(filename, boundaries)
      self.to_screen(f'Removing chapters from {filename}')
      self.concat_files([source] * len(concat_opts), out_file, concat_opts)
      # Only an intermediate file created above is removed, never the original.
      if source != filename:
          self._delete_downloaded_files(source, msg=None)
      return out_file
  294. @staticmethod
  295. def _make_concat_opts(chapters_to_remove, duration):
  296. opts = [{}]
  297. for s in chapters_to_remove:
  298. # Do not create 0 duration chunk at the beginning.
  299. if s['start_time'] == 0:
  300. opts[-1]['inpoint'] = f'{s["end_time"]:.6f}'
  301. continue
  302. opts[-1]['outpoint'] = f'{s["start_time"]:.6f}'
  303. # Do not create 0 duration chunk at the end.
  304. if s['end_time'] < duration:
  305. opts.append({'inpoint': f'{s["end_time"]:.6f}'})
  306. return opts