from youtube import util, yt_data_extract, channel, local_playlist, playlist
from youtube import yt_app
import settings

import sqlite3
import os
import time
import gevent
import gevent.queue  # gevent.queue.Queue and gevent.queue.Empty are used below
import json
import traceback
import contextlib
import defusedxml.ElementTree
import urllib.error  # for urllib.error.HTTPError in serve_subscription_thumbnail
import math
import secrets
import collections
import calendar  # bullshit! https://bugs.python.org/issue6280
import csv
import re

import flask
from flask import request


thumbnails_directory = os.path.join(settings.data_dir, "subscription_thumbnails")

# https://stackabuse.com/a-sqlite-tutorial-with-python/
database_path = os.path.join(settings.data_dir, "subscriptions.sqlite")


def open_database():
    if not os.path.exists(settings.data_dir):
        os.makedirs(settings.data_dir)
    connection = sqlite3.connect(database_path, check_same_thread=False)

    try:
        cursor = connection.cursor()
        cursor.execute('''PRAGMA foreign_keys = 1''')
        # Create tables if they don't exist
        cursor.execute('''CREATE TABLE IF NOT EXISTS subscribed_channels (
                              id integer PRIMARY KEY,
                              yt_channel_id text UNIQUE NOT NULL,
                              channel_name text NOT NULL,
                              time_last_checked integer DEFAULT 0,
                              next_check_time integer DEFAULT 0,
                              muted integer DEFAULT 0
                          )''')
        cursor.execute('''CREATE TABLE IF NOT EXISTS videos (
                              id integer PRIMARY KEY,
                              sql_channel_id integer NOT NULL REFERENCES subscribed_channels(id) ON UPDATE CASCADE ON DELETE CASCADE,
                              video_id text UNIQUE NOT NULL,
                              title text NOT NULL,
                              duration text,
                              time_published integer NOT NULL,
                              is_time_published_exact integer DEFAULT 0,
                              time_noticed integer NOT NULL,
                              description text,
                              watched integer DEFAULT 0
                          )''')
        cursor.execute('''CREATE TABLE IF NOT EXISTS tag_associations (
                              id integer PRIMARY KEY,
                              tag text NOT NULL,
                              sql_channel_id integer NOT NULL REFERENCES subscribed_channels(id) ON UPDATE CASCADE ON DELETE CASCADE,
                              UNIQUE(tag, sql_channel_id)
                          )''')
        cursor.execute('''CREATE TABLE IF NOT EXISTS db_info (
                              version integer DEFAULT 1
                          )''')
        connection.commit()
    except:
        connection.rollback()
        connection.close()
        raise

    # https://stackoverflow.com/questions/19522505/using-sqlite3-in-python-with-with-keyword
    return contextlib.closing(connection)


def with_open_db(function, *args, **kwargs):
    with open_database() as connection:
        with connection as cursor:
            return function(cursor, *args, **kwargs)
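
# Note: sqlite3's connection context manager commits on success and rolls back
# on exception, and Connection.execute() is a cursor-creating shortcut, so the
# object bound to `cursor` above is really the connection itself. Illustrative
# usage (channel id assumed):
#     is_subbed = with_open_db(_is_subscribed, 'UCuAXFkgsw1L7xaCfnd5JJOw')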


def _is_subscribed(cursor, channel_id):
    result = cursor.execute('''SELECT EXISTS(
                                   SELECT 1
                                   FROM subscribed_channels
                                   WHERE yt_channel_id=?
                                   LIMIT 1
                               )''', [channel_id]).fetchone()
    return bool(result[0])


def is_subscribed(channel_id):
    if not os.path.exists(database_path):
        return False

    return with_open_db(_is_subscribed, channel_id)


def _subscribe(channels):
    ''' channels is a list of (channel_id, channel_name) '''
    channels = list(channels)
    with open_database() as connection:
        with connection as cursor:
            channel_ids_to_check = [channel[0] for channel in channels if not _is_subscribed(cursor, channel[0])]

            rows = ((channel_id, channel_name, 0, 0) for channel_id, channel_name in channels)
            cursor.executemany('''INSERT OR IGNORE INTO subscribed_channels (yt_channel_id, channel_name, time_last_checked, next_check_time)
                                  VALUES (?, ?, ?, ?)''', rows)

    if settings.autocheck_subscriptions:
        # important that this is after the changes have been committed to the database,
        # otherwise the autochecker (other thread) tries checking the channel
        # before it's in the database
        channel_names.update(channels)
        check_channels_if_necessary(channel_ids_to_check)


def delete_thumbnails(to_delete):
    for thumbnail in to_delete:
        try:
            video_id = thumbnail[0:-4]
            if video_id in existing_thumbnails:
                os.remove(os.path.join(thumbnails_directory, thumbnail))
                existing_thumbnails.remove(video_id)
        except Exception:
            print('Failed to delete thumbnail: ' + thumbnail)
            traceback.print_exc()


def _unsubscribe(cursor, channel_ids):
    ''' channel_ids is a list of channel_ids '''
    to_delete = []
    for channel_id in channel_ids:
        rows = cursor.execute('''SELECT video_id
                                 FROM videos
                                 WHERE sql_channel_id = (
                                     SELECT id
                                     FROM subscribed_channels
                                     WHERE yt_channel_id=?
                                 )''', (channel_id,)).fetchall()
        to_delete += [row[0] + '.jpg' for row in rows]
    gevent.spawn(delete_thumbnails, to_delete)

    cursor.executemany("DELETE FROM subscribed_channels WHERE yt_channel_id=?",
                       ((channel_id, ) for channel_id in channel_ids))


def _get_videos(cursor, number_per_page, offset, tag=None):
    '''Returns a full page of videos with an offset, and a value good enough to be used as the total number of videos'''
    # We ask the database for the next 9 pages' worth of results.
    # The actual length of what comes back tells us whether there are more
    # than 9 pages left, and if not, how many pages there actually are.
    # This is done because only 9 page buttons are on display at a time.
    # If there are more than 9 pages left, we return a fake value in place of
    # the real number of results an unlimited query would give; the fake value
    # is sufficient to make the page-button generation macro display 9 buttons.
    # If we wish to display more buttons, this logic must change.
    # We cannot use tricks with the sql id for the video, since we frequently
    # have filters and other restrictions in place on the results anyway.
    # TODO: This is probably not the ideal solution
    if tag is not None:
        db_videos = cursor.execute('''SELECT video_id, title, duration, time_published, is_time_published_exact, channel_name, yt_channel_id
                                      FROM videos
                                      INNER JOIN subscribed_channels ON videos.sql_channel_id = subscribed_channels.id
                                      INNER JOIN tag_associations ON videos.sql_channel_id = tag_associations.sql_channel_id
                                      WHERE tag = ? AND muted = 0
                                      ORDER BY time_noticed DESC, time_published DESC
                                      LIMIT ? OFFSET ?''', (tag, number_per_page*9, offset)).fetchall()
    else:
        db_videos = cursor.execute('''SELECT video_id, title, duration, time_published, is_time_published_exact, channel_name, yt_channel_id
                                      FROM videos
                                      INNER JOIN subscribed_channels ON videos.sql_channel_id = subscribed_channels.id
                                      WHERE muted = 0
                                      ORDER BY time_noticed DESC, time_published DESC
                                      LIMIT ? OFFSET ?''', (number_per_page*9, offset)).fetchall()

    pseudo_number_of_videos = offset + len(db_videos)

    videos = []
    for db_video in db_videos[0:number_per_page]:
        videos.append({
            'id': db_video[0],
            'title': db_video[1],
            'duration': db_video[2],
            'time_published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]),
            'author': db_video[5],
            'author_id': db_video[6],
            'author_url': '/https://www.youtube.com/channel/' + db_video[6],
        })

    return videos, pseudo_number_of_videos
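
# Illustration of the pseudo-count above (numbers assumed): with 60 videos per
# page and offset 0, the query fetches at most 540 rows. If only 130 exist,
# pseudo_number_of_videos is 130 (3 pages); if all 540 come back, the paging
# macro sees at least 9 pages' worth and draws all 9 buttons.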


def _get_subscribed_channels(cursor):
    for item in cursor.execute('''SELECT channel_name, yt_channel_id, muted
                                  FROM subscribed_channels
                                  ORDER BY channel_name COLLATE NOCASE'''):
        yield item


def _add_tags(cursor, channel_ids, tags):
    pairs = [(tag, yt_channel_id) for tag in tags for yt_channel_id in channel_ids]
    cursor.executemany('''INSERT OR IGNORE INTO tag_associations (tag, sql_channel_id)
                          SELECT ?, id FROM subscribed_channels WHERE yt_channel_id = ?''', pairs)


def _remove_tags(cursor, channel_ids, tags):
    pairs = [(tag, yt_channel_id) for tag in tags for yt_channel_id in channel_ids]
    cursor.executemany('''DELETE FROM tag_associations
                          WHERE tag = ? AND sql_channel_id = (
                              SELECT id FROM subscribed_channels WHERE yt_channel_id = ?
                          )''', pairs)


def _get_tags(cursor, channel_id):
    return [row[0] for row in cursor.execute('''SELECT tag
                                                FROM tag_associations
                                                WHERE sql_channel_id = (
                                                    SELECT id FROM subscribed_channels WHERE yt_channel_id = ?
                                                )''', (channel_id,))]


def _get_all_tags(cursor):
    return [row[0] for row in cursor.execute('''SELECT DISTINCT tag FROM tag_associations''')]


def _get_channel_names(cursor, channel_ids):
    ''' returns list of (channel_id, channel_name) '''
    result = []
    for channel_id in channel_ids:
        row = cursor.execute('''SELECT channel_name
                                FROM subscribed_channels
                                WHERE yt_channel_id = ?''', (channel_id,)).fetchone()
        result.append((channel_id, row[0]))
    return result


def _channels_with_tag(cursor, tag, order=False, exclude_muted=False, include_muted_status=False):
    ''' returns list of (channel_id, channel_name) '''
    statement = '''SELECT yt_channel_id, channel_name'''
    if include_muted_status:
        statement += ''', muted'''
    statement += '''
        FROM subscribed_channels
        WHERE subscribed_channels.id IN (
            SELECT tag_associations.sql_channel_id FROM tag_associations WHERE tag=?
        )
    '''
    if exclude_muted:
        statement += '''AND muted != 1\n'''
    if order:
        statement += '''ORDER BY channel_name COLLATE NOCASE'''

    return cursor.execute(statement, [tag]).fetchall()


def _schedule_checking(cursor, channel_id, next_check_time):
    cursor.execute('''UPDATE subscribed_channels SET next_check_time = ? WHERE yt_channel_id = ?''',
                   [int(next_check_time), channel_id])


def _is_muted(cursor, channel_id):
    return bool(cursor.execute('''SELECT muted FROM subscribed_channels WHERE yt_channel_id=?''',
                               [channel_id]).fetchone()[0])


units = collections.OrderedDict([
    ('year', 31536000),   # 365*24*3600
    ('month', 2592000),   # 30*24*3600
    ('week', 604800),     # 7*24*3600
    ('day', 86400),       # 24*3600
    ('hour', 3600),
    ('minute', 60),
    ('second', 1),
])


def youtube_timestamp_to_posix(dumb_timestamp):
    ''' Given a dumbed down timestamp such as 1 year ago, 3 hours ago,
        approximates the unix time (seconds since 1/1/1970) '''
    dumb_timestamp = dumb_timestamp.lower()
    now = time.time()
    if dumb_timestamp == "just now":
        return now
    split = dumb_timestamp.split(' ')
    quantifier, unit = int(split[0]), split[1]
    if quantifier > 1:
        unit = unit[:-1]  # remove s from end
    return now - quantifier*units[unit]
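
# Illustrative example: with now == 1700000000,
# youtube_timestamp_to_posix('3 hours ago') returns 1700000000 - 3*3600 == 1699989200.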


def posix_to_dumbed_down(posix_time):
    '''Inverse of youtube_timestamp_to_posix.'''
    delta = int(time.time() - posix_time)
    assert delta >= 0

    if delta == 0:
        return '0 seconds ago'

    for unit_name, unit_time in units.items():
        if delta >= unit_time:
            quantifier = round(delta/unit_time)
            if quantifier == 1:
                return '1 ' + unit_name + ' ago'
            else:
                return str(quantifier) + ' ' + unit_name + 's ago'
    else:
        raise Exception()
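
# Illustrative example: posix_to_dumbed_down(time.time() - 7200) == '2 hours ago'.
# Note the rounding: a delta of 90 minutes also gives '2 hours ago'.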


def exact_timestamp(posix_time):
    result = time.strftime('%I:%M %p %m/%d/%y', time.localtime(posix_time))
    if result[0] == '0':  # remove 0 in front of hour (like 01:00 PM)
        return result[1:]
    return result
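
# Illustrative example (local time assumed): a posix time falling at 9:05 PM
# on 2023-06-15 formats as '09:05 PM 06/15/23' and is returned as '9:05 PM 06/15/23'.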


try:
    existing_thumbnails = set(os.path.splitext(name)[0] for name in os.listdir(thumbnails_directory))
except FileNotFoundError:
    existing_thumbnails = set()


# --- Manual checking system. Rate limited in order to support very large numbers of channels to be checked ---
# The auto checking system plugs into this for convenience, though it doesn't really need the rate limiting

check_channels_queue = util.RateLimitedQueue()
checking_channels = set()

# Just used for printing channel checking status to the console without opening the database
channel_names = dict()


def check_channel_worker():
    while True:
        channel_id = check_channels_queue.get()
        try:
            _get_upstream_videos(channel_id)
        except Exception:
            traceback.print_exc()
        finally:
            checking_channels.remove(channel_id)


for i in range(0, 5):
    gevent.spawn(check_channel_worker)
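
# Five worker greenlets drain the rate-limited queue concurrently. The
# checking_channels set is the de-duplication guard: a channel id is only
# queued when it isn't already in the set (see check_channels_if_necessary
# below), and the worker removes it once the check finishes.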
# ----------------------------


# --- Auto checking system - Spaghetti code ---
def autocheck_dispatcher():
    '''Scans the auto_check_list. Sleeps until the earliest job is due, then
       adds that channel to the checking queue above. Can be sent a new job
       through autocheck_job_application'''
    while True:
        if len(autocheck_jobs) == 0:
            new_job = autocheck_job_application.get()
            autocheck_jobs.append(new_job)
        else:
            earliest_job_index = min(range(0, len(autocheck_jobs)), key=lambda index: autocheck_jobs[index]['next_check_time'])  # https://stackoverflow.com/a/11825864
            earliest_job = autocheck_jobs[earliest_job_index]
            time_until_earliest_job = earliest_job['next_check_time'] - time.time()

            if time_until_earliest_job <= -5:  # should not happen unless we're running extremely slow
                print('ERROR: autocheck_dispatcher got job scheduled in the past, skipping and rescheduling: ' + earliest_job['channel_id'] + ', ' + earliest_job['channel_name'] + ', ' + str(earliest_job['next_check_time']))
                next_check_time = time.time() + 3600*secrets.randbelow(60)/60
                with_open_db(_schedule_checking, earliest_job['channel_id'], next_check_time)
                autocheck_jobs[earliest_job_index]['next_check_time'] = next_check_time
                continue

            # make sure it's not muted
            if with_open_db(_is_muted, earliest_job['channel_id']):
                del autocheck_jobs[earliest_job_index]
                continue

            if time_until_earliest_job > 0:  # it can become less than zero (in the past) when it's set to go off while the dispatcher is doing something else at that moment
                try:
                    # sleep for time_until_earliest_job, but allow being interrupted by new jobs
                    new_job = autocheck_job_application.get(timeout=time_until_earliest_job)
                except gevent.queue.Empty:  # no new jobs
                    pass
                else:  # new job, add it to the list
                    autocheck_jobs.append(new_job)
                    continue

            # no new jobs, time to execute the earliest job
            channel_names[earliest_job['channel_id']] = earliest_job['channel_name']
            checking_channels.add(earliest_job['channel_id'])
            check_channels_queue.put(earliest_job['channel_id'])
            del autocheck_jobs[earliest_job_index]


dispatcher_greenlet = None


def start_autocheck_system():
    global autocheck_job_application
    global autocheck_jobs
    global dispatcher_greenlet

    # job application format: dict with keys (channel_id, channel_name, next_check_time)
    autocheck_job_application = gevent.queue.Queue()  # only really meant to hold 1 item, just reusing gevent's wait and timeout machinery

    autocheck_jobs = []  # list of dicts with the keys (channel_id, channel_name, next_check_time). Stores all the channels that need to be autochecked and when to check them
    with open_database() as connection:
        with connection as cursor:
            now = time.time()
            for row in cursor.execute('''SELECT yt_channel_id, channel_name, next_check_time
                                         FROM subscribed_channels
                                         WHERE muted != 1''').fetchall():
                if row[2] is None:
                    next_check_time = 0
                else:
                    next_check_time = row[2]

                # expired, check randomly within the next hour
                # note: even if it isn't scheduled in the past right now, it might
                # end up being if it's due soon and we don't start dispatching by
                # then; see autocheck_dispatcher above, where time_until_earliest_job
                # is negative
                if next_check_time < now:
                    next_check_time = now + 3600*secrets.randbelow(60)/60
                    row = (row[0], row[1], next_check_time)
                    _schedule_checking(cursor, row[0], next_check_time)
                autocheck_jobs.append({'channel_id': row[0], 'channel_name': row[1], 'next_check_time': next_check_time})
    dispatcher_greenlet = gevent.spawn(autocheck_dispatcher)


def stop_autocheck_system():
    if dispatcher_greenlet is not None:
        dispatcher_greenlet.kill()


def autocheck_setting_changed(old_value, new_value):
    if new_value:
        start_autocheck_system()
    else:
        stop_autocheck_system()


settings.add_setting_changed_hook(
    'autocheck_subscriptions',
    autocheck_setting_changed
)
if settings.autocheck_subscriptions:
    start_autocheck_system()
# ----------------------------


def check_channels_if_necessary(channel_ids):
    for channel_id in channel_ids:
        if channel_id not in checking_channels:
            checking_channels.add(channel_id)
            check_channels_queue.put(channel_id)


def _get_atoma_feed(channel_id):
    url = 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id
    try:
        return util.fetch_url(url).decode('utf-8')
    except util.FetchError as e:
        # 404 is expected for terminated channels
        if e.code in ('404', '429'):
            return ''
        if e.code == '502':
            return str(e)
        raise
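
# Returning a non-XML string for 502 errors (rather than raising) lets
# _get_upstream_videos proceed with channel-page data alone: parsing the
# error text simply fails and lands in its ParseError handler.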


def _get_channel_videos_first_page(channel_id, channel_status_name):
    try:
        # First try the playlist method
        pl_json = playlist.get_videos(
            'UU' + channel_id[2:],
            1,
            include_shorts=settings.include_shorts_in_subscriptions,
            report_text=None
        )
        pl_info = yt_data_extract.extract_playlist_info(pl_json)
        if pl_info.get('items'):
            pl_info['items'] = pl_info['items'][0:30]
            return pl_info

        # Fall back to the channel api method
        channel_json = channel.get_channel_first_page(channel_id=channel_id)
        channel_info = yt_data_extract.extract_channel_info(
            json.loads(channel_json), 'videos'
        )
        return channel_info
    except util.FetchError as e:
        if e.code == '429' and settings.route_tor:
            error_message = ('Error checking channel ' + channel_status_name
                             + ': YouTube blocked the request because the'
                             + ' Tor exit node is overutilized. Try getting a new exit node'
                             + ' by using the New Identity button in the Tor Browser.')
            if e.ip:
                error_message += ' Exit node IP address: ' + e.ip
            print(error_message)
            return None
        elif e.code == '502':
            print('Error checking channel', channel_status_name + ':', str(e))
            return None
        raise


def _get_upstream_videos(channel_id):
    try:
        channel_status_name = channel_names[channel_id]
    except KeyError:
        channel_status_name = channel_id

    print("Checking channel: " + channel_status_name)

    tasks = (
        # channel page, needed for video durations
        gevent.spawn(_get_channel_videos_first_page, channel_id,
                     channel_status_name),
        # need atoma feed for exact published time
        gevent.spawn(_get_atoma_feed, channel_id)
    )
    gevent.joinall(tasks)
    channel_info, feed = tasks[0].value, tasks[1].value
    # extract published times from atoma feed
    times_published = {}
    try:
        def remove_bullshit(tag):
            '''Remove XML namespace bullshit from tagname. https://bugs.python.org/issue18304'''
            if '}' in tag:
                return tag[tag.rfind('}')+1:]
            return tag

        def find_element(base, tag_name):
            for element in base:
                if remove_bullshit(element.tag) == tag_name:
                    return element
            return None

        root = defusedxml.ElementTree.fromstring(feed)
        assert remove_bullshit(root.tag) == 'feed'
        for entry in root:
            if remove_bullshit(entry.tag) != 'entry':
                continue

            # it's yt:videoId in the xml, but the yt: is turned into a namespace which is removed by remove_bullshit
            video_id_element = find_element(entry, 'videoId')
            time_published_element = find_element(entry, 'published')
            assert video_id_element is not None
            assert time_published_element is not None

            time_published = int(calendar.timegm(time.strptime(time_published_element.text, '%Y-%m-%dT%H:%M:%S+00:00')))
            times_published[video_id_element.text] = time_published
    except AssertionError:
        print('Failed to read atoma feed for ' + channel_status_name)
        traceback.print_exc()
    except defusedxml.ElementTree.ParseError:
        print('Failed to read atoma feed for ' + channel_status_name)
    if channel_info is None:  # there was an error
        return
    if channel_info['error']:
        print('Error checking channel ' + channel_status_name + ': ' + channel_info['error'])
        return

    videos = channel_info['items']
    for i, video_item in enumerate(videos):
        if not video_item.get('description'):
            video_item['description'] = ''
        else:
            video_item['description'] = ''.join(run.get('text', '') for run in video_item['description'])

        if video_item['id'] in times_published:
            video_item['time_published'] = times_published[video_item['id']]
            video_item['is_time_published_exact'] = True
        elif video_item.get('time_published'):
            video_item['is_time_published_exact'] = False
            try:
                # subtract a few seconds off the videos so they will be in the right order
                video_item['time_published'] = youtube_timestamp_to_posix(video_item['time_published']) - i
            except Exception:
                print(video_item)
        else:
            video_item['is_time_published_exact'] = False
            video_item['time_published'] = None
        video_item['channel_id'] = channel_id

    if len(videos) > 1:
        # Go back and fill in any videos that don't have a time published
        # using the time published of the surrounding ones
        for i in range(len(videos)-1):
            if (videos[i+1]['time_published'] is None
                    and videos[i]['time_published'] is not None):
                videos[i+1]['time_published'] = videos[i]['time_published'] - 1
        for i in reversed(range(1, len(videos))):
            if (videos[i-1]['time_published'] is None
                    and videos[i]['time_published'] is not None):
                videos[i-1]['time_published'] = videos[i]['time_published'] + 1

    # Special case: none of the videos have a time published.
    # In this case, make something up
    if videos and videos[0]['time_published'] is None:
        assert all(v['time_published'] is None for v in videos)
        now = time.time()
        for i in range(len(videos)):
            # 1 month between videos
            videos[i]['time_published'] = now - i*3600*24*30

    if len(videos) == 0:
        average_upload_period = 4*7*24*3600  # assume 1 month for channel with no videos
    elif len(videos) < 5:
        average_upload_period = int((time.time() - videos[len(videos)-1]['time_published'])/len(videos))
    else:
        # equivalent to averaging the time between videos for the last 5 videos
        average_upload_period = int((time.time() - videos[4]['time_published'])/5)

    # calculate when to check next for auto checking
    # add some quantization and randomness to make pattern analysis by YouTube slightly harder
    quantized_upload_period = average_upload_period - (average_upload_period % (4*3600)) + 4*3600  # round up to the nearest 4 hours
    randomized_upload_period = quantized_upload_period*(1 + secrets.randbelow(50)/50*0.5)  # randomly between 1x and 1.5x
    next_check_delay = randomized_upload_period/10  # check at 10x the channel posting rate. might want to fine tune this number
    next_check_time = int(time.time() + next_check_delay)
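
    # Worked example with assumed numbers: a channel uploading every ~3 days
    # gives average_upload_period = 259200. Quantization rounds that up to
    # 273600 (76 hours), randomization scales it to somewhere in
    # [273600, ~407700], and dividing by 10 yields a next check in roughly
    # 7.6 to 11.3 hours.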
    with open_database() as connection:
        with connection as cursor:
            # Get the video ids and durations of existing videos so we can
            # see how many new ones there are, and update
            # livestreams/premieres
            existing_vids = list(cursor.execute(
                '''SELECT video_id, duration
                   FROM videos
                   INNER JOIN subscribed_channels
                       ON videos.sql_channel_id = subscribed_channels.id
                   WHERE yt_channel_id=?
                   ORDER BY time_published DESC
                   LIMIT 30''', [channel_id]).fetchall())
            existing_vid_ids = set(row[0] for row in existing_vids)
            existing_durs = dict(existing_vids)

            # number of new videos the channel has uploaded since the last time we checked
            number_of_new_videos = 0
            for video in videos:
                if video['id'] in existing_vid_ids:
                    break
                number_of_new_videos += 1
            is_first_check = cursor.execute('''SELECT time_last_checked FROM subscribed_channels WHERE yt_channel_id=?''', [channel_id]).fetchone()[0] in (None, 0)
            time_videos_retrieved = int(time.time())
            rows = []
            update_rows = []
            for i, video_item in enumerate(videos):
                if (is_first_check
                        or number_of_new_videos > 6
                        or i >= number_of_new_videos):
                    # Don't create a crazy ordering on the first check, or on
                    # a check after a long time, since we're ordering by
                    # time_noticed. The last condition handles the case where
                    # the channel deleting videos causes new videos to appear
                    # at the end of the backlog. For instance, if we have 30
                    # vids in the DB, and 1 vid that we previously saw has
                    # since been deleted, then a video we haven't seen before
                    # will appear as the 30th. Don't want this to be
                    # considered a newly noticed vid, which would appear at
                    # the top of the subscriptions feed.
                    time_noticed = video_item['time_published']
                else:
                    time_noticed = time_videos_retrieved

                # videos which need durations updated
                non_durations = ('upcoming', 'none', 'live', '')
                v_id = video_item['id']
                if (existing_durs.get(v_id) is not None
                        and existing_durs[v_id].lower() in non_durations
                        and video_item['duration'] not in non_durations):
                    update_rows.append((
                        video_item['title'],
                        video_item['duration'],
                        video_item['time_published'],
                        video_item['is_time_published_exact'],
                        video_item['description'],
                        video_item['id'],
                    ))
                # all other videos
                else:
                    rows.append((
                        video_item['channel_id'],
                        video_item['id'],
                        video_item['title'],
                        video_item['duration'],
                        video_item['time_published'],
                        video_item['is_time_published_exact'],
                        time_noticed,
                        video_item['description'],
                    ))
            cursor.executemany('''INSERT OR IGNORE INTO videos (
                                      sql_channel_id,
                                      video_id,
                                      title,
                                      duration,
                                      time_published,
                                      is_time_published_exact,
                                      time_noticed,
                                      description
                                  )
                                  VALUES ((SELECT id FROM subscribed_channels WHERE yt_channel_id=?), ?, ?, ?, ?, ?, ?, ?)''', rows)
            cursor.executemany('''UPDATE videos SET
                                      title=?,
                                      duration=?,
                                      time_published=?,
                                      is_time_published_exact=?,
                                      description=?
                                  WHERE video_id=?''', update_rows)
            cursor.execute('''UPDATE subscribed_channels
                              SET time_last_checked = ?, next_check_time = ?
                              WHERE yt_channel_id=?''', [int(time.time()), next_check_time, channel_id])

            if settings.autocheck_subscriptions:
                if not _is_muted(cursor, channel_id):
                    autocheck_job_application.put({'channel_id': channel_id, 'channel_name': channel_names[channel_id], 'next_check_time': next_check_time})

    if number_of_new_videos == 0:
        print('No new videos from ' + channel_status_name)
    elif number_of_new_videos == 1:
        print('1 new video from ' + channel_status_name)
    else:
        print(str(number_of_new_videos) + ' new videos from ' + channel_status_name)


def check_all_channels():
    with open_database() as connection:
        with connection as cursor:
            channel_id_name_list = cursor.execute('''SELECT yt_channel_id, channel_name
                                                     FROM subscribed_channels
                                                     WHERE muted != 1''').fetchall()
    channel_names.update(channel_id_name_list)
    check_channels_if_necessary([item[0] for item in channel_id_name_list])


def check_tags(tags):
    channel_id_name_list = []
    with open_database() as connection:
        with connection as cursor:
            for tag in tags:
                channel_id_name_list += _channels_with_tag(cursor, tag, exclude_muted=True)
    channel_names.update(channel_id_name_list)
    check_channels_if_necessary([item[0] for item in channel_id_name_list])


def check_specific_channels(channel_ids):
    with open_database() as connection:
        with connection as cursor:
            channel_id_name_list = []
            for channel_id in channel_ids:
                channel_id_name_list += cursor.execute('''SELECT yt_channel_id, channel_name
                                                          FROM subscribed_channels
                                                          WHERE yt_channel_id=?''', [channel_id]).fetchall()
    channel_names.update(channel_id_name_list)
    check_channels_if_necessary(channel_ids)


CHANNEL_ID_RE = re.compile(r'UC[-_\w]{22}')
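# Matches canonical 24-character channel ids: 'UC' followed by 22 characters
# from the id alphabet (letters, digits, '-', '_'),
# e.g. 'UCuAXFkgsw1L7xaCfnd5JJOw'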


@yt_app.route('/import_subscriptions', methods=['POST'])
def import_subscriptions():
    # check if the post request has the file part
    if 'subscriptions_file' not in request.files:
        # flash('No file part')
        return flask.redirect(util.URL_ORIGIN + request.full_path)
    file = request.files['subscriptions_file']
    # if the user does not select a file, the browser also
    # submits an empty part without a filename
    if file.filename == '':
        # flash('No selected file')
        return flask.redirect(util.URL_ORIGIN + request.full_path)

    mime_type = file.mimetype

    if mime_type == 'application/json':
        info = file.read().decode('utf-8')
        if info == '':
            return '400 Bad Request: File is empty', 400
        try:
            info = json.loads(info)
        except json.decoder.JSONDecodeError:
            traceback.print_exc()
            return '400 Bad Request: Invalid json file', 400

        channels = []
        try:
            if 'app_version_int' in info:  # NewPipe format
                for item in info['subscriptions']:
                    # Other service, such as SoundCloud
                    if item.get('service_id', 0) != 0:
                        continue
                    channel_url = item['url']
                    channel_id_match = CHANNEL_ID_RE.search(channel_url)
                    if channel_id_match:
                        channel_id = channel_id_match.group(0)
                    else:
                        print('WARNING: Could not find channel id in url',
                              channel_url)
                        continue
                    channels.append((channel_id, item['name']))
            else:  # Old Google Takeout format
                for item in info:
                    snippet = item['snippet']
                    channel_id = snippet['resourceId']['channelId']
                    channels.append((channel_id, snippet['title']))
        except (KeyError, IndexError):
            traceback.print_exc()
            return '400 Bad Request: Unknown json structure', 400
    elif mime_type in ('application/xml', 'text/xml', 'text/x-opml'):
        file = file.read().decode('utf-8')
        try:
            root = defusedxml.ElementTree.fromstring(file)
            assert root.tag == 'opml'
            channels = []
            for outline_element in root[0][0]:
                if (outline_element.tag != 'outline') or ('xmlUrl' not in outline_element.attrib):
                    continue

                channel_name = outline_element.attrib['text']
                channel_rss_url = outline_element.attrib['xmlUrl']
                channel_id = channel_rss_url[channel_rss_url.find('channel_id=')+11:].strip()
                channels.append((channel_id, channel_name))
        except (AssertionError, IndexError, defusedxml.ElementTree.ParseError):
            return '400 Bad Request: Unable to read opml xml file, or the file is not the expected format', 400
    elif mime_type in ('text/csv', 'application/vnd.ms-excel'):
        content = file.read().decode('utf-8')
        reader = csv.reader(content.splitlines())
        channels = []
        for row in reader:
            if not row or row[0].lower().strip() == 'channel id':
                continue
            elif len(row) > 1 and CHANNEL_ID_RE.fullmatch(row[0].strip()):
                channels.append((row[0], row[-1]))
            else:
                print('WARNING: Unknown row format:', row)
    else:
        error = 'Unsupported file format: ' + mime_type
        error += ('. Only subscription.json, subscriptions.csv files'
                  ' (from Google Takeout)'
                  ' and XML OPML files exported from YouTube\'s'
                  ' subscription manager page are supported')
        return (flask.render_template('error.html', error_message=error),
                400)

    _subscribe(channels)

    return flask.redirect(util.URL_ORIGIN + '/subscription_manager', 303)


@yt_app.route('/export_subscriptions', methods=['POST'])
def export_subscriptions():
    include_muted = request.values.get('include_muted') == 'on'
    with open_database() as connection:
        with connection as cursor:
            sub_list = []
            for channel_name, channel_id, muted in (
                    _get_subscribed_channels(cursor)):
                if muted and not include_muted:
                    continue
                if request.values['export_format'] == 'json_google_takeout':
                    sub_list.append({
                        'kind': 'youtube#subscription',
                        'snippet': {
                            'muted': bool(muted),
                            'resourceId': {
                                'channelId': channel_id,
                                'kind': 'youtube#channel',
                            },
                            'tags': _get_tags(cursor, channel_id),
                            'title': channel_name,
                        },
                    })
                elif request.values['export_format'] == 'json_newpipe':
                    sub_list.append({
                        'service_id': 0,
                        'url': 'https://www.youtube.com/channel/' + channel_id,
                        'name': channel_name,
                    })
                elif request.values['export_format'] == 'opml':
                    sub_list.append({
                        'channel_name': channel_name,
                        'channel_id': channel_id,
                    })
    date_time = time.strftime('%Y%m%d%H%M', time.localtime())
    if request.values['export_format'] == 'json_google_takeout':
        r = flask.Response(json.dumps(sub_list), mimetype='text/json')
        cd = 'attachment; filename="subscriptions_%s.json"' % date_time
        r.headers['Content-Disposition'] = cd
        return r
    elif request.values['export_format'] == 'json_newpipe':
        r = flask.Response(json.dumps({
            'app_version': '0.21.9',
            'app_version_int': 975,
            'subscriptions': sub_list,
        }), mimetype='text/json')
        file_name = 'newpipe_subscriptions_%s_youtube-local.json' % date_time
        cd = 'attachment; filename="%s"' % file_name
        r.headers['Content-Disposition'] = cd
        return r
    elif request.values['export_format'] == 'opml':
        r = flask.Response(
            flask.render_template('subscriptions.xml', sub_list=sub_list),
            mimetype='text/xml')
        cd = 'attachment; filename="subscriptions_%s.xml"' % date_time
        r.headers['Content-Disposition'] = cd
        return r
    else:
        return '400 Bad Request', 400


@yt_app.route('/subscription_manager', methods=['GET'])
def get_subscription_manager_page():
    group_by_tags = request.args.get('group_by_tags', '0') == '1'
    with open_database() as connection:
        with connection as cursor:
            if group_by_tags:
                tag_groups = []

                for tag in _get_all_tags(cursor):
                    sub_list = []
                    for channel_id, channel_name, muted in _channels_with_tag(cursor, tag, order=True, include_muted_status=True):
                        sub_list.append({
                            'channel_url': util.URL_ORIGIN + '/channel/' + channel_id,
                            'channel_name': channel_name,
                            'channel_id': channel_id,
                            'muted': muted,
                            'tags': [t for t in _get_tags(cursor, channel_id) if t != tag],
                        })
                    tag_groups.append((tag, sub_list))

                # Channels with no tags
                channel_list = cursor.execute('''SELECT yt_channel_id, channel_name, muted
                                                 FROM subscribed_channels
                                                 WHERE id NOT IN (
                                                     SELECT sql_channel_id FROM tag_associations
                                                 )
                                                 ORDER BY channel_name COLLATE NOCASE''').fetchall()
                if channel_list:
                    sub_list = []
                    for channel_id, channel_name, muted in channel_list:
                        sub_list.append({
                            'channel_url': util.URL_ORIGIN + '/channel/' + channel_id,
                            'channel_name': channel_name,
                            'channel_id': channel_id,
                            'muted': muted,
                            'tags': [],
                        })
                    tag_groups.append(('No tags', sub_list))
            else:
                sub_list = []
                for channel_name, channel_id, muted in _get_subscribed_channels(cursor):
                    sub_list.append({
                        'channel_url': util.URL_ORIGIN + '/channel/' + channel_id,
                        'channel_name': channel_name,
                        'channel_id': channel_id,
                        'muted': muted,
                        'tags': _get_tags(cursor, channel_id),
                    })

    if group_by_tags:
        return flask.render_template(
            'subscription_manager.html',
            group_by_tags=True,
            tag_groups=tag_groups,
        )
    else:
        return flask.render_template(
            'subscription_manager.html',
            group_by_tags=False,
            sub_list=sub_list,
        )


def list_from_comma_separated_tags(string):
    return [tag.strip() for tag in string.split(',') if tag.strip()]
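
# Illustrative example:
# list_from_comma_separated_tags(' news, tech ,,music ') == ['news', 'tech', 'music']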


@yt_app.route('/subscription_manager', methods=['POST'])
def post_subscription_manager_page():
    action = request.values['action']
    with open_database() as connection:
        with connection as cursor:
            if action == 'add_tags':
                _add_tags(cursor, request.values.getlist('channel_ids'),
                          [tag.lower() for tag in list_from_comma_separated_tags(request.values['tags'])])
            elif action == 'remove_tags':
                _remove_tags(cursor, request.values.getlist('channel_ids'),
                             [tag.lower() for tag in list_from_comma_separated_tags(request.values['tags'])])
            elif action == 'unsubscribe':
                _unsubscribe(cursor, request.values.getlist('channel_ids'))
            elif action == 'unsubscribe_verify':
                unsubscribe_list = _get_channel_names(cursor, request.values.getlist('channel_ids'))
                return flask.render_template('unsubscribe_verify.html', unsubscribe_list=unsubscribe_list)
            elif action == 'mute':
                cursor.executemany('''UPDATE subscribed_channels
                                      SET muted = 1
                                      WHERE yt_channel_id = ?''', [(ci,) for ci in request.values.getlist('channel_ids')])
            elif action == 'unmute':
                cursor.executemany('''UPDATE subscribed_channels
                                      SET muted = 0
                                      WHERE yt_channel_id = ?''', [(ci,) for ci in request.values.getlist('channel_ids')])
            else:
                flask.abort(400)

    return flask.redirect(util.URL_ORIGIN + request.full_path, 303)


@yt_app.route('/subscriptions', methods=['GET'])
@yt_app.route('/feed/subscriptions', methods=['GET'])
def get_subscriptions_page():
    page = int(request.args.get('page', 1))
    with open_database() as connection:
        with connection as cursor:
            tag = request.args.get('tag', None)
            videos, number_of_videos_in_db = _get_videos(cursor, 60, (page - 1)*60, tag)
            for video in videos:
                video['thumbnail'] = util.URL_ORIGIN + '/data/subscription_thumbnails/' + video['id'] + '.jpg'
                video['type'] = 'video'
                video['item_size'] = 'small'
                util.add_extra_html_info(video)

            tags = _get_all_tags(cursor)

            subscription_list = []
            for channel_name, channel_id, muted in _get_subscribed_channels(cursor):
                subscription_list.append({
                    'channel_url': util.URL_ORIGIN + '/channel/' + channel_id,
                    'channel_name': channel_name,
                    'channel_id': channel_id,
                    'muted': muted,
                })

    return flask.render_template(
        'subscriptions.html',
        header_playlist_names=local_playlist.get_playlist_names(),
        videos=videos,
        num_pages=math.ceil(number_of_videos_in_db/60),
        parameters_dictionary=request.args,
        tags=tags,
        current_tag=tag,
        subscription_list=subscription_list,
    )


@yt_app.route('/subscriptions', methods=['POST'])
@yt_app.route('/feed/subscriptions', methods=['POST'])
def post_subscriptions_page():
    action = request.values['action']
    if action == 'subscribe':
        if len(request.values.getlist('channel_id')) != len(request.values.getlist('channel_name')):
            return '400 Bad Request, length of channel_id != length of channel_name', 400
        _subscribe(zip(request.values.getlist('channel_id'), request.values.getlist('channel_name')))
    elif action == 'unsubscribe':
        with_open_db(_unsubscribe, request.values.getlist('channel_id'))
    elif action == 'refresh':
        type = request.values['type']
        if type == 'all':
            check_all_channels()
        elif type == 'tag':
            check_tags(request.values.getlist('tag_name'))
        elif type == 'channel':
            check_specific_channels(request.values.getlist('channel_id'))
        else:
            flask.abort(400)
    else:
        flask.abort(400)

    return '', 204


@yt_app.route('/data/subscription_thumbnails/<thumbnail>')
def serve_subscription_thumbnail(thumbnail):
    '''Serves thumbnail from disk if it's been saved already. If not, downloads the thumbnail, saves to disk, and serves it.'''
    assert thumbnail[-4:] == '.jpg'
    video_id = thumbnail[0:-4]
    thumbnail_path = os.path.join(thumbnails_directory, thumbnail)

    if video_id in existing_thumbnails:
        try:
            f = open(thumbnail_path, 'rb')
        except FileNotFoundError:
            existing_thumbnails.remove(video_id)
        else:
            image = f.read()
            f.close()
            return flask.Response(image, mimetype='image/jpeg')

    url = f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"
    try:
        image = util.fetch_url(url, report_text="Saved thumbnail: " + video_id)
    except urllib.error.HTTPError as e:
        print("Failed to download thumbnail for " + video_id + ": " + str(e))
        flask.abort(e.code)
    try:
        f = open(thumbnail_path, 'wb')
    except FileNotFoundError:
        os.makedirs(thumbnails_directory, exist_ok=True)
        f = open(thumbnail_path, 'wb')
    f.write(image)
    f.close()
    existing_thumbnails.add(video_id)

    return flask.Response(image, mimetype='image/jpeg')