rssVideos: Now thread-safe (kinda)

Geoffrey Frogeye 2022-03-23 18:54:05 +01:00
parent 2e759f9fc6
commit d88520552b
Signed by: geoffrey
GPG key ID: C72403E7F82E6AD8


@@ -9,6 +9,7 @@ with the unread items (non-video links are ignored).
 """
 
 import datetime
+import filelock
 import functools
 import logging
 import os
@@ -26,8 +27,6 @@ import yt_dlp
 log = logging.getLogger(__name__)
 
-# TODO Lockfile, or a way to parallel watch and download
-
 
 def configure_logging(args: configargparse.Namespace) -> None:
     # Configure logging
@@ -107,17 +106,33 @@ def format_duration(duration: int) -> str:
 class RVElement:
     parent: "RVDatabase"
     item: dict
-    downloaded_filepath: typing.Optional[str]
+
+    RERESEARCH_AFTER = datetime.timedelta(hours=1)
 
     def __init__(self, parent: "RVDatabase", item: dict) -> None:
         self.parent = parent
         self.item = item
-        self.downloaded_filepath = None
 
     @property
     def id(self) -> str:
         return self.item["id"]
 
+    @property
+    def sid(self) -> str:
+        return self.id.split("/")[-1]
+
+    def metafile(self, extension: str) -> str:
+        return os.path.join(self.parent.METADATA_FOLDER, f"{self.sid}.{extension}")
+
+    def metafile_read(self, extension: str) -> typing.Any:
+        return self.parent.metafile_read(f"{self.sid}.{extension}")
+
+    def metafile_write(self, extension: str, data: typing.Any) -> None:
+        return self.parent.metafile_write(f"{self.sid}.{extension}", data)
+
+    def save(self) -> None:
+        self.metafile_write("item", self.item)
+
     @property
     def title(self) -> str:
         return self.item["title"]
@@ -136,14 +151,8 @@ class RVElement:
     @property
     def is_researched(self) -> bool:
-        return "ytdl_infos" in self.__dict__
-
-    def salvage_cache(self, cache: "RVElement") -> None:
-        if cache.is_researched:
-            self.__dict__["ytdl_infos"] = cache.__dict__["ytdl_infos"]
-            log.debug(f"From cache: {self}")
-        if cache.downloaded_filepath:
-            self.downloaded_filepath = cache.downloaded_filepath
+        metafile = self.metafile("ytdl")
+        return os.path.isfile(metafile)
 
     def __str__(self) -> str:
         str = f"{self.date.strftime('%y-%m-%d %H:%M')} ("
@@ -169,6 +178,14 @@ class RVElement:
 
     @functools.cached_property
     def ytdl_infos(self) -> typing.Optional[dict]:
+        try:
+            return self.metafile_read("ytdl")
+        except (FileNotFoundError, TypeError, AttributeError, EOFError):
+            infos = self._ytdl_infos()
+            self.metafile_write("ytdl", infos)
+            return infos
+
+    def _ytdl_infos(self) -> typing.Optional[dict]:
         log.info(f"Researching: {self}")
         try:
             infos = self.parent.ytdl_dry.extract_info(self.link, download=False)
@@ -180,9 +197,6 @@ class RVElement:
             infos = None
         if infos:
             infos = self.parent.ytdl_dry.sanitize_info(infos)
-        # Save database once it's been computed
-        self.__dict__["ytdl_infos"] = infos
-        self.parent.save()
         return infos
 
     @property
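The hunk above layers two caches: functools.cached_property memoizes ytdl_infos within the process, while the "ytdl" metafile memoizes it across runs. A minimal standalone sketch of the same read-through idea (the file name, JSON format, and stand-in computation are illustrative, not the script's actual code):

import functools
import json


class Research:
    path = "infos.json"  # hypothetical on-disk cache location

    @functools.cached_property
    def infos(self) -> dict:
        try:
            with open(self.path) as f:  # disk hit: reuse an earlier run's result
                return json.load(f)
        except FileNotFoundError:
            infos = {"duration": 42}  # stand-in for the expensive research call
            with open(self.path, "w") as f:
                json.dump(infos, f)
            return infos

First access in a process either reads the file or computes and writes it; later accesses in the same process never touch the disk.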
@@ -196,6 +210,18 @@ class RVElement:
         # Duration might be missing in playlists and stuff
         return self.ytdl_infos is not None and "duration" in self.ytdl_infos
 
+    @functools.cached_property
+    def downloaded_filepath(self) -> typing.Optional[str]:
+        try:
+            return self.metafile_read("path")
+        except FileNotFoundError:
+            return None
+
+    @property
+    def was_downloaded(self) -> bool:
+        metafile = self.metafile("path")
+        return os.path.exists(metafile)
+
     @property
     def filepath(self) -> str:
         assert self.is_video
@@ -204,37 +230,36 @@ class RVElement:
         return self.parent.ytdl_dry.prepare_filename(self.ytdl_infos)
 
     @property
-    def filename(self) -> str:
+    def basename(self) -> str:
         assert self.is_video
         return os.path.splitext(self.filepath)[0]
 
+    def expire_info(self) -> None:
+        metafile = self.metafile("ytdl")
+        if os.path.isfile(metafile):
+            stat = os.stat(metafile)
+            mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
+            diff = datetime.datetime.now() - mtime
+            if diff > self.RERESEARCH_AFTER:
+                os.unlink(metafile)
+                del self.ytdl_infos
+
     def download(self) -> None:
         assert self.is_video
+        if self.downloaded:
+            return
+        self.expire_info()
         log.info(f"Downloading: {self}")
-        if self.parent.args.research:
-            del self.ytdl_infos
-        if not self.parent.args.dryrun:
-            with yt_dlp.YoutubeDL(self.parent.ytdl_opts) as ydl:
-                ydl.add_post_processor(SaveInfoPP(self))
-                ydl.process_ie_result(self.ytdl_infos, download=True)
-        self.parent.save()
+        lockfile = self.metafile("lock")
+        with filelock.FileLock(lockfile):
+            if not self.parent.args.dryrun:
+                with yt_dlp.YoutubeDL(self.parent.ytdl_opts) as ydl:
+                    ydl.add_post_processor(SaveInfoPP(self))
+                    ydl.process_ie_result(self.ytdl_infos, download=True)
 
     def update_post_download(self, info: dict) -> None:
         self.downloaded_filepath = self.parent.ytdl_dry.prepare_filename(info)
-
-    @property
-    def was_downloaded(self) -> bool:
-        return self.downloaded_filepath is not None
-
-    def preload(self) -> None:
-        assert self.is_video
-        if self.downloaded:
-            log.debug(f"Currently downloaded: {self}")
-            return
-        if self.was_downloaded:
-            log.debug(f"Downloaded previously: {self}")
-            return
-        self.download()
+        self.metafile_write("path", self.downloaded_filepath)
 
     @property
     def watched(self) -> bool:
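The filelock.FileLock block above appears to be the heart of the commit title: concurrent invocations that try to download the same video now serialize on a per-video lock file. A minimal sketch of the idiom, assuming the third-party filelock package (download_once, fetch, and their arguments are illustrative, not part of the script):

import os

import filelock


def fetch(target: str) -> None:  # hypothetical stand-in for the yt-dlp call
    with open(target, "w") as f:
        f.write("video data")


def download_once(lockfile: str, target: str) -> None:
    with filelock.FileLock(lockfile):  # blocks until no other process holds the lock
        if os.path.exists(target):  # another worker finished while we waited
            return
        fetch(target)

Whichever process acquires the lock first does the work; the others block, then see the finished file and return.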
@@ -270,8 +295,7 @@ class RVElement:
         return True
 
     def watch(self) -> None:
-        if not self.downloaded:
-            self.download()
+        self.download()
 
         cmd = ["mpv", self.filepath]
         log.debug(f"Running {cmd}")
@@ -279,17 +303,27 @@ class RVElement:
         proc = subprocess.run(cmd)
         proc.check_returncode()
 
-        self.clean()
+        self.undownload()
         self.try_mark_read()
 
-    def clean(self) -> None:
-        assert self.is_video
-        log.info(f"Removing gone video: {self.filename}*")
-        for file in os.listdir():
-            if file.startswith(self.filename):
-                log.debug(f"Removing file: {file}")
-                if not self.parent.args.dryrun:
-                    os.unlink(file)
+    def clean_file(self, folder: str, basename: str) -> None:
+        for file in os.listdir(folder):
+            if file.startswith(basename):
+                path = os.path.join(folder, file)
+                log.debug(f"Removing file: {path}")
+                if not self.parent.args.dryrun:
+                    os.unlink(path)
+
+    def undownload(self) -> None:
+        assert self.is_video
+        log.info(f"Removing gone video: {self.basename}*")
+        self.clean_file(".", self.basename)
+
+    def clean(self) -> None:
+        if self.is_video:
+            self.undownload()
+        log.info(f"Removing gone metadata: {self.sid}*")
+        self.clean_file(self.parent.METADATA_FOLDER, self.sid)
 
     def mark_read(self) -> None:
         log.debug(f"Marking {self} read")
@@ -309,7 +343,7 @@ class RVElement:
         if r.text.strip() != "OK":
             raise RuntimeError(f"Couldn't mark {self} as read: {r.text}")
         log.info(f"Marked {self} as read")
-        self.parent.elements.remove(self)
+        self.clean()
 
     def try_mark_read(self) -> None:
         try:
@@ -319,7 +353,7 @@ class RVElement:
 
 class RVDatabase:
-    SAVE_FILE = ".cache.p"
+    METADATA_FOLDER = ".metadata"
 
     args: configargparse.Namespace
     elements: list[RVElement]
@@ -327,53 +361,27 @@ class RVDatabase:
     def __init__(self, args: configargparse.Namespace) -> None:
         self.args = args
 
-    def save(self) -> None:
-        log.debug("Saving cache")
-        if self.args.dryrun:
-            return
-        with open(self.SAVE_FILE, "wb") as save_file:
-            pickle.dump(self, save_file)
-
-    @classmethod
-    def load(cls) -> typing.Optional["RVDatabase"]:
-        try:
-            with open(cls.SAVE_FILE, "rb") as save_file:
-                return pickle.load(save_file)
-        except (TypeError, AttributeError, EOFError):
-            log.warning("Corrupt / outdated cache, it will be rebuilt.")
-        except FileNotFoundError:
-            pass
-        return None
-
-    def salvage_cache_pre(self, cache: "RVDatabase") -> None:
-        if "auth_headers" in cache.__dict__:
-            self.auth_headers = cache.auth_headers
-
-    def salvage_cache(self, cache: "RVDatabase") -> None:
-        log.debug("Salvaging cache")
-        cache_els = dict()
-        for cache_el in cache.elements:
-            cache_els[cache_el.id] = cache_el
-        for el in self.elements:
-            if el.id in cache_els:
-                el.salvage_cache(cache_els[el.id])
+    def metafile_read(self, name: str) -> typing.Any:
+        path = os.path.join(self.METADATA_FOLDER, name)
+        log.debug(f"Reading {path}")
+        with open(path, "rb") as mf:
+            return pickle.load(mf)
+
+    def metafile_write(self, name: str, data: typing.Any) -> None:
+        path = os.path.join(self.METADATA_FOLDER, name)
+        log.debug(f"Writing {path}")
+        if not self.args.dryrun:
+            with open(path, "wb") as mf:
+                pickle.dump(data, mf)
 
     def clean_cache(self, cache: "RVDatabase") -> None:
         log.debug("Cleaning cache")
-        self_els = dict()
-        for self_el in self.elements:
-            self_els[self_el.id] = self_el
+        fresh_ids = set(el.id for el in self.elements)
         for el in cache.elements:
-            if el.id not in self_els:
-                if el.is_researched and el.is_video:
-                    el.clean()
-
-    def import_cache(self, cache: "RVDatabase") -> None:
-        log.debug("Importing cache")
-        self.build_list([element.item for element in cache.elements])
+            if el.id not in fresh_ids:
+                el.clean()
 
-    @functools.cached_property
-    def auth_headers(self) -> dict[str, str]:
+    def _auth_headers(self) -> dict[str, str]:
         r = requests.get(
             f"{self.args.url}/accounts/ClientLogin",
             params={"Email": self.args.email, "Passwd": self.args.passwd},
@@ -385,6 +393,15 @@ class RVDatabase:
                 return {"Authorization": f"GoogleLogin auth={val}"}
         raise RuntimeError("Couldn't find auth= key")
 
+    @functools.cached_property
+    def auth_headers(self) -> dict[str, str]:
+        try:
+            return self.metafile_read(".auth_headers")
+        except FileNotFoundError:
+            headers = self._auth_headers()
+            self.metafile_write(".auth_headers", headers)
+            return headers
+
     def fetch_feed_elements(self) -> typing.Generator[dict, None, None]:
         log.info("Fetching RSS feed")
         continuation: typing.Optional[str] = None
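With the two hunks above, all persistent state goes through a pair of helpers: one pickle file per datum under .metadata, replacing the single pickled RVDatabase that .cache.p used to be. A condensed sketch of that storage layer (folder name mirrors the constant above; error handling and the dry-run guard omitted):

import os
import pickle
import typing

METADATA_FOLDER = ".metadata"  # assumed to exist, as main() ensures in the script


def metafile_read(name: str) -> typing.Any:
    with open(os.path.join(METADATA_FOLDER, name), "rb") as mf:
        return pickle.load(mf)


def metafile_write(name: str, data: typing.Any) -> None:
    with open(os.path.join(METADATA_FOLDER, name), "wb") as mf:
        pickle.dump(data, mf)

Small independent files mean two processes rarely touch the same file at once, which is what makes the per-item design friendlier to parallel runs than one monolithic pickle.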
@@ -409,45 +426,47 @@ class RVDatabase:
         while continuation:
             yield from next_page()
 
-    def build_list(self, items: typing.Iterable[dict]) -> None:
+    def fetch_cache_elements(self) -> typing.Generator[dict, None, None]:
+        log.info("Fetching from cache")
+        for file in os.listdir(self.METADATA_FOLDER):
+            if not file.endswith(".item"):
+                continue
+            yield self.metafile_read(file)
+
+    def build_list(self, items: typing.Iterable[dict], save: bool = False) -> None:
         self.elements = []
         for item in items:
             element = RVElement(self, item)
             self.elements.insert(0, element)
             log.debug(f"Known: {element}")
+            if save:
+                element.save()
 
     def read_feed(self) -> None:
-        self.build_list(self.fetch_feed_elements())
+        self.build_list(self.fetch_feed_elements(), save=True)
+
+    def read_cache(self) -> None:
+        self.build_list(self.fetch_cache_elements())
+
+    def clean_folder(self, folder: str, basenames: set[str]) -> None:
+        for file in os.listdir(folder):
+            path = os.path.join(folder, file)
+            if not os.path.isfile(path) or file[0] == ".":
+                continue
+            for basename in basenames:
+                if file.startswith(basename):
+                    break
+            else:
+                log.info(f"Removing unknown file: {path}")
+                if not self.args.dryrun:
+                    os.unlink(path)
 
     def clean(self) -> None:
         log.debug("Cleaning")
-        filenames = set()
-        for element in self.elements:
-            if element.is_video:
-                filenames.add(element.filename)
-        for file in os.listdir():
-            if file == RVDatabase.SAVE_FILE:
-                continue
-            if not os.path.isfile(file):
-                continue
-            for filename in filenames:
-                if file.startswith(filename):
-                    break
-            else:
-                log.info(f"Removing unknown file: {file}")
-                if not self.args.dryrun:
-                    os.unlink(file)
-
-    @property
-    def all_researched(self) -> bool:
-        for element in self.elements:
-            if not element.is_researched:
-                return False
-        return True
-
-    def attempt_clean(self) -> None:
-        if self.all_researched:
-            self.clean()
+        filenames = set(el.basename for el in self.elements if el.is_video)
+        self.clean_folder(".", filenames)
+        ids = set(el.sid for el in self.elements)
+        self.clean_folder(self.METADATA_FOLDER, ids)
 
     @property
     def ytdl_opts(self) -> dict:
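clean_folder above leans on Python's for/else: the else branch runs only when the loop finished without hitting break, i.e. when no known basename claimed the file. A standalone sketch of the idiom (keep_prefixes is an illustrative name, not the script's):

import os


def clean_folder(folder: str, keep_prefixes: set[str]) -> None:
    for file in os.listdir(folder):
        path = os.path.join(folder, file)
        if not os.path.isfile(path) or file.startswith("."):
            continue  # skip directories and dotfiles such as .metadata
        for prefix in keep_prefixes:
            if file.startswith(prefix):
                break  # a known element claims this file; keep it
        else:  # no break: nothing claims the file
            os.unlink(path)  # unknown file: remove it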
@@ -468,7 +487,9 @@ class RVDatabase:
         elements: typing.Iterable[RVElement]
         # Inexpensive sort
         if args.order == "new":
-            elements = reversed(elements_src)
+            elements = sorted(elements_src, key=lambda el: el.date, reverse=True)
+        elif args.order == "old":
+            elements = sorted(elements_src, key=lambda el: el.date)
         elif args.order == "title":
             elements = sorted(elements_src, key=lambda el: el.title)
         elif args.order == "creator":
@@ -478,8 +499,6 @@ class RVDatabase:
         elif args.order == "random":
             elements = elements_src
             random.shuffle(elements)
-        else:
-            elements = elements_src
 
         # Possibly expensive filtering
         elements = filter(lambda el: el.matches_filter(args), elements)
@@ -575,11 +594,6 @@ def get_args() -> configargparse.Namespace:
         env_var="RSS_VIDEOS_PASSWD",
         required=True,
     )
-    parser.add(
-        "--research",
-        help="Fetch video info again",
-        action="store_true",
-    )
     parser.add(
         "--no-refresh",
         dest="refresh",
@@ -641,79 +655,56 @@ def get_args() -> configargparse.Namespace:
             "list",
             "watch",
             "binge",
-            "clean",
         ),
         default="download",
     )
 
     args = parser.parse_args()
     args.videos = os.path.realpath(os.path.expanduser(args.videos))
-    if not args.duration and args.max_duration:
-        args.duration = str(args.max_duration)
     return args
 
 
 def get_database(args: configargparse.Namespace) -> RVDatabase:
-    database = RVDatabase(args)
-    cache = RVDatabase.load()
-    feed_fetched = False
-    if cache:
-        database.salvage_cache_pre(cache)
-    if args.refresh:
-        try:
-            database.read_feed()
-            feed_fetched = True
-        except requests.ConnectionError as err:
-            if args.action == "download":
-                raise RuntimeError("Couldn't fetch feed, refusing to download")
-            # This is a quirky failsafe in case of no internet connection,
-            # so the script doesn't go noting that no element is a video.
-            log.warning(f"Couldn't fetch feed: {err}")
-    if not feed_fetched:
-        if cache:
-            log.warning("Using cached feed.")
-            database.import_cache(cache)
-        else:
-            raise FileNotFoundError("Feed not fetched and no cached feed.")
-    if cache:
-        database.salvage_cache(cache)
-        database.clean_cache(cache)
-    database.save()
-
-    return database
+    cache = RVDatabase(args)
+    cache.read_cache()
+    if not args.refresh:
+        return cache
+
+    fresh = RVDatabase(args)
+    fresh.read_feed()
+    fresh.clean_cache(cache)
+    return fresh
 
 
 def main() -> None:
     args = get_args()
     configure_logging(args)
 
-    os.makedirs(args.videos, exist_ok=True)
+    metadata_dir = os.path.join(args.videos, RVDatabase.METADATA_FOLDER)
+    for dir in (args.videos, metadata_dir):
+        os.makedirs(dir, exist_ok=True)
     os.chdir(args.videos)
 
     database = get_database(args)
+    database.clean()
 
     log.debug("Running action")
-    if args.action == "clean":
-        database.clean()
-    else:
-        duration = 0
-        for element in database.filter(args):
-            duration += element.duration if element.is_video else 0
-            if args.action == "download":
-                element.preload()
-            elif args.action == "list":
-                print(element)
-            elif args.action in ("watch", "binge"):
-                element.watch()
-                if args.action == "watch":
-                    break
-            else:
-                raise NotImplementedError(f"Unimplemented action: {args.action}")
-        log.info(f"Total duration: {format_duration(duration)}")
+    duration = 0
+    for element in database.filter(args):
+        duration += element.duration if element.is_video else 0
+        if args.action == "download":
+            element.download()
+        elif args.action == "list":
+            print(element)
+        elif args.action in ("watch", "binge"):
+            element.watch()
+            if args.action == "watch":
+                break
+        else:
+            raise NotImplementedError(f"Unimplemented action: {args.action}")
+    log.info(f"Total duration: {format_duration(duration)}")
 
-    database.attempt_clean()
     database.try_mark_watched_read()
-    database.save()
 
 
 if __name__ == "__main__":
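The "(kinda)" in the title seems fair: downloads serialize on lock files, but the pickle metafiles themselves are still read and written without locking. The re-research expiry added in expire_info is a plain mtime comparison; a standalone sketch of that check (is_stale is an illustrative name, and the path is assumed to exist):

import datetime
import os

RERESEARCH_AFTER = datetime.timedelta(hours=1)


def is_stale(path: str) -> bool:
    # os.stat raises FileNotFoundError for a missing file; callers guard for that
    mtime = datetime.datetime.fromtimestamp(os.stat(path).st_mtime)
    return datetime.datetime.now() - mtime > RERESEARCH_AFTER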