rssVideos: Now thread-safe (kinda)

Geoffrey Frogeye 2022-03-23 18:54:05 +01:00
parent 2e759f9fc6
commit d88520552b
Signed by: geoffrey
GPG key ID: C72403E7F82E6AD8


@@ -9,6 +9,7 @@ with the unread items (non-video links are ignored).
 """

 import datetime
+import filelock
 import functools
 import logging
 import os
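The one new dependency is filelock, which the reworked download path uses to serialize competing downloads. A minimal sketch of the behaviour relied on below (the lock path here is a made-up example):

import filelock

lock = filelock.FileLock("/tmp/example.lock")  # the lock file, not the data it protects

with lock:  # blocks until no other thread or process holds the lock
    pass  # critical section: at most one holder at a time

# acquire() also accepts a timeout when blocking forever is not acceptable:
try:
    with lock.acquire(timeout=5):
        pass
except filelock.Timeout:
    print("another holder is still active")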
@@ -26,8 +27,6 @@ import yt_dlp

 log = logging.getLogger(__name__)

-# TODO Lockfile, or a way to parallel watch and download
-

 def configure_logging(args: configargparse.Namespace) -> None:
     # Configure logging
@@ -107,17 +106,33 @@ def format_duration(duration: int) -> str:
 class RVElement:
     parent: "RVDatabase"
     item: dict
-    downloaded_filepath: typing.Optional[str]
+
+    RERESEARCH_AFTER = datetime.timedelta(hours=1)

     def __init__(self, parent: "RVDatabase", item: dict) -> None:
         self.parent = parent
         self.item = item
-        self.downloaded_filepath = None

     @property
     def id(self) -> str:
         return self.item["id"]

+    @property
+    def sid(self) -> str:
+        return self.id.split("/")[-1]
+
+    def metafile(self, extension: str) -> str:
+        return os.path.join(self.parent.METADATA_FOLDER, f"{self.sid}.{extension}")
+
+    def metafile_read(self, extension: str) -> typing.Any:
+        return self.parent.metafile_read(f"{self.sid}.{extension}")
+
+    def metafile_write(self, extension: str, data: typing.Any) -> None:
+        return self.parent.metafile_write(f"{self.sid}.{extension}", data)
+
+    def save(self) -> None:
+        self.metafile_write("item", self.item)
+
     @property
     def title(self) -> str:
         return self.item["title"]
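Every feed item now gets its state persisted under METADATA_FOLDER, keyed by sid (the last path segment of the Reader-style item id) plus one extension per kind of state. A sketch of the naming scheme; the item id is hypothetical:

import os

METADATA_FOLDER = ".metadata"

# Reader-style item id; only the last path segment is used as the key (sid).
item_id = "tag:google.com,2005:reader/item/00000000f0000000"  # hypothetical
sid = item_id.split("/")[-1]

def metafile(extension: str) -> str:
    return os.path.join(METADATA_FOLDER, f"{sid}.{extension}")

print(metafile("item"))  # .metadata/00000000f0000000.item -- raw feed entry
print(metafile("ytdl"))  # .metadata/00000000f0000000.ytdl -- research result
print(metafile("path"))  # .metadata/00000000f0000000.path -- downloaded file path
print(metafile("lock"))  # .metadata/00000000f0000000.lock -- per-item download lock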
@@ -136,14 +151,8 @@ class RVElement:
     @property
     def is_researched(self) -> bool:
-        return "ytdl_infos" in self.__dict__
-
-    def salvage_cache(self, cache: "RVElement") -> None:
-        if cache.is_researched:
-            self.__dict__["ytdl_infos"] = cache.__dict__["ytdl_infos"]
-            log.debug(f"From cache: {self}")
-        if cache.downloaded_filepath:
-            self.downloaded_filepath = cache.downloaded_filepath
+        metafile = self.metafile("ytdl")
+        return os.path.isfile(metafile)

     def __str__(self) -> str:
         str = f"{self.date.strftime('%y-%m-%d %H:%M')} ("
@@ -169,6 +178,14 @@ class RVElement:
     @functools.cached_property
     def ytdl_infos(self) -> typing.Optional[dict]:
+        try:
+            return self.metafile_read("ytdl")
+        except (FileNotFoundError, TypeError, AttributeError, EOFError):
+            infos = self._ytdl_infos()
+            self.metafile_write("ytdl", infos)
+            return infos
+
+    def _ytdl_infos(self) -> typing.Optional[dict]:
         log.info(f"Researching: {self}")
         try:
             infos = self.parent.ytdl_dry.extract_info(self.link, download=False)
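ytdl_infos is now a two-level, read-through cache: functools.cached_property memoizes in memory for the life of the process, the .ytdl metafile persists across runs, and the broad except also treats unreadable or outdated pickles (TypeError/AttributeError/EOFError) as misses. Together with expire_info() further down, which drops a .ytdl file older than RERESEARCH_AFTER, this replaces the deleted --research flag. A generic sketch of the pattern, with hypothetical names standing in for the metafile helpers:

import datetime
import functools
import os
import pickle
import typing

CACHE_PATH = "research.pickle"  # hypothetical stand-in for a .ytdl metafile
RERESEARCH_AFTER = datetime.timedelta(hours=1)

class Expensive:
    @functools.cached_property  # layer 1: memoized per process
    def value(self) -> typing.Any:
        try:
            with open(CACHE_PATH, "rb") as mf:  # layer 2: pickle on disk
                return pickle.load(mf)
        except (FileNotFoundError, EOFError):  # absent or half-written file
            value = self._compute()
            with open(CACHE_PATH, "wb") as mf:
                pickle.dump(value, mf)
            return value

    def _compute(self) -> typing.Any:
        return 42  # stands in for the slow extract_info() round trip

    def expire(self) -> None:
        # Drop both layers when the on-disk copy is older than the TTL.
        if os.path.isfile(CACHE_PATH):
            mtime = datetime.datetime.fromtimestamp(os.stat(CACHE_PATH).st_mtime)
            if datetime.datetime.now() - mtime > RERESEARCH_AFTER:
                os.unlink(CACHE_PATH)
                self.__dict__.pop("value", None)  # reset cached_property

The first access per process pays at most one disk read; later accesses hit the instance __dict__ directly.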
@@ -180,9 +197,6 @@
             infos = None
         if infos:
             infos = self.parent.ytdl_dry.sanitize_info(infos)
-        # Save database once it's been computed
-        self.__dict__["ytdl_infos"] = infos
-        self.parent.save()
         return infos

     @property
@@ -196,6 +210,18 @@ class RVElement:
         # Duration might be missing in playlists and stuff
         return self.ytdl_infos is not None and "duration" in self.ytdl_infos

+    @functools.cached_property
+    def downloaded_filepath(self) -> typing.Optional[str]:
+        try:
+            return self.metafile_read("path")
+        except FileNotFoundError:
+            return None
+
+    @property
+    def was_downloaded(self) -> bool:
+        metafile = self.metafile("path")
+        return os.path.exists(metafile)
+
     @property
     def filepath(self) -> str:
         assert self.is_video
@@ -204,37 +230,36 @@ class RVElement:
         return self.parent.ytdl_dry.prepare_filename(self.ytdl_infos)

     @property
-    def filename(self) -> str:
+    def basename(self) -> str:
         assert self.is_video
         return os.path.splitext(self.filepath)[0]

+    def expire_info(self) -> None:
+        metafile = self.metafile("ytdl")
+        if os.path.isfile(metafile):
+            stat = os.stat(metafile)
+            mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
+            diff = datetime.datetime.now() - mtime
+            if diff > self.RERESEARCH_AFTER:
+                os.unlink(metafile)
+                del self.ytdl_infos
+
     def download(self) -> None:
         assert self.is_video
         if self.downloaded:
             return
+        self.expire_info()
         log.info(f"Downloading: {self}")
-        if self.parent.args.research:
-            del self.ytdl_infos
-        if not self.parent.args.dryrun:
-            with yt_dlp.YoutubeDL(self.parent.ytdl_opts) as ydl:
-                ydl.add_post_processor(SaveInfoPP(self))
-                ydl.process_ie_result(self.ytdl_infos, download=True)
-            self.parent.save()
+        lockfile = self.metafile("lock")
+        with filelock.FileLock(lockfile):
+            if not self.parent.args.dryrun:
+                with yt_dlp.YoutubeDL(self.parent.ytdl_opts) as ydl:
+                    ydl.add_post_processor(SaveInfoPP(self))
+                    ydl.process_ie_result(self.ytdl_infos, download=True)

     def update_post_download(self, info: dict) -> None:
         self.downloaded_filepath = self.parent.ytdl_dry.prepare_filename(info)
-
-    @property
-    def was_downloaded(self) -> bool:
-        return self.downloaded_filepath is not None
-
-    def preload(self) -> None:
-        assert self.is_video
-        if self.downloaded:
-            log.debug(f"Currently downloaded: {self}")
-            return
-        if self.was_downloaded:
-            log.debug(f"Downloaded previously: {self}")
-            return
-        self.download()
+        self.metafile_write("path", self.downloaded_filepath)

     @property
     def watched(self) -> bool:
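This lock is the change the commit title refers to: the actual yt-dlp invocation is guarded by a per-item lock file, so concurrent runs serialize per video. The `if self.downloaded` check still happens before the lock is taken, which is presumably the "(kinda)": a second caller that passed the check will block, then invoke yt-dlp again and rely on it noticing the finished download. A sketch of the per-key serialization with threads (the worker and sid are hypothetical):

import filelock
import threading

def download(sid: str) -> None:
    lock = filelock.FileLock(f"{sid}.lock")  # one lock file per item
    with lock:
        # For a given sid, only one holder at a time gets here; a second
        # caller blocks on acquire and runs after the first is done.
        print(f"{threading.current_thread().name} holds {sid}")

threads = [
    threading.Thread(target=download, args=("00000000f0000000",))
    for _ in range(2)
]
for t in threads:
    t.start()
for t in threads:
    t.join()

Each call constructs its own FileLock, as download() above does.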
@@ -270,8 +295,7 @@ class RVElement:
         return True

     def watch(self) -> None:
-        if not self.downloaded:
-            self.download()
+        self.download()

         cmd = ["mpv", self.filepath]
         log.debug(f"Running {cmd}")
@@ -279,17 +303,27 @@ class RVElement:
         proc = subprocess.run(cmd)
         proc.check_returncode()

-        self.clean()
+        self.undownload()
         self.try_mark_read()

-    def clean(self) -> None:
-        assert self.is_video
-        log.info(f"Removing gone video: {self.filename}*")
-        for file in os.listdir():
-            if file.startswith(self.filename):
-                log.debug(f"Removing file: {file}")
+    def clean_file(self, folder: str, basename: str) -> None:
+        for file in os.listdir(folder):
+            if file.startswith(basename):
+                path = os.path.join(folder, file)
+                log.debug(f"Removing file: {path}")
                 if not self.parent.args.dryrun:
-                    os.unlink(file)
+                    os.unlink(path)
+
+    def undownload(self) -> None:
+        assert self.is_video
+        log.info(f"Removing gone video: {self.basename}*")
+        self.clean_file(".", self.basename)
+
+    def clean(self) -> None:
+        if self.is_video:
+            self.undownload()
+        log.info(f"Removing gone metadata: {self.sid}*")
+        self.clean_file(self.parent.METADATA_FOLDER, self.sid)

     def mark_read(self) -> None:
         log.debug(f"Marking {self} read")
@@ -309,7 +343,7 @@ class RVElement:
         if r.text.strip() != "OK":
             raise RuntimeError(f"Couldn't mark {self} as read: {r.text}")
         log.info(f"Marked {self} as read")
-        self.parent.elements.remove(self)
+        self.clean()

     def try_mark_read(self) -> None:
         try:
@@ -319,7 +353,7 @@

 class RVDatabase:
-    SAVE_FILE = ".cache.p"
+    METADATA_FOLDER = ".metadata"

     args: configargparse.Namespace
     elements: list[RVElement]
@@ -327,53 +361,27 @@
     def __init__(self, args: configargparse.Namespace) -> None:
         self.args = args

-    def save(self) -> None:
-        log.debug("Saving cache")
-        if self.args.dryrun:
-            return
-        with open(self.SAVE_FILE, "wb") as save_file:
-            pickle.dump(self, save_file)
+    def metafile_read(self, name: str) -> typing.Any:
+        path = os.path.join(self.METADATA_FOLDER, name)
+        log.debug(f"Reading {path}")
+        with open(path, "rb") as mf:
+            return pickle.load(mf)

-    @classmethod
-    def load(cls) -> typing.Optional["RVDatabase"]:
-        try:
-            with open(cls.SAVE_FILE, "rb") as save_file:
-                return pickle.load(save_file)
-        except (TypeError, AttributeError, EOFError):
-            log.warning("Corrupt / outdated cache, it will be rebuilt.")
-        except FileNotFoundError:
-            pass
-        return None
-
-    def salvage_cache_pre(self, cache: "RVDatabase") -> None:
-        if "auth_headers" in cache.__dict__:
-            self.auth_headers = cache.auth_headers
-
-    def salvage_cache(self, cache: "RVDatabase") -> None:
-        log.debug("Salvaging cache")
-        cache_els = dict()
-        for cache_el in cache.elements:
-            cache_els[cache_el.id] = cache_el
-        for el in self.elements:
-            if el.id in cache_els:
-                el.salvage_cache(cache_els[el.id])
+    def metafile_write(self, name: str, data: typing.Any) -> None:
+        path = os.path.join(self.METADATA_FOLDER, name)
+        log.debug(f"Writing {path}")
+        if not self.args.dryrun:
+            with open(path, "wb") as mf:
+                pickle.dump(data, mf)

     def clean_cache(self, cache: "RVDatabase") -> None:
         log.debug("Cleaning cache")
-        self_els = dict()
-        for self_el in self.elements:
-            self_els[self_el.id] = self_el
+        fresh_ids = set(el.id for el in self.elements)
         for el in cache.elements:
-            if el.id not in self_els:
-                if el.is_researched and el.is_video:
-                    el.clean()
+            if el.id not in fresh_ids:
+                el.clean()

-    def import_cache(self, cache: "RVDatabase") -> None:
-        log.debug("Importing cache")
-        self.build_list([element.item for element in cache.elements])
-
-    @functools.cached_property
-    def auth_headers(self) -> dict[str, str]:
+    def _auth_headers(self) -> dict[str, str]:
         r = requests.get(
             f"{self.args.url}/accounts/ClientLogin",
             params={"Email": self.args.email, "Passwd": self.args.passwd},
@@ -385,6 +393,15 @@ class RVDatabase:
                 return {"Authorization": f"GoogleLogin auth={val}"}
         raise RuntimeError("Couldn't find auth= key")

+    @functools.cached_property
+    def auth_headers(self) -> dict[str, str]:
+        try:
+            return self.metafile_read(".auth_headers")
+        except FileNotFoundError:
+            headers = self._auth_headers()
+            self.metafile_write(".auth_headers", headers)
+            return headers
+
     def fetch_feed_elements(self) -> typing.Generator[dict, None, None]:
         log.info("Fetching RSS feed")
         continuation: typing.Optional[str] = None
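auth_headers now survives across runs as a `.auth_headers` metafile (the leading dot matters: clean_folder() below skips dotfiles, so the token is never garbage-collected), with _auth_headers() doing the actual ClientLogin round trip. Only the return and the error are visible in this hunk; the sketch below paraphrases the surrounding parsing, with a hypothetical endpoint and credentials:

import requests

def client_login(url: str, email: str, passwd: str) -> dict[str, str]:
    r = requests.get(
        f"{url}/accounts/ClientLogin",
        params={"Email": email, "Passwd": passwd},
    )
    r.raise_for_status()
    # The body is line-oriented "Key=value" pairs; the auth token is the
    # one that matters for the GoogleLogin authorization scheme.
    for line in r.text.splitlines():
        key, _, val = line.partition("=")
        if key.lower() == "auth":
            return {"Authorization": f"GoogleLogin auth={val}"}
    raise RuntimeError("Couldn't find auth= key")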
@@ -409,45 +426,47 @@
         while continuation:
             yield from next_page()

-    def build_list(self, items: typing.Iterable[dict]) -> None:
+    def fetch_cache_elements(self) -> typing.Generator[dict, None, None]:
+        log.info("Fetching from cache")
+        for file in os.listdir(self.METADATA_FOLDER):
+            if not file.endswith(".item"):
+                continue
+            yield self.metafile_read(file)
+
+    def build_list(self, items: typing.Iterable[dict], save: bool = False) -> None:
         self.elements = []
         for item in items:
             element = RVElement(self, item)
             self.elements.insert(0, element)
             log.debug(f"Known: {element}")
+            if save:
+                element.save()

     def read_feed(self) -> None:
-        self.build_list(self.fetch_feed_elements())
+        self.build_list(self.fetch_feed_elements(), save=True)
+
+    def read_cache(self) -> None:
+        self.build_list(self.fetch_cache_elements())
+
+    def clean_folder(self, folder: str, basenames: set[str]) -> None:
+        for file in os.listdir(folder):
+            path = os.path.join(folder, file)
+            if not os.path.isfile(path) or file[0] == ".":
+                continue
+            for basename in basenames:
+                if file.startswith(basename):
+                    break
+            else:
+                log.info(f"Removing unknown file: {path}")
+                if not self.args.dryrun:
+                    os.unlink(path)

     def clean(self) -> None:
         log.debug("Cleaning")
-        filenames = set()
-        for element in self.elements:
-            if element.is_video:
-                filenames.add(element.filename)
-        for file in os.listdir():
-            if file == RVDatabase.SAVE_FILE:
-                continue
-            if not os.path.isfile(file):
-                continue
-            for filename in filenames:
-                if file.startswith(filename):
-                    break
-            else:
-                log.info(f"Removing unknown file: {file}")
-                if not self.args.dryrun:
-                    os.unlink(file)
-
-    @property
-    def all_researched(self) -> bool:
-        for element in self.elements:
-            if not element.is_researched:
-                return False
-        return True
-
-    def attempt_clean(self) -> None:
-        if self.all_researched:
-            self.clean()
+        filenames = set(el.basename for el in self.elements if el.is_video)
+        self.clean_folder(".", filenames)
+        ids = set(el.sid for el in self.elements)
+        self.clean_folder(self.METADATA_FOLDER, ids)

     @property
     def ytdl_opts(self) -> dict:
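clean_folder() leans on Python's for/else: the else branch runs only when the loop was never broken out of, i.e. when no known basename matched the file. A compact illustration with hypothetical stems:

known = {"Some Video [abc123]", "Other Video [def456]"}

for file in ["Some Video [abc123].webm", "stray.webm"]:
    for basename in known:
        if file.startswith(basename):
            break  # file is accounted for; the else is skipped
    else:
        print(f"Removing unknown file: {file}")  # only "stray.webm"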
@@ -468,7 +487,9 @@
         elements: typing.Iterable[RVElement]
         # Inexpensive sort
         if args.order == "new":
-            elements = reversed(elements_src)
+            elements = sorted(elements_src, key=lambda el: el.date, reverse=True)
+        elif args.order == "old":
+            elements = sorted(elements_src, key=lambda el: el.date)
         elif args.order == "title":
             elements = sorted(elements_src, key=lambda el: el.title)
         elif args.order == "creator":
@@ -478,8 +499,6 @@
         elif args.order == "random":
             elements = elements_src
             random.shuffle(elements)
-        else:
-            elements = elements_src

         # Possibly expensive filtering
         elements = filter(lambda el: el.matches_filter(args), elements)
@@ -575,11 +594,6 @@ def get_args() -> configargparse.Namespace:
         env_var="RSS_VIDEOS_PASSWD",
         required=True,
     )
-    parser.add(
-        "--research",
-        help="Fetch video info again",
-        action="store_true",
-    )
     parser.add(
         "--no-refresh",
         dest="refresh",
@@ -641,79 +655,56 @@ def get_args() -> configargparse.Namespace:
             "list",
             "watch",
             "binge",
-            "clean",
         ),
         default="download",
     )

     args = parser.parse_args()
     args.videos = os.path.realpath(os.path.expanduser(args.videos))
     if not args.duration and args.max_duration:
         args.duration = str(args.max_duration)

     return args


 def get_database(args: configargparse.Namespace) -> RVDatabase:
-    database = RVDatabase(args)
-    cache = RVDatabase.load()
-    feed_fetched = False
-    if cache:
-        database.salvage_cache_pre(cache)
-    if args.refresh:
-        try:
-            database.read_feed()
-            feed_fetched = True
-        except requests.ConnectionError as err:
-            if args.action == "download":
-                raise RuntimeError("Couldn't fetch feed, refusing to download")
-            # This is a quirky failsafe in case of no internet connection,
-            # so the script doesn't go noting that no element is a video.
-            log.warning(f"Couldn't fetch feed: {err}")
-    if not feed_fetched:
-        if cache:
-            log.warning("Using cached feed.")
-            database.import_cache(cache)
-        else:
-            raise FileNotFoundError("Feed not fetched and no cached feed.")
-    if cache:
-        database.salvage_cache(cache)
-    database.clean_cache(cache)
-    database.save()
+    cache = RVDatabase(args)
+    cache.read_cache()
+    if not args.refresh:
+        return cache

-    return database
+    fresh = RVDatabase(args)
+    fresh.read_feed()
+    fresh.clean_cache(cache)
+    return fresh


 def main() -> None:
     args = get_args()
     configure_logging(args)

-    os.makedirs(args.videos, exist_ok=True)
+    metadata_dir = os.path.join(args.videos, RVDatabase.METADATA_FOLDER)
+    for dir in (args.videos, metadata_dir):
+        os.makedirs(dir, exist_ok=True)
     os.chdir(args.videos)

     database = get_database(args)
+    database.clean()

     log.debug("Running action")
-    if args.action == "clean":
-        database.clean()
-    else:
-        duration = 0
-        for element in database.filter(args):
-            duration += element.duration if element.is_video else 0
-            if args.action == "download":
-                element.preload()
-            elif args.action == "list":
-                print(element)
-            elif args.action in ("watch", "binge"):
-                element.watch()
-                if args.action == "watch":
-                    break
-            else:
-                raise NotImplementedError(f"Unimplemented action: {args.action}")
-        log.info(f"Total duration: {format_duration(duration)}")
-        database.attempt_clean()
+    duration = 0
+    for element in database.filter(args):
+        duration += element.duration if element.is_video else 0
+        if args.action == "download":
+            element.download()
+        elif args.action == "list":
+            print(element)
+        elif args.action in ("watch", "binge"):
+            element.watch()
+            if args.action == "watch":
+                break
+        else:
+            raise NotImplementedError(f"Unimplemented action: {args.action}")
+    log.info(f"Total duration: {format_duration(duration)}")
     database.try_mark_watched_read()
-    database.save()


 if __name__ == "__main__":
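Taken together, get_database() reduces to a cache-first flow; condensed below with comments (a recap of the hunk above, not new behaviour):

def get_database(args):
    cache = RVDatabase(args)
    cache.read_cache()        # always cheap: .item pickles under .metadata/
    if not args.refresh:      # --no-refresh: trust the on-disk view
        return cache
    fresh = RVDatabase(args)
    fresh.read_feed()         # hit the Reader API, writing .item metafiles
    fresh.clean_cache(cache)  # items gone from the feed lose their files
    return fresh

Since all shared state now lives in per-item files and database.clean() runs on every invocation, the dedicated "clean" action and the cache-salvaging dance become unnecessary and are dropped.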