rssVideos: Now thread-safe (kinda)

2022-03-23 18:54:05 +01:00 · 2022-03-23 18:54:05 +01:00 · d88520552b
commit d88520552b
parent 2e759f9fc6
1 changed files with 167 additions and 176 deletions
--- a/config/scripts/rssVideos
+++ b/config/scripts/rssVideos
@ -9,6 +9,7 @@ with the unread items (non-video links are ignored).
 """

 import datetime
+import filelock
 import functools
 import logging
 import os
@ -26,8 +27,6 @@ import yt_dlp

 log = logging.getLogger(__name__)

-# TODO Lockfile, or a way to parallel watch and download
-

 def configure_logging(args: configargparse.Namespace) -> None:
    # Configure logging
@ -107,17 +106,33 @@ def format_duration(duration: int) -> str:
 class RVElement:
    parent: "RVDatabase"
    item: dict
-    downloaded_filepath: typing.Optional[str]
+
+    RERESEARCH_AFTER = datetime.timedelta(hours=1)

    def __init__(self, parent: "RVDatabase", item: dict) -> None:
        self.parent = parent
        self.item = item
-        self.downloaded_filepath = None

    @property
    def id(self) -> str:
        return self.item["id"]

+    @property
+    def sid(self) -> str:
+        return self.id.split("/")[-1]
+
+    def metafile(self, extension: str) -> str:
+        return os.path.join(self.parent.METADATA_FOLDER, f"{self.sid}.{extension}")
+
+    def metafile_read(self, extension: str) -> typing.Any:
+        return self.parent.metafile_read(f"{self.sid}.{extension}")
+
+    def metafile_write(self, extension: str, data: typing.Any) -> None:
+        return self.parent.metafile_write(f"{self.sid}.{extension}", data)
+
+    def save(self) -> None:
+        self.metafile_write("item", self.item)
+
    @property
    def title(self) -> str:
        return self.item["title"]
@ -136,14 +151,8 @@ class RVElement:

    @property
    def is_researched(self) -> bool:
-        return "ytdl_infos" in self.__dict__
-
-    def salvage_cache(self, cache: "RVElement") -> None:
-        if cache.is_researched:
-            self.__dict__["ytdl_infos"] = cache.__dict__["ytdl_infos"]
-            log.debug(f"From cache: {self}")
-        if cache.downloaded_filepath:
-            self.downloaded_filepath = cache.downloaded_filepath
+        metafile = self.metafile("ytdl")
+        return os.path.isfile(metafile)

    def __str__(self) -> str:
        str = f"{self.date.strftime('%y-%m-%d %H:%M')} ("
@ -169,6 +178,14 @@ class RVElement:

    @functools.cached_property
    def ytdl_infos(self) -> typing.Optional[dict]:
+        try:
+            return self.metafile_read("ytdl")
+        except (FileNotFoundError, TypeError, AttributeError, EOFError):
+            infos = self._ytdl_infos()
+            self.metafile_write("ytdl", infos)
+        return infos
+
+    def _ytdl_infos(self) -> typing.Optional[dict]:
        log.info(f"Researching: {self}")
        try:
            infos = self.parent.ytdl_dry.extract_info(self.link, download=False)
@ -180,9 +197,6 @@ class RVElement:
            infos = None
        if infos:
            infos = self.parent.ytdl_dry.sanitize_info(infos)
-        # Save database once it's been computed
-        self.__dict__["ytdl_infos"] = infos
-        self.parent.save()
        return infos

    @property
@ -196,6 +210,18 @@ class RVElement:
        # Duration might be missing in playlists and stuff
        return self.ytdl_infos is not None and "duration" in self.ytdl_infos

+    @functools.cached_property
+    def downloaded_filepath(self) -> typing.Optional[str]:
+        try:
+            return self.metafile_read("path")
+        except FileNotFoundError:
+            return None
+
+    @property
+    def was_downloaded(self) -> bool:
+        metafile = self.metafile("path")
+        return os.path.exists(metafile)
+
    @property
    def filepath(self) -> str:
        assert self.is_video
@ -204,37 +230,36 @@ class RVElement:
        return self.parent.ytdl_dry.prepare_filename(self.ytdl_infos)

    @property
-    def filename(self) -> str:
+    def basename(self) -> str:
        assert self.is_video
        return os.path.splitext(self.filepath)[0]

+    def expire_info(self) -> None:
+        metafile = self.metafile("ytdl")
+        if os.path.isfile(metafile):
+            stat = os.stat(metafile)
+            mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
+            diff = datetime.datetime.now() - mtime
+            if diff > self.RERESEARCH_AFTER:
+                os.unlink(metafile)
+                del self.ytdl_infos
+
    def download(self) -> None:
        assert self.is_video
+        if self.downloaded:
+            return
+        self.expire_info()
        log.info(f"Downloading: {self}")
-        if self.parent.args.research:
-            del self.ytdl_infos
-        if not self.parent.args.dryrun:
-            with yt_dlp.YoutubeDL(self.parent.ytdl_opts) as ydl:
-                ydl.add_post_processor(SaveInfoPP(self))
-                ydl.process_ie_result(self.ytdl_infos, download=True)
-        self.parent.save()
+        lockfile = self.metafile("lock")
+        with filelock.FileLock(lockfile):
+            if not self.parent.args.dryrun:
+                with yt_dlp.YoutubeDL(self.parent.ytdl_opts) as ydl:
+                    ydl.add_post_processor(SaveInfoPP(self))
+                    ydl.process_ie_result(self.ytdl_infos, download=True)

    def update_post_download(self, info: dict) -> None:
        self.downloaded_filepath = self.parent.ytdl_dry.prepare_filename(info)
-
-    @property
-    def was_downloaded(self) -> bool:
-        return self.downloaded_filepath is not None
-
-    def preload(self) -> None:
-        assert self.is_video
-        if self.downloaded:
-            log.debug(f"Currently downloaded: {self}")
-            return
-        if self.was_downloaded:
-            log.debug(f"Downloaded previously: {self}")
-            return
-        self.download()
+        self.metafile_write("path", self.downloaded_filepath)

    @property
    def watched(self) -> bool:
@ -270,8 +295,7 @@ class RVElement:
        return True

    def watch(self) -> None:
-        if not self.downloaded:
-            self.download()
+        self.download()

        cmd = ["mpv", self.filepath]
        log.debug(f"Running {cmd}")
@ -279,17 +303,27 @@ class RVElement:
            proc = subprocess.run(cmd)
            proc.check_returncode()

-        self.clean()
+        self.undownload()
        self.try_mark_read()

-    def clean(self) -> None:
-        assert self.is_video
-        log.info(f"Removing gone video: {self.filename}*")
-        for file in os.listdir():
-            if file.startswith(self.filename):
-                log.debug(f"Removing file: {file}")
+    def clean_file(self, folder: str, basename: str) -> None:
+        for file in os.listdir(folder):
+            if file.startswith(basename):
+                path = os.path.join(folder, file)
+                log.debug(f"Removing file: {path}")
                if not self.parent.args.dryrun:
-                    os.unlink(file)
+                    os.unlink(path)
+
+    def undownload(self) -> None:
+        assert self.is_video
+        log.info(f"Removing gone video: {self.basename}*")
+        self.clean_file(".", self.basename)
+
+    def clean(self) -> None:
+        if self.is_video:
+            self.undownload()
+        log.info(f"Removing gone metadata: {self.sid}*")
+        self.clean_file(self.parent.METADATA_FOLDER, self.sid)

    def mark_read(self) -> None:
        log.debug(f"Marking {self} read")
@ -309,7 +343,7 @@ class RVElement:
        if r.text.strip() != "OK":
            raise RuntimeError(f"Couldn't mark {self} as read: {r.text}")
        log.info(f"Marked {self} as read")
-        self.parent.elements.remove(self)
+        self.clean()

    def try_mark_read(self) -> None:
        try:
@ -319,7 +353,7 @@ class RVElement:


 class RVDatabase:
-    SAVE_FILE = ".cache.p"
+    METADATA_FOLDER = ".metadata"

    args: configargparse.Namespace
    elements: list[RVElement]
@ -327,53 +361,27 @@ class RVDatabase:
    def __init__(self, args: configargparse.Namespace) -> None:
        self.args = args

-    def save(self) -> None:
-        log.debug("Saving cache")
-        if self.args.dryrun:
-            return
-        with open(self.SAVE_FILE, "wb") as save_file:
-            pickle.dump(self, save_file)
+    def metafile_read(self, name: str) -> typing.Any:
+        path = os.path.join(self.METADATA_FOLDER, name)
+        log.debug(f"Reading {path}")
+        with open(path, "rb") as mf:
+            return pickle.load(mf)

-    @classmethod
-    def load(cls) -> typing.Optional["RVDatabase"]:
-        try:
-            with open(cls.SAVE_FILE, "rb") as save_file:
-                return pickle.load(save_file)
-        except (TypeError, AttributeError, EOFError):
-            log.warning("Corrupt / outdated cache, it will be rebuilt.")
-        except FileNotFoundError:
-            pass
-        return None
-
-    def salvage_cache_pre(self, cache: "RVDatabase") -> None:
-        if "auth_headers" in cache.__dict__:
-            self.auth_headers = cache.auth_headers
-
-    def salvage_cache(self, cache: "RVDatabase") -> None:
-        log.debug("Salvaging cache")
-        cache_els = dict()
-        for cache_el in cache.elements:
-            cache_els[cache_el.id] = cache_el
-        for el in self.elements:
-            if el.id in cache_els:
-                el.salvage_cache(cache_els[el.id])
+    def metafile_write(self, name: str, data: typing.Any) -> None:
+        path = os.path.join(self.METADATA_FOLDER, name)
+        log.debug(f"Writing {path}")
+        if not self.args.dryrun:
+            with open(path, "wb") as mf:
+                pickle.dump(data, mf)

    def clean_cache(self, cache: "RVDatabase") -> None:
        log.debug("Cleaning cache")
-        self_els = dict()
-        for self_el in self.elements:
-            self_els[self_el.id] = self_el
+        fresh_ids = set(el.id for el in self.elements)
        for el in cache.elements:
-            if el.id not in self_els:
-                if el.is_researched and el.is_video:
-                    el.clean()
+            if el.id not in fresh_ids:
+                el.clean()

-    def import_cache(self, cache: "RVDatabase") -> None:
-        log.debug("Importing cache")
-        self.build_list([element.item for element in cache.elements])
-
-    @functools.cached_property
-    def auth_headers(self) -> dict[str, str]:
+    def _auth_headers(self) -> dict[str, str]:
        r = requests.get(
            f"{self.args.url}/accounts/ClientLogin",
            params={"Email": self.args.email, "Passwd": self.args.passwd},
@ -385,6 +393,15 @@ class RVDatabase:
                return {"Authorization": f"GoogleLogin auth={val}"}
        raise RuntimeError("Couldn't find auth= key")

+    @functools.cached_property
+    def auth_headers(self) -> dict[str, str]:
+        try:
+            return self.metafile_read(".auth_headers")
+        except FileNotFoundError:
+            headers = self._auth_headers()
+            self.metafile_write(".auth_headers", headers)
+            return headers
+
    def fetch_feed_elements(self) -> typing.Generator[dict, None, None]:
        log.info("Fetching RSS feed")
        continuation: typing.Optional[str] = None
@ -409,45 +426,47 @@ class RVDatabase:
            while continuation:
                yield from next_page()

-    def build_list(self, items: typing.Iterable[dict]) -> None:
+    def fetch_cache_elements(self) -> typing.Generator[dict, None, None]:
+        log.info("Fetching from cache")
+        for file in os.listdir(self.METADATA_FOLDER):
+            if not file.endswith(".item"):
+                continue
+            yield self.metafile_read(file)
+
+    def build_list(self, items: typing.Iterable[dict], save: bool = False) -> None:
        self.elements = []
        for item in items:
            element = RVElement(self, item)
            self.elements.insert(0, element)
            log.debug(f"Known: {element}")
+            if save:
+                element.save()

    def read_feed(self) -> None:
-        self.build_list(self.fetch_feed_elements())
+        self.build_list(self.fetch_feed_elements(), save=True)
+
+    def read_cache(self) -> None:
+        self.build_list(self.fetch_cache_elements())
+
+    def clean_folder(self, folder: str, basenames: set[str]) -> None:
+        for file in os.listdir(folder):
+            path = os.path.join(folder, file)
+            if not os.path.isfile(path) or file[0] == ".":
+                continue
+            for basename in basenames:
+                if file.startswith(basename):
+                    break
+            else:
+                log.info(f"Removing unknown file: {path}")
+                if not self.args.dryrun:
+                    os.unlink(path)

    def clean(self) -> None:
        log.debug("Cleaning")
-        filenames = set()
-        for element in self.elements:
-            if element.is_video:
-                filenames.add(element.filename)
-        for file in os.listdir():
-            if file == RVDatabase.SAVE_FILE:
-                continue
-            if not os.path.isfile(file):
-                continue
-            for filename in filenames:
-                if file.startswith(filename):
-                    break
-            else:
-                log.info(f"Removing unknown file: {file}")
-                if not self.args.dryrun:
-                    os.unlink(file)
-
-    @property
-    def all_researched(self) -> bool:
-        for element in self.elements:
-            if not element.is_researched:
-                return False
-        return True
-
-    def attempt_clean(self) -> None:
-        if self.all_researched:
-            self.clean()
+        filenames = set(el.basename for el in self.elements if el.is_video)
+        self.clean_folder(".", filenames)
+        ids = set(el.sid for el in self.elements)
+        self.clean_folder(self.METADATA_FOLDER, ids)

    @property
    def ytdl_opts(self) -> dict:
@ -468,7 +487,9 @@ class RVDatabase:
        elements: typing.Iterable[RVElement]
        # Inexpensive sort
        if args.order == "new":
-            elements = reversed(elements_src)
+            elements = sorted(elements_src, key=lambda el: el.date, reverse=True)
+        elif args.order == "old":
+            elements = sorted(elements_src, key=lambda el: el.date)
        elif args.order == "title":
            elements = sorted(elements_src, key=lambda el: el.title)
        elif args.order == "creator":
@ -478,8 +499,6 @@ class RVDatabase:
        elif args.order == "random":
            elements = elements_src
            random.shuffle(elements)
-        else:
-            elements = elements_src

        # Possibly expensive filtering
        elements = filter(lambda el: el.matches_filter(args), elements)
@ -575,11 +594,6 @@ def get_args() -> configargparse.Namespace:
        env_var="RSS_VIDEOS_PASSWD",
        required=True,
    )
-    parser.add(
-        "--research",
-        help="Fetch video info again",
-        action="store_true",
-    )
    parser.add(
        "--no-refresh",
        dest="refresh",
@ -641,79 +655,56 @@ def get_args() -> configargparse.Namespace:
            "list",
            "watch",
            "binge",
-            "clean",
        ),
        default="download",
    )

    args = parser.parse_args()
    args.videos = os.path.realpath(os.path.expanduser(args.videos))
-    if not args.duration and args.max_duration:
-        args.duration = str(args.max_duration)

    return args


 def get_database(args: configargparse.Namespace) -> RVDatabase:
-    database = RVDatabase(args)
-    cache = RVDatabase.load()
-    feed_fetched = False
-    if cache:
-        database.salvage_cache_pre(cache)
-    if args.refresh:
-        try:
-            database.read_feed()
-            feed_fetched = True
-        except requests.ConnectionError as err:
-            if args.action == "download":
-                raise RuntimeError("Couldn't fetch feed, refusing to download")
-                # This is a quirky failsafe in case of no internet connection,
-                # so the script doesn't go noting that no element is a video.
-            log.warning(f"Couldn't fetch feed: {err}")
-    if not feed_fetched:
-        if cache:
-            log.warning("Using cached feed.")
-            database.import_cache(cache)
-        else:
-            raise FileNotFoundError("Feed not fetched and no cached feed.")
-    if cache:
-        database.salvage_cache(cache)
-        database.clean_cache(cache)
-        database.save()
+    cache = RVDatabase(args)
+    cache.read_cache()
+    if not args.refresh:
+        return cache

-    return database
+    fresh = RVDatabase(args)
+    fresh.read_feed()
+    fresh.clean_cache(cache)
+    return fresh


 def main() -> None:
    args = get_args()
    configure_logging(args)

-    os.makedirs(args.videos, exist_ok=True)
+    metadata_dir = os.path.join(args.videos, RVDatabase.METADATA_FOLDER)
+    for dir in (args.videos, metadata_dir):
+        os.makedirs(dir, exist_ok=True)
    os.chdir(args.videos)

    database = get_database(args)
+    database.clean()

    log.debug("Running action")
-    if args.action == "clean":
-        database.clean()
-    else:
-        duration = 0
-        for element in database.filter(args):
-            duration += element.duration if element.is_video else 0
-            if args.action == "download":
-                element.preload()
-            elif args.action == "list":
-                print(element)
-            elif args.action in ("watch", "binge"):
-                element.watch()
-                if args.action == "watch":
-                    break
-            else:
-                raise NotImplementedError(f"Unimplemented action: {args.action}")
-        log.info(f"Total duration: {format_duration(duration)}")
-        database.attempt_clean()
+    duration = 0
+    for element in database.filter(args):
+        duration += element.duration if element.is_video else 0
+        if args.action == "download":
+            element.download()
+        elif args.action == "list":
+            print(element)
+        elif args.action in ("watch", "binge"):
+            element.watch()
+            if args.action == "watch":
+                break
+        else:
+            raise NotImplementedError(f"Unimplemented action: {args.action}")
+    log.info(f"Total duration: {format_duration(duration)}")
    database.try_mark_watched_read()
-    database.save()


 if __name__ == "__main__":