rssVideos: Better sanitization of ytdl info

rssVideos: --total-duration
Controlled binging
2021-12-20 18:57:13 +01:00 · 2021-12-19 23:13:41 +01:00 · 2021-12-19 22:29:16 +01:00 · 2021-12-19 15:10:16 +01:00 · 2021-12-19 11:45:41 +01:00 · 2021-12-19 10:59:02 +01:00
1 changed files with 209 additions and 82 deletions
--- a/config/scripts/rssVideos
+++ b/config/scripts/rssVideos
@ -17,6 +17,7 @@ import random
 import re
 import subprocess
 import sys
+import time
 import typing
 import urllib.parse
 import urllib.request
@ -25,11 +26,12 @@ from xml.dom import minidom

 import coloredlogs
 import configargparse
-import yt_dlp as youtube_dl
+import yt_dlp

 log = logging.getLogger(__name__)

 # TODO Lockfile, or a way to parallel watch and download
+# TODO Save ytdl infos and view info separately

 def configure_logging(args: configargparse.Namespace) -> None:
    # Configure logging
@ -44,16 +46,76 @@ def configure_logging(args: configargparse.Namespace) -> None:
        )


+class SaveInfoPP(yt_dlp.postprocessor.common.PostProcessor):
+    """
+    yt_dlp.process_ie_result() doesn't return a completely updated info dict,
+    notably the extension is still the one before it realizes the files cannot
+    be merged. So we use this PostProcessor to catch the info dict in its final
+    form and save what we need from it (it's not serializable in this state).
+    """
+
+    def __init__(self, rvelement: "RVElement") -> None:
+        # Element whose update_post_download() is called from run()
+        self.rvelement = rvelement
+        super().__init__()
+
+    def run(self, info: dict) -> tuple[list, dict]:
+        # Hand the info dict to the element, then return it untouched along
+        # with an empty list.
+        # NOTE(review): presumably the list is yt_dlp's "files to delete";
+        # confirm against the PostProcessor documentation.
+        self.rvelement.update_post_download(info)
+        return [], info
+
+def parse_duration(string: str) -> int:
+    """
+    Convert a duration string into a number of seconds.
+
+    The string is an integer optionally followed by a one-letter unit
+    (case-insensitive): "s" for seconds, "m" for minutes, "h" for hours.
+    Without a unit, the number is taken as seconds.
+
+    Raises ValueError when the string is empty, the unit is unknown or the
+    numeric part is not an integer.
+    """
+    DURATION_MULTIPLIERS = {"s": 1, "m": 60, "h": 3600, "": 1}
+
+    if not string:
+        raise ValueError("Empty duration")
+
+    mult_index = string[-1].lower()
+    if mult_index.isdigit():
+        mult_index = ""
+    else:
+        string = string[:-1]
+    try:
+        multiplier = DURATION_MULTIPLIERS[mult_index]
+    except KeyError:
+        # A missing dict key raises KeyError (not IndexError)
+        raise ValueError(f"Unknown duration multiplier: {mult_index}") from None
+
+    return int(string) * multiplier
+
+
+def compare_duration(compstr: str) -> typing.Callable[[int], bool]:
+    """
+    Build a predicate testing a duration (in seconds) against compstr.
+
+    compstr is an optional one-character comparator followed by a duration
+    understood by parse_duration(): "<" or "-" for shorter than, ">" or "+"
+    for longer than, "=" for exactly, and no comparator for at most.
+
+    Raises ValueError when compstr is empty, starts with an unknown
+    comparator or has no duration after the comparator.
+    """
+    # The operator functions (rather than the unbound int.__lt__ & co.) also
+    # accept a duration that is not exactly an int, such as a float.
+    import operator
+
+    DURATION_COMPARATORS = {
+        "<": operator.lt,
+        "-": operator.lt,
+        ">": operator.gt,
+        "+": operator.gt,
+        "=": operator.eq,
+        "": operator.le,
+    }
+
+    if not compstr:
+        raise ValueError("Empty duration comparison")
+
+    comp_index = compstr[0]
+    if comp_index.isdigit():
+        comp_index = ""
+    else:
+        compstr = compstr[1:]
+    try:
+        comparator = DURATION_COMPARATORS[comp_index]
+    except KeyError:
+        # A missing dict key raises KeyError (not IndexError)
+        raise ValueError(f"Unknown duration comparator: {comp_index}") from None
+
+    if not compstr:
+        raise ValueError(f"Missing duration after comparator: {comp_index}")
+    duration = parse_duration(compstr)
+
+    return lambda d: comparator(d, duration)
+
+def format_duration(duration: int) -> str:
+    """
+    Format a number of seconds as HH:MM:SS.
+
+    The hours field is not capped at 23, so totals of a day or more (as can
+    happen when summing several videos) are shown in full instead of
+    wrapping around.
+    """
+    minutes, seconds = divmod(int(duration), 60)
+    hours, minutes = divmod(minutes, 60)
+    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+
+
 class RVElement:
    parent: "RVDatabase"
    item: minidom.Element
-    was_downloaded: bool
+    downloaded_filepath: typing.Optional[str]
    watched: bool

    def __init__(self, parent: "RVDatabase", item: minidom.Element) -> None:
+        # Keep a reference to the database and to the wrapped XML element
        self.parent = parent
        self.item = item
-        self.was_downloaded = False
+        # No downloaded file recorded yet
+        self.downloaded_filepath = None
        self.watched = False

    def get_tag_data(self, tag_name: str) -> str:
@ -101,16 +163,25 @@ class RVElement:
        return "ytdl_infos" in self.__dict__

    def salvage_cache(self, cache: "RVElement") -> None:
+        """
+        Copy state from a cached element: its researched ytdl infos (unless
+        --research was given), its downloaded file path and its watched flag.
+        """
-        if cache.is_researched:
+        if not self.parent.args.research and cache.is_researched:
            self.__dict__["ytdl_infos"] = cache.__dict__["ytdl_infos"]
            log.debug(f"From cache: {self}")
-        if cache.was_downloaded:
-            self.was_downloaded = True
+        if cache.downloaded_filepath:
+            self.downloaded_filepath = cache.downloaded_filepath
        if cache.watched:
            self.watched = True

    def __str__(self) -> str:
-        return f"{self.guid}: {self.creator} – {self.title} – {self.link}"
+        # One-line summary: guid, creator ('?' when missing), title, then the
+        # duration for researched videos, "(N/A)" for researched non-videos or
+        # "(?)" when not researched yet, and finally the link.
+        # The local is not called `str` so the builtin stays usable here.
+        desc = f"{self.guid}: {self.creator if self.creator else '?'} – {self.title}"
+        if self.is_researched:
+            if self.is_video:
+                desc += f" ({format_duration(self.duration)})"
+            else:
+                desc += " (N/A)"
+        else:
+            desc += " (?)"
+        desc += f" – {self.link}"
+        return desc

    @property
    def downloaded(self) -> bool:
@ -122,21 +193,15 @@ class RVElement:
    def ytdl_infos(self) -> typing.Optional[dict]:
        log.info(f"Researching: {self}")
        try:
-            infos = self.parent.ytdl_dry.extract_info(self.link)
+            infos = self.parent.ytdl_dry.extract_info(self.link, download=False)
        except KeyboardInterrupt as e:
            raise e
-        except youtube_dl.utils.DownloadError as e:
+        except yt_dlp.utils.DownloadError as e:
            # TODO Still raise in case of temporary network issue
            log.warning(e)
            infos = None
-        # Apparently that thing is transformed from a LazyList
-        # somewhere in the normal yt_dlp process
-        if (
-            infos
-            and "thumbnails" in infos
-            and isinstance(infos["thumbnails"], youtube_dl.utils.LazyList)
-        ):
-            infos["thumbnails"] = infos["thumbnails"].exhaust()
+        if infos:
+            infos = self.parent.ytdl_dry.sanitize_info(infos)
        # Save database once it's been computed
        self.__dict__["ytdl_infos"] = infos
        self.parent.save()
@ -156,7 +221,8 @@ class RVElement:
    @property
    def filepath(self) -> str:
+        """
+        Path of the video file: the path recorded after download when there
+        is one, otherwise the filename prepared from ytdl_infos.
+        """
        assert self.is_video
-        # TODO This doesn't change the extension to mkv when the formats are incomaptible
+        if self.downloaded_filepath:
+            return self.downloaded_filepath
        return self.parent.ytdl_dry.prepare_filename(self.ytdl_infos)

    @property
@ -168,10 +234,18 @@ class RVElement:
        assert self.is_video
        log.info(f"Downloading: {self}")
        if not self.parent.args.dryrun:
-            self.parent.ytdl.process_ie_result(self.ytdl_infos, True, {})
-        self.was_downloaded = True
+            with yt_dlp.YoutubeDL(self.parent.ytdl_opts) as ydl:
+                ydl.add_post_processor(SaveInfoPP(self))
+                ydl.process_ie_result(self.ytdl_infos, download=True)
        self.parent.save()

+    def update_post_download(self, info: dict) -> None:
+        """
+        Record the path of the downloaded file, as computed from the given
+        info dict by prepare_filename(). Called by SaveInfoPP.run().
+        """
+        self.downloaded_filepath = self.parent.ytdl_dry.prepare_filename(info)
+
+    @property
+    def was_downloaded(self) -> bool:
+        """Whether a downloaded file path has been recorded for this element."""
+        return self.downloaded_filepath is not None
+
    def preload(self) -> None:
        assert self.is_video
        if self.downloaded:
@ -182,18 +256,8 @@ class RVElement:
            return
        self.download()

-    MATCHES_DURATION_MULTIPLIERS = {"s": 1, "m": 60, "h": 3600, None: 1}
-
-    MATCHES_DURATION_COMPARATORS = {
-        "<": int.__lt__,
-        "-": int.__lt__,
-        ">": int.__gt__,
-        "+": int.__gt__,
-        "=": int.__eq__,
-        None: int.__le__,
-    }
-
    def matches_filter(self, args: configargparse.Namespace) -> bool:
+        # Inexpensive filters
        if args.seen != "any" and (args.seen == "seen") != self.watched:
            log.debug(f"Not {args.seen}: {self}")
            return False
@ -206,39 +270,22 @@ class RVElement:
        if args.link and not re.search(args.link, self.link):
            log.debug(f"Link not matching {args.link}: {self}")
            return False
-        if args.creator and (not self.creator or not re.search(args.creator, self.creator)):
+        if args.creator and (
+            not self.creator or not re.search(args.creator, self.creator)
+        ):
            log.debug(f"Creator not matching {args.creator}: {self}")
            return False
+
+        # Expensive filters
        if not self.is_video:
            log.debug(f"Not a video: {self}")
            return False
-        if args.duration:
-            dur = args.duration
+        if args.duration and not compare_duration(args.duration)(self.duration):
+            log.debug(
+                f"Duration {self.duration} not matching {args.duration}: {self}"
+            )
+            return False

-            mult_index = dur[-1].lower()
-            if mult_index.isdigit():
-                mult_index = None
-            else:
-                dur = dur[:-1]
-            try:
-                multiplier = self.MATCHES_DURATION_MULTIPLIERS[mult_index]
-            except IndexError:
-                raise ValueError(f"Unknown duration multiplier: {mult_index}")
-
-            comp_index = dur[0]
-            if comp_index.isdigit():
-                comp_index = None
-            else:
-                dur = dur[1:]
-            try:
-                comparator = self.MATCHES_DURATION_COMPARATORS[comp_index]
-            except IndexError:
-                raise ValueError(f"Unknown duration comparator: {comp_index}")
-
-            duration = int(dur)
-            if not comparator(self.duration, duration * multiplier):
-                log.debug(f"Duration {self.duration} not matching {args.duration}: {self}")
-                return False
        return True

    def watch(self) -> None:
@ -365,28 +412,60 @@ class RVDatabase:
    @property
    def ytdl_dry_opts(self) -> dict:
+        """Copy of ytdl_opts with the "quiet" option turned on."""
        opts = self.ytdl_opts.copy()
-        opts.update({"simulate": True, "quiet": True})
+        opts.update({"quiet": True})
        return opts

    @property
-    def ytdl(self) -> youtube_dl.YoutubeDL:
-        return youtube_dl.YoutubeDL(self.ytdl_opts)
-
-    @property
-    def ytdl_dry(self) -> youtube_dl.YoutubeDL:
-        return youtube_dl.YoutubeDL(self.ytdl_dry_opts)
+    def ytdl_dry(self) -> yt_dlp.YoutubeDL:
+        """Build a YoutubeDL from ytdl_dry_opts (a new instance on each access)."""
+        return yt_dlp.YoutubeDL(self.ytdl_dry_opts)

    def filter(self, args: configargparse.Namespace) -> typing.Iterable[RVElement]:
+        """
+        Return the elements selected by the command-line arguments.
+
+        Steps, in order: sort by a cheap key (args.order), keep the elements
+        accepted by matches_filter(), sort by duration when asked to, then,
+        if args.total_duration is set, greedily keep the elements that fit
+        in that time budget.
+        """
        elements: typing.Iterable[RVElement]
-        if args.order == "old":
-            elements = self.elements
-        elif args.order == "new":
+        # Inexpensive sort
+        if args.order == "new":
            elements = reversed(self.elements)
+        elif args.order == "title":
+            elements = sorted(self.elements, key=lambda el: el.title)
+        elif args.order == "creator":
+            elements = sorted(self.elements, key=lambda el: el.creator or "")
+        elif args.order == "link":
+            elements = sorted(self.elements, key=lambda el: el.link)
        elif args.order == "random":
            elements_random = self.elements.copy()
            random.shuffle(elements_random)
            elements = elements_random
-        return filter(lambda el: el.matches_filter(args), elements)
+        else:
+            elements = self.elements
+
+        # Possibly expensive filtering
+        elements = filter(lambda el: el.matches_filter(args), elements)
+
+        # Expensive sort
+        if args.order == "short":
+            elements = sorted(
+                elements, key=lambda el: el.duration if el.is_video else 0
+            )
+        elif args.order == "long":
+            elements = sorted(
+                elements, key=lambda el: el.duration if el.is_video else 0, reverse=True
+            )
+
+        # Post sorting filtering
+        if args.total_duration:
+            rem = parse_duration(args.total_duration)
+            # Greedy selection in a single pass: the remaining budget only
+            # shrinks, so an element that did not fit earlier cannot fit
+            # later and there is no need to rescan from the start.
+            # An element must be strictly shorter than what remains.
+            fitting = []
+            for el in elements:
+                if rem > 0 and el.duration < rem:
+                    fitting.append(el)
+                    rem -= el.duration
+            elements = fitting
+
+        return elements


 def get_args() -> configargparse.Namespace:
@ -428,6 +507,17 @@ def get_args() -> configargparse.Namespace:
        env_var="RSS_VIDEOS_FEED",
        required=True,
    )
+    parser.add(
+        "--research",
+        help="Fetch video info again",
+        action="store_true",
+    )
+    parser.add(
+        "--no-refresh",
+        dest="refresh",
+        help="Don't fetch feed",
+        action="store_false",
+    )
    parser.add(
        "--videos",
        help="Directory to store videos",
@ -438,7 +528,7 @@ def get_args() -> configargparse.Namespace:
    # Which videos
    parser.add(
        "--order",
-        choices=("old", "new", "random"),
+        choices=("old", "new", "title", "creator", "link", "short", "long", "random"),
        default="old",
        help="Sorting mechanism",
    )
@ -447,7 +537,16 @@ def get_args() -> configargparse.Namespace:
    parser.add("--title", help="Regex to filter by title")
    parser.add("--link", help="Regex to filter by link")
    parser.add("--duration", help="Comparative to filter by duration")
-    parser.add("--seen", choices=("seen","unseen","any"), default="unseen", help="Only include seen/unseen/any videos")
+    parser.add(
+        "--seen",
+        choices=("seen", "unseen", "any"),
+        default="unseen",
+        help="Only include seen/unseen/any videos",
+    )
+    parser.add(
+        "--total-duration",
+        help="Use videos that fit under the total given",
+    )
    # TODO Envrionment variables
    parser.add(
        "--max-duration",
@ -476,7 +575,15 @@ def get_args() -> configargparse.Namespace:
    parser.add(
        "action",
        nargs="?",
-        choices=("download", "list", "watch", "binge", "clean"),
+        choices=(
+            "download",
+            "list",
+            "watch",
+            "binge",
+            "clean",
+            "seen",
+            "unseen",
+        ),
        default="download",
    )

@ -497,14 +604,22 @@ def main() -> None:

    database = RVDatabase(args)
    cache = RVDatabase.load()
-    try:
-        database.read_feed()
-    except urllib.error.URLError as err:
-        if args.action == "download" or not cache:
-            raise err
-        else:
-            log.warning("Cannot fetch RSS feed, using cached feed.", err)
+    feed_fetched = False
+    if args.refresh:
+        try:
+            database.read_feed()
+            feed_fetched = True
+        except urllib.error.URLError as err:
+            if args.action == "download":
+                raise RuntimeError("Couldn't fetch feed, refusing to download")
+                # This is a quirky failsafe in case of no internet connection,
+                # so the script doesn't go noting that no element is a video.
+    if not feed_fetched:
+        if cache:
+            log.warning("Using cached feed.")
            database.import_cache(cache)
+        else:
+            raise FileNotFoundError("Feed not fetched and no cached feed.")
    if cache:
        database.salvage_cache(cache)
        database.clean_cache(cache)
@ -514,7 +629,7 @@ def main() -> None:
    if args.action == "clean":
        database.clean()
    else:
-        database.attempt_clean()
+        duration = 0
        for element in database.filter(args):
            if args.action == "download":
                element.preload()
@ -522,8 +637,20 @@ def main() -> None:
                print(element)
            elif args.action in ("watch", "binge"):
                element.watch()
-            if args.action == "watch":
-                break
+                if args.action == "watch":
+                    break
+            elif args.action == "seen":
+                if not element.watched:
+                    log.info(f"Maked as seen: {element}")
+                    element.watched = True
+            elif args.action == "unseen":
+                if element.watched:
+                    log.info(f"Maked as unseen: {element}")
+                    element.watched = False
+            else:
+                raise NotImplementedError(f"Unimplemented action: {args.action}")
+            duration += element.duration if element.is_video else 0
+        log.info(f"Total duration: {format_duration(duration)}")
        database.attempt_clean()
    database.save()
Author	SHA1	Message	Date
Geoffrey Frogeye	105bd9461c	rssVideos: Better sanitization of ytdl info	2021-12-20 18:57:13 +01:00
Geoffrey Frogeye	5b7926df8f	rssVideos: --total-duration Controlled binging	2021-12-19 23:13:41 +01:00
Geoffrey Frogeye	00a9da6afc	rssVideos: Allow skipping feed fetching For dev speed	2021-12-19 22:29:16 +01:00
Geoffrey Frogeye	daff602a31	rssVideos: Work correctly with merged files	2021-12-19 15:10:16 +01:00
Geoffrey Frogeye	9684586eec	rssVideos: More sort orders and duration command	2021-12-19 11:45:41 +01:00
Geoffrey Frogeye	4890555668	rssVideos: Can toggle seen/unseen video state	2021-12-19 10:59:02 +01:00