Compare commits

6 commits

@@ -17,6 +17,7 @@ import random
 import re
 import subprocess
 import sys
+import time
 import typing
 import urllib.parse
 import urllib.request
@@ -25,11 +26,12 @@ from xml.dom import minidom
 import coloredlogs
 import configargparse
-import yt_dlp as youtube_dl
+import yt_dlp

 log = logging.getLogger(__name__)

 # TODO Lockfile, or a way to parallel watch and download
+# TODO Save ytdl infos and view info separately


 def configure_logging(args: configargparse.Namespace) -> None:
     # Configure logging
@@ -44,16 +46,76 @@ def configure_logging(args: configargparse.Namespace) -> None:
     )


+class SaveInfoPP(yt_dlp.postprocessor.common.PostProcessor):
+    """
+    yt_dlp.process_ie_result() doesn't return a completely updated info dict,
+    notably the extension is still the one before it realizes the files cannot
+    be merged. So we use this PostProcessor to catch the info dict in its final
+    form and save what we need from it (it's not serializable in this state).
+    """
+
+    def __init__(self, rvelement: "RVElement") -> None:
+        self.rvelement = rvelement
+        super().__init__()
+
+    def run(self, info: dict) -> tuple[list, dict]:
+        self.rvelement.update_post_download(info)
+        return [], info
+
+
+def parse_duration(string: str) -> int:
+    DURATION_MULTIPLIERS = {"s": 1, "m": 60, "h": 3600, "": 1}
+
+    mult_index = string[-1].lower()
+    if mult_index.isdigit():
+        mult_index = ""
+    else:
+        string = string[:-1]
+    try:
+        multiplier = DURATION_MULTIPLIERS[mult_index]
+    except KeyError:
+        raise ValueError(f"Unknown duration multiplier: {mult_index}")
+    return int(string) * multiplier
+
+
+def compare_duration(compstr: str) -> typing.Callable[[int], bool]:
+    DURATION_COMPARATORS = {
+        "<": int.__lt__,
+        "-": int.__lt__,
+        ">": int.__gt__,
+        "+": int.__gt__,
+        "=": int.__eq__,
+        "": int.__le__,
+    }
+
+    comp_index = compstr[0]
+    if comp_index.isdigit():
+        comp_index = ""
+    else:
+        compstr = compstr[1:]
+    try:
+        comparator = DURATION_COMPARATORS[comp_index]
+    except KeyError:
+        raise ValueError(f"Unknown duration comparator: {comp_index}")
+    duration = parse_duration(compstr)
+    return lambda d: comparator(d, duration)
+
+
+def format_duration(duration: int) -> str:
+    return time.strftime("%H:%M:%S", time.gmtime(duration))
+
+
 class RVElement:
     parent: "RVDatabase"
     item: minidom.Element
-    was_downloaded: bool
+    downloaded_filepath: typing.Optional[str]
     watched: bool

     def __init__(self, parent: "RVDatabase", item: minidom.Element) -> None:
         self.parent = parent
         self.item = item
-        self.was_downloaded = False
+        self.downloaded_filepath = None
         self.watched = False

     def get_tag_data(self, tag_name: str) -> str:
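
As a quick reference, here is how the duration helpers added above behave (an illustrative sketch assuming the definitions from this hunk; the sample values are made up):

# Illustration only, using parse_duration/compare_duration/format_duration as defined above.
assert parse_duration("90") == 90            # bare number: seconds
assert parse_duration("30m") == 30 * 60      # "s", "m", "h" suffixes
under_half_hour = compare_duration("<30m")   # "<"/"-", ">"/"+", "=", or no prefix (default: <=)
assert under_half_hour(15 * 60)
assert not under_half_hour(45 * 60)
assert format_duration(3725) == "01:02:05"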
@@ -101,16 +163,25 @@ class RVElement:
         return "ytdl_infos" in self.__dict__

     def salvage_cache(self, cache: "RVElement") -> None:
-        if cache.is_researched:
+        if not self.parent.args.research and cache.is_researched:
             self.__dict__["ytdl_infos"] = cache.__dict__["ytdl_infos"]
             log.debug(f"From cache: {self}")
-        if cache.was_downloaded:
-            self.was_downloaded = True
+        if cache.downloaded_filepath:
+            self.downloaded_filepath = cache.downloaded_filepath
         if cache.watched:
             self.watched = True

     def __str__(self) -> str:
-        return f"{self.guid}: {self.creator} {self.title} {self.link}"
+        str = f"{self.guid}: {self.creator if self.creator else '?'} {self.title}"
+        if self.is_researched:
+            if self.is_video:
+                str += f" ({format_duration(self.duration)})"
+            else:
+                str += " (N/A)"
+        else:
+            str += " (?)"
+        str += f" {self.link}"
+        return str

     @property
     def downloaded(self) -> bool:
@@ -122,21 +193,15 @@ class RVElement:
     def ytdl_infos(self) -> typing.Optional[dict]:
         log.info(f"Researching: {self}")
         try:
-            infos = self.parent.ytdl_dry.extract_info(self.link)
+            infos = self.parent.ytdl_dry.extract_info(self.link, download=False)
         except KeyboardInterrupt as e:
             raise e
-        except youtube_dl.utils.DownloadError as e:
+        except yt_dlp.utils.DownloadError as e:
             # TODO Still raise in case of temporary network issue
             log.warning(e)
             infos = None
-        # Apparently that thing is transformed from a LazyList
-        # somewhere in the normal yt_dlp process
-        if (
-            infos
-            and "thumbnails" in infos
-            and isinstance(infos["thumbnails"], youtube_dl.utils.LazyList)
-        ):
-            infos["thumbnails"] = infos["thumbnails"].exhaust()
+        if infos:
+            infos = self.parent.ytdl_dry.sanitize_info(infos)
         # Save database once it's been computed
         self.__dict__["ytdl_infos"] = infos
         self.parent.save()
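
The sanitize_info() call introduced above is what makes the manual LazyList handling unnecessary: it returns a plain, JSON/pickle-friendly copy of the info dict. Roughly, the dry-run pattern looks like this (sketch only; the URL is a placeholder):

# Sketch: extract metadata without downloading, then make the info dict serializable.
import json
import yt_dlp

with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
    info = ydl.extract_info("https://example.invalid/some-video", download=False)
    info = ydl.sanitize_info(info)  # plain dict, safe to json.dumps/pickle
    print(json.dumps(info)[:80])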
@@ -156,7 +221,8 @@ class RVElement:
     @property
     def filepath(self) -> str:
         assert self.is_video
-        # TODO This doesn't change the extension to mkv when the formats are incomaptible
+        if self.downloaded_filepath:
+            return self.downloaded_filepath
         return self.parent.ytdl_dry.prepare_filename(self.ytdl_infos)

     @property
@@ -168,10 +234,18 @@ class RVElement:
         assert self.is_video
         log.info(f"Downloading: {self}")
         if not self.parent.args.dryrun:
-            self.parent.ytdl.process_ie_result(self.ytdl_infos, True, {})
-            self.was_downloaded = True
+            with yt_dlp.YoutubeDL(self.parent.ytdl_opts) as ydl:
+                ydl.add_post_processor(SaveInfoPP(self))
+                ydl.process_ie_result(self.ytdl_infos, download=True)
         self.parent.save()

+    def update_post_download(self, info: dict) -> None:
+        self.downloaded_filepath = self.parent.ytdl_dry.prepare_filename(info)
+
+    @property
+    def was_downloaded(self) -> bool:
+        return self.downloaded_filepath is not None
+
     def preload(self) -> None:
         assert self.is_video
         if self.downloaded:
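
For context, SaveInfoPP relies on yt-dlp's standard post-processor hook: run() receives the final info dict and returns a (files_to_delete, info) pair. A stripped-down sketch of the same pattern, independent of this script (class name and URL are illustrative):

# Minimal post-processor sketch: capture the final filepath after a download.
import yt_dlp

class CapturePathPP(yt_dlp.postprocessor.common.PostProcessor):
    def __init__(self, store: dict) -> None:
        super().__init__()
        self.store = store

    def run(self, info: dict) -> tuple[list, dict]:
        self.store["filepath"] = info.get("filepath")
        return [], info  # nothing to delete, info passed through unchanged

captured: dict = {}
with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
    ydl.add_post_processor(CapturePathPP(captured))
    # ydl.download(["https://example.invalid/some-video"])  # placeholder URL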
@@ -182,18 +256,8 @@ class RVElement:
             return
         self.download()

-    MATCHES_DURATION_MULTIPLIERS = {"s": 1, "m": 60, "h": 3600, None: 1}
-    MATCHES_DURATION_COMPARATORS = {
-        "<": int.__lt__,
-        "-": int.__lt__,
-        ">": int.__gt__,
-        "+": int.__gt__,
-        "=": int.__eq__,
-        None: int.__le__,
-    }
-
     def matches_filter(self, args: configargparse.Namespace) -> bool:
-        # Inexpensive filters
         if args.seen != "any" and (args.seen == "seen") != self.watched:
             log.debug(f"Not {args.seen}: {self}")
             return False
@@ -206,39 +270,22 @@ class RVElement:
         if args.link and not re.search(args.link, self.link):
             log.debug(f"Link not matching {args.link}: {self}")
             return False
-        if args.creator and (not self.creator or not re.search(args.creator, self.creator)):
+        if args.creator and (
+            not self.creator or not re.search(args.creator, self.creator)
+        ):
             log.debug(f"Creator not matching {args.creator}: {self}")
             return False
-        # Expensive filters
         if not self.is_video:
             log.debug(f"Not a video: {self}")
             return False
-        if args.duration:
-            dur = args.duration
-            mult_index = dur[-1].lower()
-            if mult_index.isdigit():
-                mult_index = None
-            else:
-                dur = dur[:-1]
-            try:
-                multiplier = self.MATCHES_DURATION_MULTIPLIERS[mult_index]
-            except IndexError:
-                raise ValueError(f"Unknown duration multiplier: {mult_index}")
-            comp_index = dur[0]
-            if comp_index.isdigit():
-                comp_index = None
-            else:
-                dur = dur[1:]
-            try:
-                comparator = self.MATCHES_DURATION_COMPARATORS[comp_index]
-            except IndexError:
-                raise ValueError(f"Unknown duration comparator: {comp_index}")
-            duration = int(dur)
-            if not comparator(self.duration, duration * multiplier):
-                log.debug(f"Duration {self.duration} not matching {args.duration}: {self}")
-                return False
+        if args.duration and not compare_duration(args.duration)(self.duration):
+            log.debug(
+                f"Duration {self.duration} not matching {args.duration}: {self}"
+            )
+            return False
         return True

     def watch(self) -> None:
@@ -365,28 +412,60 @@ class RVDatabase:
     @property
     def ytdl_dry_opts(self) -> dict:
         opts = self.ytdl_opts.copy()
-        opts.update({"simulate": True, "quiet": True})
+        opts.update({"quiet": True})
         return opts

     @property
-    def ytdl(self) -> youtube_dl.YoutubeDL:
-        return youtube_dl.YoutubeDL(self.ytdl_opts)
-
-    @property
-    def ytdl_dry(self) -> youtube_dl.YoutubeDL:
-        return youtube_dl.YoutubeDL(self.ytdl_dry_opts)
+    def ytdl_dry(self) -> yt_dlp.YoutubeDL:
+        return yt_dlp.YoutubeDL(self.ytdl_dry_opts)

     def filter(self, args: configargparse.Namespace) -> typing.Iterable[RVElement]:
         elements: typing.Iterable[RVElement]
-        if args.order == "old":
-            elements = self.elements
-        elif args.order == "new":
+        # Inexpensive sort
+        if args.order == "new":
             elements = reversed(self.elements)
+        elif args.order == "title":
+            elements = sorted(self.elements, key=lambda el: el.title)
+        elif args.order == "creator":
+            elements = sorted(self.elements, key=lambda el: el.creator or "")
+        elif args.order == "link":
+            elements = sorted(self.elements, key=lambda el: el.link)
         elif args.order == "random":
             elements_random = self.elements.copy()
             random.shuffle(elements_random)
             elements = elements_random
-        return filter(lambda el: el.matches_filter(args), elements)
+        else:
+            elements = self.elements
+        # Possibly expensive filtering
+        elements = filter(lambda el: el.matches_filter(args), elements)
+        # Expensive sort
+        if args.order == "short":
+            elements = sorted(
+                elements, key=lambda el: el.duration if el.is_video else 0
+            )
+        elif args.order == "long":
+            elements = sorted(
+                elements, key=lambda el: el.duration if el.is_video else 0, reverse=True
+            )
+        # Post sorting filtering
+        if args.total_duration:
+            rem = parse_duration(args.total_duration)
+            old_els = list(elements)
+            elements = list()
+            while rem > 0:
+                for el in old_els:
+                    if el.duration < rem:
+                        elements.append(el)
+                        rem -= el.duration
+                        old_els.remove(el)
+                        break
+                else:
+                    break
+        return elements


 def get_args() -> configargparse.Namespace:
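
The new --total-duration pass at the end of filter() is a greedy first-fit selection: it repeatedly takes the first remaining video that still fits under the leftover budget and stops when nothing fits. A self-contained model of just that loop (toy durations, not from the script):

# Standalone model of the greedy first-fit selection used for --total-duration.
def pick_under_budget(durations: list[int], budget: int) -> list[int]:
    remaining, chosen, pool = budget, [], list(durations)
    while remaining > 0:
        for d in pool:
            if d < remaining:  # strictly under the remaining budget
                chosen.append(d)
                remaining -= d
                pool.remove(d)
                break
        else:  # nothing fits anymore
            break
    return chosen

print(pick_under_budget([40 * 60, 25 * 60, 20 * 60, 10 * 60], 60 * 60))
# [2400, 600]: first fit, not necessarily an optimal packing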
@@ -428,6 +507,17 @@ def get_args() -> configargparse.Namespace:
         env_var="RSS_VIDEOS_FEED",
         required=True,
     )
+    parser.add(
+        "--research",
+        help="Fetch video info again",
+        action="store_true",
+    )
+    parser.add(
+        "--no-refresh",
+        dest="refresh",
+        help="Don't fetch feed",
+        action="store_false",
+    )
     parser.add(
         "--videos",
         help="Directory to store videos",
@@ -438,7 +528,7 @@ def get_args() -> configargparse.Namespace:
     # Which videos
     parser.add(
         "--order",
-        choices=("old", "new", "random"),
+        choices=("old", "new", "title", "creator", "link", "short", "long", "random"),
         default="old",
         help="Sorting mechanism",
     )
@@ -447,7 +537,16 @@ def get_args() -> configargparse.Namespace:
     parser.add("--title", help="Regex to filter by title")
     parser.add("--link", help="Regex to filter by link")
     parser.add("--duration", help="Comparative to filter by duration")
-    parser.add("--seen", choices=("seen","unseen","any"), default="unseen", help="Only include seen/unseen/any videos")
+    parser.add(
+        "--seen",
+        choices=("seen", "unseen", "any"),
+        default="unseen",
+        help="Only include seen/unseen/any videos",
+    )
+    parser.add(
+        "--total-duration",
+        help="Use videos that fit under the total given",
+    )
     # TODO Envrionment variables
     parser.add(
         "--max-duration",
@@ -476,7 +575,15 @@ def get_args() -> configargparse.Namespace:
     parser.add(
         "action",
         nargs="?",
-        choices=("download", "list", "watch", "binge", "clean"),
+        choices=(
+            "download",
+            "list",
+            "watch",
+            "binge",
+            "clean",
+            "seen",
+            "unseen",
+        ),
         default="download",
     )
@@ -497,14 +604,22 @@ def main() -> None:
     database = RVDatabase(args)
     cache = RVDatabase.load()
-    try:
-        database.read_feed()
-    except urllib.error.URLError as err:
-        if args.action == "download" or not cache:
-            raise err
-        else:
-            log.warning("Cannot fetch RSS feed, using cached feed.", err)
+    feed_fetched = False
+    if args.refresh:
+        try:
+            database.read_feed()
+            feed_fetched = True
+        except urllib.error.URLError as err:
+            if args.action == "download":
+                raise RuntimeError("Couldn't fetch feed, refusing to download")
+    # This is a quirky failsafe in case of no internet connection,
+    # so the script doesn't go noting that no element is a video.
+    if not feed_fetched:
+        if cache:
+            log.warning("Using cached feed.")
             database.import_cache(cache)
+        else:
+            raise FileNotFoundError("Feed not fetched and no cached feed.")
     if cache:
         database.salvage_cache(cache)
         database.clean_cache(cache)
@@ -514,7 +629,7 @@ def main() -> None:
     if args.action == "clean":
         database.clean()
     else:
-        database.attempt_clean()
+        duration = 0
         for element in database.filter(args):
             if args.action == "download":
                 element.preload()
@@ -522,8 +637,20 @@ def main() -> None:
                 print(element)
             elif args.action in ("watch", "binge"):
                 element.watch()
                 if args.action == "watch":
                     break
elif args.action == "seen":
if not element.watched:
log.info(f"Maked as seen: {element}")
element.watched = True
elif args.action == "unseen":
if element.watched:
log.info(f"Maked as unseen: {element}")
element.watched = False
+            else:
+                raise NotImplementedError(f"Unimplemented action: {args.action}")
+            duration += element.duration if element.is_video else 0
+        log.info(f"Total duration: {format_duration(duration)}")
     database.attempt_clean()
     database.save()