dotfiles/config/scripts/rssVideos

#!/usr/bin/env python3

"""
Script that downloads videos that are linked as articles
in an RSS feed.
The common use case would be a feed from an RSS aggregator
with the unread items (non-video links are ignored).
"""

# TODO Distribute this correctly, in the meantime please do
# pip install --user yt-dlp ConfigArgParse

# TODO Better logging (youtube-dl allows passing loggers)

import sys
import urllib.request
import urllib.parse
import os
from xml.dom import minidom
import yt_dlp as youtube_dl
import configargparse


def get_args() -> configargparse.Namespace:
    """
    Build the argument parser and return the parsed settings.

    Options can come from the command line, from the environment
    variable named on each option, or from a configuration file
    (by default ``rssVideos`` in the user configuration directory).

    After parsing, ``videos`` is turned into an absolute real path and
    ``track`` is resolved relative to ``videos`` when it is not absolute.
    """
    # The XDG Base Directory specification names this variable
    # XDG_CONFIG_HOME. The historical XDG_CONFIG_PATH keeps precedence
    # so that existing setups relying on it behave exactly as before.
    # An empty value is treated as unset.
    configHome = (
        os.getenv("XDG_CONFIG_PATH") or os.getenv("XDG_CONFIG_HOME") or "~/.config/"
    )
    defaultConfigPath = os.path.join(os.path.expanduser(configHome), "rssVideos")

    parser = configargparse.ArgParser(
        description="Download videos linked in "
        "a RSS feed (e.g. an unread feed from "
        "an RSS aggregator)",
        default_config_files=[defaultConfigPath],
    )
    parser.add(
        "-c", "--config", required=False, is_config_file=True, help="Configuration file"
    )
    parser.add(
        "--feed",
        help="URL of the RSS feed (must be public for now)",
        env_var="RSS_VIDEOS_FEED",
        required=True,
    )
    parser.add(
        "--videos",
        help="Directory to store videos",
        env_var="RSS_VIDEOS_VIDEO_DIR",
        required=True,
    )
    parser.add(
        "-n",
        "--dryrun",
        help="Do not download the videos",
        action="store_true",
    )
    # TODO This feature might require additional documentation and an on/off switch
    parser.add(
        "--track",
        help="Directory where download videos are marked "
        "to not download them after deletion.",
        env_var="RSS_VIDEOS_TRACK",
        required=False,
        default=".rssVideos",
    )
    parser.add(
        "--max-duration",
        help="Skip video longer than this amount of seconds",
        env_var="RSS_VIDEOS_MAX_DURATION",
        type=int,
        default=0,
    )
    parser.add(
        "--format",
        help="Use this format to download videos."
        " See FORMAT SELECTION in youtube-dl(1)",
        env_var="RSS_VIDEOS_FORMAT",
        default="bestvideo+bestaudio/best",
    )
    parser.add(
        "--subtitles",
        help="Download all subtitles",
        env_var="RSS_VIDEOS_SUBTITLES",
        action="store_true",
    )

    args = parser.parse_args()
    args.videos = os.path.realpath(os.path.expanduser(args.videos))
    # A relative tracking directory lives inside the videos directory
    args.track = os.path.expanduser(args.track)
    if not os.path.isabs(args.track):
        args.track = os.path.realpath(os.path.join(args.videos, args.track))

    return args


def get_links(args: configargparse.Namespace) -> list[str]:
    """
    Read the feed XML, get the links

    The document at ``args.feed`` is fetched and parsed; for every
    ``<item>`` element the text of its first ``<link>`` element is
    collected. Links are returned in feed order, without duplicates
    and without surrounding whitespace. Items with no usable link are
    reported on standard output and skipped.
    """
    links: list[str] = []
    # Companion set so the duplicate check does not rescan the whole list
    seen: set[str] = set()
    with urllib.request.urlopen(args.feed) as request:
        with minidom.parse(request) as xmldoc:
            for item in xmldoc.getElementsByTagName("item"):
                try:
                    linkNode = item.getElementsByTagName("link")[0]
                    link: str = linkNode.childNodes[0].data.strip()
                except (IndexError, AttributeError) as e:
                    # IndexError: no <link> element, or an empty one
                    # AttributeError: the first child carries no text data
                    # Anything else (e.g. KeyboardInterrupt) must propagate
                    print("Error while getting link from item:", e)
                    continue
                if not link:
                    print("Error while getting link from item:", "empty link")
                    continue
                if link not in seen:
                    seen.add(link)
                    links.append(link)
    return links


def get_video_infos(
    args: configargparse.Namespace, ydl_opts: dict, links: list[str]
) -> dict[str, dict]:
    """
    Filter out non-video links and store video download info
    and associated filename

    Each link is passed to the downloader in simulation mode. Links for
    which information extraction raises an error are reported and
    skipped. When ``args.max_duration`` is positive, videos longer than
    that many seconds, or whose duration is unknown, are skipped too.

    Returns a mapping from the extension-less output filename to the
    information dictionary returned by the downloader.
    """
    videosInfos: dict[str, dict] = {}

    # Same options as the real download, but simulated and quiet
    dry_ydl_opts = {**ydl_opts, "simulate": True, "quiet": True}
    with youtube_dl.YoutubeDL(dry_ydl_opts) as ydl:
        for link in links:
            print(f"Researching {link}...")
            try:
                infos = ydl.extract_info(link)
                if args.max_duration > 0:
                    # Not every result carries a duration; an unknown
                    # duration cannot be proven to be under the limit
                    duration = infos.get("duration")
                    if duration is None:
                        print(f"{infos['title']}: Skipping as duration is unknown")
                        continue
                    if duration > args.max_duration:
                        print(
                            f"{infos['title']}: Skipping as longer than max duration: "
                            f"{duration} > {args.max_duration}"
                        )
                        continue
                filepath = ydl.prepare_filename(infos)
                # The extension is dropped: the key identifies the video,
                # whatever container it ends up in
                filename = os.path.splitext(filepath)[0]
                videosInfos[filename] = infos
                print(f"{infos['title']}: Added")

            except Exception as e:
                # Best effort: report and move on to the next link.
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                print(e)
                continue

    return videosInfos


def get_downloaded_videos(
    args: configargparse.Namespace, videosInfos: dict[str, dict]
) -> tuple[set[str], set[str]]:
    """
    Read the directory content, delete everything that's not a
    video on the download list or already downloaded

    Only regular files directly inside ``args.videos`` are considered.

    Returns a pair ``(downloaded, partially downloaded)`` of sets of
    keys of ``videosInfos``.
    """
    videosDownloaded: set[str] = set()
    videosPartiallyDownloaded: set[str] = set()

    for filepath in os.listdir(args.videos):
        fullpath = os.path.join(args.videos, filepath)
        if not os.path.isfile(fullpath):
            continue
        filename, extension = os.path.splitext(filepath)

        if filename in videosInfos:
            onlineFilename = filename
        else:
            # Prefer the longest matching name, so that a video whose name
            # is a prefix of another one does not claim the other's files
            onlineFilename = max(
                (name for name in videosInfos if filename.startswith(name)),
                key=len,
                default=None,
            )

        # Unrelated filename: delete
        if onlineFilename is None:
            print(f"Deleting: {filename}")
            os.unlink(fullpath)
            continue

        # Subtitle file
        # → ignore
        # (tested on the extension: `filename` no longer contains it)
        if extension == ".vtt":
            continue

        if filename == onlineFilename:
            # Full name already there: completely downloaded
            # → remove from the download list
            videosDownloaded.add(onlineFilename)
        else:
            # Partial name already there: not completely downloaded
            # → keep on the download list
            videosPartiallyDownloaded.add(onlineFilename)

    return videosDownloaded, videosPartiallyDownloaded


def get_tracked_videos(args: configargparse.Namespace, known: set[str]) -> set[str]:
    """
    List the tracking markers that correspond to a known (unread) video.

    Markers are regular files in the tracking directory, named after
    the video and carrying no extension. A marker whose name is not in
    ``known`` is removed from disk. Entries that are not regular files
    are left alone.
    """
    stillTracked: set[str] = set()

    for marker in os.listdir(args.track):
        markerPath = os.path.join(args.track, marker)
        if os.path.isfile(markerPath):
            # The marker name is the video name itself (no extension)
            if marker not in known:
                os.unlink(markerPath)
            else:
                stillTracked.add(marker)

    return stillTracked


def main() -> None:
    """
    Run the whole process: read the feed, look up the videos, clean
    the videos directory and download what is missing.

    The process exits with status 1 if at least one download failed,
    0 otherwise.
    """

    args = get_args()

    os.makedirs(args.videos, exist_ok=True)
    os.makedirs(args.track, exist_ok=True)
    ydl_opts = {
        "format": args.format,
        # yt-dlp documents `allsubtitles` as requiring `writesubtitles`
        # (or `writeautomaticsub`) to actually write subtitle files
        "writesubtitles": args.subtitles,
        "allsubtitles": args.subtitles,
    }

    print("→ Retrieveing RSS feed")
    links = get_links(args)
    # Oldest first
    links = links[::-1]

    print(f"→ Getting infos on {len(links)} unread articles")
    videosInfos = get_video_infos(args, ydl_opts, links)

    print(f"→ Deciding on what to do for {len(videosInfos)} videos")
    videosDownloaded, videosPartiallyDownloaded = get_downloaded_videos(
        args, videosInfos
    )
    videosTracked = get_tracked_videos(args, set(videosInfos))

    # Deciding for the rest based on the informations

    def markTracked(filename: str) -> None:
        # Create the empty marker file for this video if it does not exist yet
        markerPath = os.path.join(args.track, filename)
        with open(markerPath, "a"):
            pass

    videosToDownload: set[str] = set()
    for onlineFilename in videosInfos:
        # If the video was once downloaded but manually deleted,
        # the marker should be left
        if onlineFilename in videosTracked:
            print(f"Should be marked as read: {onlineFilename}")
            # TODO Automatically do that one day maybe?
            # Need to login to the FreshRSS API and keep track of
            # the item id along the process
        elif onlineFilename in videosDownloaded:
            markTracked(onlineFilename)
            print(f"Already downloaded: {onlineFilename}")
        else:
            if onlineFilename in videosPartiallyDownloaded:
                print(f"Will be continued: {onlineFilename}")
            else:
                print(f"Will be downloaded: {onlineFilename}")
            videosToDownload.add(onlineFilename)

    # Download the missing videos
    print(f"→ Downloading {len(videosToDownload)} videos")

    # The filenames computed earlier are relative, so download from
    # inside the videos directory
    os.chdir(args.videos)

    exit_code = 0
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        # Iterate over videosInfos (not the set) to keep the feed order
        for onlineFilename, infos in videosInfos.items():
            if onlineFilename not in videosToDownload:
                continue

            # Really download
            if args.dryrun:
                print(f"Would download {onlineFilename}")
                continue

            # Apparently that thing is transformed from a LazyList
            # somewhere in the normal yt_dlp process
            # (not every result has thumbnails, hence the .get)
            thumbnails = infos.get("thumbnails")
            if isinstance(thumbnails, youtube_dl.utils.LazyList):
                infos["thumbnails"] = thumbnails.exhaust()
            try:
                ydl.process_ie_result(infos, True, {})

                markTracked(onlineFilename)
            except Exception as e:
                # A failed download must not prevent the next ones;
                # KeyboardInterrupt / SystemExit are deliberately not caught
                print(e)
                exit_code = 1
                continue

    sys.exit(exit_code)


# Run the downloader only when this file is executed as a script,
# not when it is imported as a module
if __name__ == "__main__":
    main()
No results found.