#!/usr/bin/env python3
"""
Script that download videos that are linked as an article
in a RSS feed.
The common use case would be a feed from an RSS aggregator
with the unread items (non-video links are ignored).
"""
# TODO Distribute this correclty, in the meanwhile please do
# pip install --user youtube-dl ConfigArgParse progressbar2
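# Example invocation (sketch only; the feed URL and directory are placeholders):
#   rssVideos --feed https://reader.example.com/unread.rss --videos ~/Videos/rss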
# TODO Better logging (youtube-dl allows passing loggers)

import sys
from typing import Dict, Set, Tuple

import urllib.request
import urllib.parse
import os
from xml.dom import minidom

import youtube_dl
import configargparse


def get_args() -> configargparse.Namespace:
    """
    Parse the command line arguments, environment variables
    and configuration file.
    """
    defaultConfigPath = os.path.join(
        os.path.expanduser(os.getenv("XDG_CONFIG_PATH", "~/.config/")), "rssVideos"
    )
    parser = configargparse.ArgParser(
        description="Download videos linked in "
        + "an RSS feed (e.g. an unread feed from "
        + "an RSS aggregator)",
        default_config_files=[defaultConfigPath],
    )
    parser.add(
        "-c", "--config", required=False, is_config_file=True, help="Configuration file"
    )
    parser.add(
        "--feed",
        help="URL of the RSS feed (must be public for now)",
        env_var="RSS_VIDEOS_FEED",
        required=True,
    )
    parser.add(
        "--videos",
        help="Directory to store videos",
        env_var="RSS_VIDEOS_VIDEO_DIR",
        required=True,
    )
    parser.add(
        "-n",
        "--dryrun",
        help="Do not download the videos",
        action="store_const",
        const=True,
        default=False,
    )
    # TODO This feature might require additional documentation and an on/off switch
    parser.add(
        "--track",
        help="Directory where downloaded videos are marked, "
        + "so they are not downloaded again after deletion.",
        env_var="RSS_VIDEOS_TRACK",
        required=False,
        default=".rssVideos",
    )
    parser.add(
        "--max-duration",
        help="Skip videos longer than this many seconds",
        env_var="RSS_VIDEOS_MAX_DURATION",
        type=int,
        default=0,
    )
    parser.add(
        "--format",
        help="Use this format to download videos."
        + " See FORMAT SELECTION in youtube-dl(1)",
        env_var="RSS_VIDEOS_FORMAT",
        default="bestvideo+bestaudio/best",
    )
    parser.add(
        "--subtitles",
        help="Download all subtitles",
        env_var="RSS_VIDEOS_SUBTITLES",
        action="store_true",
    )

    args = parser.parse_args()
    args.videos = os.path.realpath(os.path.expanduser(args.videos))
    args.track = os.path.expanduser(args.track)
    if not os.path.isabs(args.track):
        args.track = os.path.realpath(os.path.join(args.videos, args.track))

    return args
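

# The configuration file read by get_args() (by default ~/.config/rssVideos)
# uses ConfigArgParse's plain "key = value" syntax. A minimal sketch with
# placeholder values:
#   feed = https://reader.example.com/unread.rss
#   videos = ~/Videos/rss
#   max-duration = 7200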


def get_links(args: configargparse.Namespace) -> Set[str]:
    """
    Read the feed XML and return the links of its items.
    """
    links = set()
    with urllib.request.urlopen(args.feed) as request:
        with minidom.parse(request) as xmldoc:
            for item in xmldoc.getElementsByTagName("item"):
                try:
                    linkNode = item.getElementsByTagName("link")[0]
                    link: str = linkNode.childNodes[0].data
                    links.add(link)
                except BaseException as e:
                    print("Error while getting link from item:", e)
                    continue
    return links


def get_video_infos(
    args: configargparse.Namespace, ydl_opts: Dict, links: Set[str]
) -> Dict[str, Dict]:
    """
    Filter out non-video links and store the video download information
    and the associated filename.
    """
    videosInfos = dict()

    dry_ydl_opts = ydl_opts.copy()
    dry_ydl_opts.update({"simulate": True, "quiet": True})
    with youtube_dl.YoutubeDL(dry_ydl_opts) as ydl:
        for link in links:
            print(f"Researching {link}...")
            try:
                infos = ydl.extract_info(link)
                if args.max_duration > 0 and infos["duration"] > args.max_duration:
                    print(
                        f"{infos['title']}: Skipping as longer than max duration: "
                        f"{infos['duration']} > {args.max_duration}"
                    )
                    continue
                filepath = ydl.prepare_filename(infos)
                filename, extension = os.path.splitext(filepath)
                videosInfos[filename] = infos
                print(f"{infos['title']}: Added")
            except BaseException as e:
                print(e)
                continue

    return videosInfos
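

# Note: the keys of the dict returned above are youtube-dl's target file names
# without their extension; with youtube-dl's default output template this
# looks like "Some Title-VIDEOID" (a hypothetical example).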


def get_downloaded_videos(
    args: configargparse.Namespace, videosInfos: Dict[str, Dict]
) -> Tuple[Set[str], Set[str]]:
    """
    Read the videos directory content and delete everything that is neither
    a video from the download list nor an already downloaded video.
    Return the set of completely downloaded videos and the set of partially
    downloaded ones.
    """
    videosDownloaded = set()
    videosPartiallyDownloaded = set()

    for filepath in os.listdir(args.videos):
        fullpath = os.path.join(args.videos, filepath)
        if not os.path.isfile(fullpath):
            continue
        filename, extension = os.path.splitext(filepath)

        for onlineFilename in videosInfos.keys():
            # Full name already there: completely downloaded
            # → remove from the download list
            if filename == onlineFilename:
                videosDownloaded.add(onlineFilename)
                break
            elif filename.startswith(onlineFilename):
                # Subtitle file
                # → ignore
                if filepath.endswith(".vtt"):
                    break
                # Partial name already there: not completely downloaded
                # → keep on the download list
                videosPartiallyDownloaded.add(onlineFilename)
                break
        # Unrelated filename: delete
        else:
            print(f"Deleting: {filename}")
            os.unlink(fullpath)

    return videosDownloaded, videosPartiallyDownloaded


def get_tracked_videos(args: configargparse.Namespace, known: Set[str]) -> Set[str]:
    """
    Return the videos previously downloaded (i.e. tracked) amongst the unread videos.
    This is stored in the tracking directory as empty, extension-less files.
    Other tracking markers (e.g. for videos that are now read) are deleted.
    """
    videosTracked = set()

    for filepath in os.listdir(args.track):
        fullpath = os.path.join(args.track, filepath)
        if not os.path.isfile(fullpath):
            continue
        # Here the file name is the marker name itself, as markers have no extension
        if filepath in known:
            videosTracked.add(filepath)
        else:
            os.unlink(fullpath)

    return videosTracked
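

# Note: the tracking directory (by default ".rssVideos" inside the videos
# directory) ends up holding one empty marker file per downloaded video,
# named after the video's extension-less filename.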


def main() -> None:
    args = get_args()
    os.makedirs(args.videos, exist_ok=True)
    os.makedirs(args.track, exist_ok=True)
    ydl_opts = {"format": args.format, "allsubtitles": args.subtitles}

    print("→ Retrieving RSS feed")
    links = get_links(args)

    print(f"→ Getting info on {len(links)} unread articles")
    videosInfos = get_video_infos(args, ydl_opts, links)

    print(f"→ Deciding on what to do for {len(videosInfos)} videos")
    videosDownloaded, videosPartiallyDownloaded = get_downloaded_videos(
        args, videosInfos
    )
    videosTracked = get_tracked_videos(args, set(videosInfos.keys()))

    # Decide for the rest based on the gathered information
    def markTracked(filename: str) -> None:
        markerPath = os.path.join(args.track, filename)
        open(markerPath, "a").close()

    videosToDownload: Set[str] = set()
    videosReads: Set[str] = set()
    for onlineFilename in videosInfos.keys():
        # If the video was once downloaded but manually deleted,
        # the marker should be left
        if onlineFilename in videosTracked:
            print(f"Should be marked as read: {onlineFilename}")
            # TODO Automatically do that one day maybe?
            # Would need to log in to the FreshRSS API and keep track of
            # the item id along the process
            videosReads.add(onlineFilename)
        elif onlineFilename in videosDownloaded:
            markTracked(onlineFilename)
            print(f"Already downloaded: {onlineFilename}")
        else:
            if onlineFilename in videosPartiallyDownloaded:
                print(f"Will be continued: {onlineFilename}")
            else:
                print(f"Will be downloaded: {onlineFilename}")
            videosToDownload.add(onlineFilename)

    # Download the missing videos
    print(f"→ Downloading {len(videosToDownload)} videos")
    os.chdir(args.videos)

    exit_code = 0
    if not args.dryrun:
        # TODO Progressbar one day maybe?
        # We have all the info we need to make a reliable one
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            for onlineFilename in videosToDownload:
                infos = videosInfos[onlineFilename]

                # Really download
                try:
                    ydl.process_ie_result(infos, True, {})
                    markTracked(onlineFilename)
                except BaseException as e:
                    print(e)
                    exit_code = 1
                    continue

    sys.exit(exit_code)


if __name__ == "__main__":
    main()