dotfiles/config/scripts/rssVideos

#!/usr/bin/env python3

"""
Script that download videos that are linked as an article
in a RSS feed.
The common use case would be a feed from an RSS aggregator
with the unread items (non-video links are ignored).
"""

# TODO Distribute this correctly, in the meantime please run
# pip install --user youtube-dl ConfigArgParse progressbar2

# TODO Allow to specify youtube_dl options (e.g. subtitles)
# TODO Restrict quality (it's not that I don't like 8GB 4K videos but...)

from typing import Dict, Set
import urllib.request
import urllib.parse
import os
from xml.dom import minidom
import youtube_dl
import configargparse


if __name__ == "__main__":

    # Locate the default configuration file.
    # The XDG Base Directory specification names the variable
    # XDG_CONFIG_HOME; XDG_CONFIG_PATH is what this script historically
    # read, so that location is still looked up for backward compatibility.
    legacyConfigPath = os.path.join(os.path.expanduser(
        os.getenv('XDG_CONFIG_PATH', '~/.config/')), 'rssVideos')
    defaultConfigPath = os.path.join(os.path.expanduser(
        os.getenv('XDG_CONFIG_HOME', '~/.config/')), 'rssVideos')
    # Default config files are parsed in order, values from later files
    # taking precedence, so the standard location wins over the legacy one
    defaultConfigFiles = [legacyConfigPath]
    if defaultConfigPath != legacyConfigPath:
        defaultConfigFiles.append(defaultConfigPath)

    parser = configargparse.ArgParser(description="Download videos linked in " +
                                      "a RSS feed (e.g. an unread feed from " +
                                      "an RSS aggregator)",
                                      default_config_files=defaultConfigFiles)
    parser.add('-c', '--config', required=False, is_config_file=True,
               help='Configuration file')
    parser.add('--feed', help='URL of the RSS feed (must be public for now)',
               env_var='RSS_VIDEOS_FEED', required=True)
    parser.add('--videos', help='Directory to store videos',
               env_var='RSS_VIDEOS_VIDEO_DIR', required=True)
    parser.add('-n', '--dryrun', help='Do not download the videos',
               action='store_true')
    # TODO This feature might require additional documentation and an on/off switch
    parser.add('--track', help='Directory where downloaded videos are marked (so they are not downloaded twice)',
               env_var='RSS_VIDEOS_TRACK', required=False, default='.rssVideos')

    args = parser.parse_args()

    # Normalize both directories to absolute paths.
    # A relative tracking directory is resolved against the video directory.
    args.videos = os.path.realpath(os.path.expanduser(args.videos))
    args.track = os.path.expanduser(args.track)
    if not os.path.isabs(args.track):
        args.track = os.path.realpath(os.path.join(args.videos, args.track))

    # Both directories are created on first use
    os.makedirs(args.videos, exist_ok=True)
    os.makedirs(args.track, exist_ok=True)

    # Read the feed XML and collect the link of every item
    print("→ Retrieveing RSS feed")

    links: Set[str] = set()
    with urllib.request.urlopen(args.feed) as request:
        with minidom.parse(request) as xmldoc:
            for item in xmldoc.getElementsByTagName('item'):
                try:
                    linkNode = item.getElementsByTagName('link')[0]
                    link: str = linkNode.childNodes[0].data
                # IndexError: the item has no <link>, or the <link> is empty
                # AttributeError: the first child of <link> carries no data
                # Anything else (e.g. KeyboardInterrupt) must not be swallowed
                except (IndexError, AttributeError) as e:
                    print("Error while getting link from item:", e)
                    continue
                links.add(link)

    # Filter out non-video links and store video download info
    # and associated filename
    print(f"→ Getting infos on {len(links)} unread articles")

    # Maps the output filename without its extension to the
    # information object returned by youtube_dl for that video
    videosInfos: Dict[str, dict] = {}

    # Only gather information at this stage, nothing is downloaded
    ydl_opts = {
        "simulate": True,
        "quiet": True
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        for link in links:
            print(f"Researching {link}...")
            try:
                infos = ydl.extract_info(link)
                filepath = ydl.prepare_filename(infos)
                filename, _ = os.path.splitext(filepath)
                videosInfos[filename] = infos
            # Links that cannot be handled are reported and ignored.
            # Exception (not BaseException) so that Ctrl-C still aborts
            # the script instead of merely skipping the current link.
            except Exception as e:
                print(e)
                continue

    # Read the directory content, delete everything that's not a
    # video on the download list or already downloaded
    # NOTE: this clean-up is not conditioned on --dryrun
    print(f"→ Deciding on what to do for {len(videosInfos)} videos")

    # Getting information on the video directory

    videosDownloaded: Set[str] = set()
    videosPartiallyDownloaded: Set[str] = set()

    for filepath in os.listdir(args.videos):
        fullpath = os.path.join(args.videos, filepath)
        if not os.path.isfile(fullpath):
            continue
        filename, _ = os.path.splitext(filepath)

        # Full name already there: completely downloaded → remove from the
        # download list. Tested before any prefix so that the result does
        # not depend on the iteration order of videosInfos when one video
        # name happens to be the prefix of another.
        if filename in videosInfos:
            videosDownloaded.add(filename)
            continue

        # Partial name already there: not completely downloaded → keep on
        # the download list. The longest matching name is the most specific.
        prefixes = [name for name in videosInfos if filename.startswith(name)]
        if prefixes:
            videosPartiallyDownloaded.add(max(prefixes, key=len))
            continue

        # Unrelated filename: delete
        print(f"Deleting: {filename}")
        os.unlink(fullpath)

    # Inspect the tracking directory

    # Videos that were once downloaded using this tool
    # and whose name is still among the keys of videosInfos
    videosTracked: Set[str] = set()

    for marker in os.listdir(args.track):
        markerFile = os.path.join(args.track, marker)
        # Marker names carry no extension, so they are compared
        # directly with the keys of videosInfos
        if os.path.isfile(markerFile):
            if marker in videosInfos:
                videosTracked.add(marker)
            else:
                # No matching video in the current infos: drop the marker
                os.unlink(markerFile)

    # Decide what to do with each video based on the information gathered


    def markTracked(filename):
        """
        Record a video as downloaded by creating an empty marker
        file named after it in the tracking directory.

        filename: name of the video without extension, i.e. one of
        the keys of videosInfos.
        """
        # Use the argument, not whatever the global loop variable
        # onlineFilename currently holds
        markerPath = os.path.join(args.track, filename)
        # Append mode creates the marker when missing and leaves an
        # existing one untouched
        with open(markerPath, 'a'):
            pass


    # Sort every video of the feed into one of the outcomes below
    videosToDownload: Set[str] = set()
    videosReads: Set[str] = set()
    for onlineFilename in videosInfos:
        # A marker without the matching file means the video was
        # downloaded once and then manually deleted: leave the marker
        # in place and do not download it again
        if onlineFilename in videosTracked:
            print(f"Should be marked as read: {onlineFilename}")
            # TODO Automatically do that one day maybe?
            # Need to login to the FreshRSS API and keep track of
            # the item id along the process
            videosReads.add(onlineFilename)
            continue

        # File fully present but not tracked yet: only add the marker
        if onlineFilename in videosDownloaded:
            markTracked(onlineFilename)
            print(f"Already downloaded: {onlineFilename}")
            continue

        # Everything else has to be fetched, from scratch or resumed
        if onlineFilename in videosPartiallyDownloaded:
            plan = f"Will be continued: {onlineFilename}"
        else:
            plan = f"Will be downloaded: {onlineFilename}"
        print(plan)
        videosToDownload.add(onlineFilename)

    # Download the missing videos
    print(f"→ Downloading {len(videosToDownload)} videos")

    # Presumably so that youtube_dl writes its files into the video
    # directory — TODO confirm against the output template in use
    os.chdir(args.videos)

    if not args.dryrun:
        # TODO Progressbar one day maybe?
        # We have all the info we need to make a reliable one
        ydl_opts = {
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            for onlineFilename in videosToDownload:
                infos = videosInfos[onlineFilename]

                # Really download, reusing the info gathered earlier
                try:
                    ydl.process_ie_result(infos, True, {})

                    # Only reached when the download did not raise
                    markTracked(onlineFilename)
                # A failing video must not prevent the next ones from
                # being downloaded, but the failure is reported.
                # Exception (not a bare except) so that Ctrl-C still
                # aborts the whole run.
                except Exception as e:
                    print(f"Failed to download {onlineFilename}: {e}")
                    continue
No results found.