#!/usr/bin/env python3
"""
Script that downloads videos that are linked as articles in a RSS feed.

The common use case would be a feed from an RSS aggregator with the unread
items (non-video links are ignored).
"""

# TODO Distribute this correctly, in the meanwhile please do
# pip install --user yt-dlp ConfigArgParse

# TODO Better logging (youtube-dl allows to pass loggers)

import os
import sys
import urllib.parse
import urllib.request
from xml.dom import minidom

import configargparse
import yt_dlp as youtube_dl


def get_args() -> configargparse.Namespace:
    """Parse arguments from the command line, environment and config file.

    Normalizes ``videos`` to an absolute path and resolves ``track``
    relative to ``videos`` when it is not absolute.
    """
    defaultConfigPath = os.path.join(
        os.path.expanduser(os.getenv("XDG_CONFIG_PATH", "~/.config/")), "rssVideos"
    )

    parser = configargparse.ArgParser(
        description="Download videos linked in "
        + "a RSS feed (e.g. an unread feed from "
        + "an RSS aggregator)",
        default_config_files=[defaultConfigPath],
    )
    parser.add(
        "-c", "--config", required=False, is_config_file=True, help="Configuration file"
    )
    parser.add(
        "--feed",
        help="URL of the RSS feed (must be public for now)",
        env_var="RSS_VIDEOS_FEED",
        required=True,
    )
    parser.add(
        "--videos",
        help="Directory to store videos",
        env_var="RSS_VIDEOS_VIDEO_DIR",
        required=True,
    )
    parser.add(
        "-n",
        "--dryrun",
        help="Do not download the videos",
        action="store_const",
        const=True,
        default=False,
    )
    # TODO This feature might require additional documentation and an on/off switch
    parser.add(
        "--track",
        help="Directory where download videos are marked "
        + "to not download them after deletion.",
        env_var="RSS_VIDEOS_TRACK",
        required=False,
        default=".rssVideos",
    )
    parser.add(
        "--max-duration",
        help="Skip video longer than this amount of seconds",
        env_var="RSS_VIDEOS_MAX_DURATION",
        type=int,
        default=0,
    )
    parser.add(
        "--format",
        help="Use this format to download videos." + " See FORMAT SELECTION in youtube-dl(1)",
        env_var="RSS_VIDEOS_FORMAT",
        default="bestvideo+bestaudio/best",
    )
    parser.add(
        "--subtitles",
        help="Download all subtitles",
        env_var="RSS_VIDEOS_SUBTITLES",
        action="store_true",
    )

    args = parser.parse_args()
    args.videos = os.path.realpath(os.path.expanduser(args.videos))
    args.track = os.path.expanduser(args.track)
    if not os.path.isabs(args.track):
        args.track = os.path.realpath(os.path.join(args.videos, args.track))

    return args


def get_links(args: configargparse.Namespace) -> list[str]:
    """
    Read the feed XML, return the list of article links (oldest last),
    deduplicated while preserving order.
    """
    links: list[str] = []
    with urllib.request.urlopen(args.feed) as request:
        with minidom.parse(request) as xmldoc:
            for item in xmldoc.getElementsByTagName("item"):
                try:
                    linkNode = item.getElementsByTagName("link")[0]
                    link: str = linkNode.childNodes[0].data
                    if link not in links:
                        links.append(link)
                # Narrowed from BaseException so Ctrl-C still works
                except Exception as e:
                    print("Error while getting link from item:", e)
                    continue
    return links


def get_video_infos(
    args: configargparse.Namespace, ydl_opts: dict, links: list[str]
) -> dict[str, dict]:
    """
    Filter out non-video links and store video download info
    and associated filename (extension-less) as the key.
    """
    videosInfos: dict[str, dict] = {}

    dry_ydl_opts = ydl_opts.copy()
    dry_ydl_opts.update({"simulate": True, "quiet": True})
    with youtube_dl.YoutubeDL(dry_ydl_opts) as ydl:
        for link in links:
            print(f"Researching {link}...")
            try:
                infos = ydl.extract_info(link)
                # Some entries (e.g. live streams) have no duration;
                # only enforce the limit when a duration is known.
                duration = infos.get("duration")
                if args.max_duration > 0 and duration and duration > args.max_duration:
                    print(
                        f"{infos['title']}: Skipping as longer than max duration: "
                        f"{duration} > {args.max_duration}"
                    )
                    continue

                filepath = ydl.prepare_filename(infos)
                filename, extension = os.path.splitext(filepath)

                videosInfos[filename] = infos
                print(f"{infos['title']}: Added")

            # Non-video links make yt-dlp raise: skip them but stay
            # interruptible (was BaseException, which ate Ctrl-C)
            except Exception as e:
                print(e)
                continue

    return videosInfos


def get_downloaded_videos(
    args: configargparse.Namespace, videosInfos: dict[str, dict]
) -> tuple[set[str], set[str]]:
    """
    Read the videos directory content, deleting everything that's not a
    video on the download list or already downloaded.

    Returns (fully downloaded, partially downloaded) sets of
    extension-less filenames.
    """
    videosDownloaded: set[str] = set()
    videosPartiallyDownloaded: set[str] = set()

    for filepath in os.listdir(args.videos):

        fullpath = os.path.join(args.videos, filepath)
        if not os.path.isfile(fullpath):
            continue
        filename, extension = os.path.splitext(filepath)

        for onlineFilename in videosInfos.keys():
            # Full name already there: completely downloaded
            # → remove from the download list
            if filename == onlineFilename:
                videosDownloaded.add(onlineFilename)
                break
            elif filename.startswith(onlineFilename):
                # Subtitle file
                # → ignore
                if filename.endswith(".vtt"):
                    break

                # Partial name already there: not completely downloaded
                # → keep on the download list
                videosPartiallyDownloaded.add(onlineFilename)
                break
        # Unrelated filename: delete
        else:
            # Was a broken f-string that printed "(unknown)" instead of
            # the actual file being removed
            print(f"Deleting: {filepath}")
            os.unlink(fullpath)

    return videosDownloaded, videosPartiallyDownloaded


def get_tracked_videos(args: configargparse.Namespace, known: set[str]) -> set[str]:
    """
    Return videos previously downloaded (=tracked) amongst the unread videos.
    This is stored in the tracking directory as empty extension-less files.
    Other tracking markers (e.g. for now read videos) are deleted.
    """
    videosTracked: set[str] = set()

    for filepath in os.listdir(args.track):
        fullpath = os.path.join(args.track, filepath)
        if not os.path.isfile(fullpath):
            continue
        # Here filename is a filepath as no extension

        if filepath in known:
            videosTracked.add(filepath)
        else:
            os.unlink(fullpath)

    return videosTracked


def main() -> None:
    args = get_args()

    os.makedirs(args.videos, exist_ok=True)
    os.makedirs(args.track, exist_ok=True)
    ydl_opts = {"format": args.format, "allsubtitles": args.subtitles}

    print("→ Retrieving RSS feed")
    links = get_links(args)
    # Oldest first
    links = links[::-1]

    print(f"→ Getting infos on {len(links)} unread articles")
    videosInfos = get_video_infos(args, ydl_opts, links)

    print(f"→ Deciding on what to do for {len(videosInfos)} videos")
    videosDownloaded, videosPartiallyDownloaded = get_downloaded_videos(
        args, videosInfos
    )
    videosTracked = get_tracked_videos(args, set(videosInfos.keys()))

    # Deciding for the rest based on the informations

    def markTracked(filename: str) -> None:
        # Create an empty marker file named after the video.
        # Fixed: previously used the enclosing loop variable
        # `onlineFilename` instead of the `filename` parameter.
        markerPath = os.path.join(args.track, filename)
        with open(markerPath, "a"):
            pass

    videosToDownload: set[str] = set()
    videosReads: set[str] = set()
    for onlineFilename in videosInfos.keys():
        # If the video was once downloaded but manually deleted,
        # the marker should be left
        if onlineFilename in videosTracked:
            print(f"Should be marked as read: {onlineFilename}")
            # TODO Automatically do that one day maybe?
            # Need to login to the FreshRSS API and keep track of
            # the item id along the process
            videosReads.add(onlineFilename)
        elif onlineFilename in videosDownloaded:
            markTracked(onlineFilename)
            print(f"Already downloaded: {onlineFilename}")
        else:
            if onlineFilename in videosPartiallyDownloaded:
                print(f"Will be continued: {onlineFilename}")
            else:
                print(f"Will be downloaded: {onlineFilename}")
            videosToDownload.add(onlineFilename)

    # Download the missing videos
    print(f"→ Downloading {len(videosToDownload)} videos")

    os.chdir(args.videos)

    exit_code = 0
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        for onlineFilename, infos in videosInfos.items():
            if onlineFilename not in videosToDownload:
                continue

            # Really download
            if args.dryrun:
                print(f"Would download {onlineFilename}")
            else:
                # Apparently that thing is transformed from a LazyList
                # somewhere in the normal yt_dlp process
                if isinstance(infos["thumbnails"], youtube_dl.utils.LazyList):
                    infos["thumbnails"] = infos["thumbnails"].exhaust()
                try:
                    ydl.process_ie_result(infos, True, {})
                    markTracked(onlineFilename)
                # Narrowed from BaseException: a failed download should
                # not block Ctrl-C, and sys.exit below must propagate
                except Exception as e:
                    print(e)
                    exit_code = 1
                    continue

    sys.exit(exit_code)


if __name__ == "__main__":
    main()