dotfiles/config/scripts/rssVideos

288 lines
9 KiB
Plaintext
Raw Normal View History

2019-04-30 08:22:27 +02:00
#!/usr/bin/env python3
"""
Script that download videos that are linked as an article
in a RSS feed.
The common use case would be a feed from an RSS aggregator
with the unread items (non-video links are ignored).
"""
# TODO Distribute this correctly, in the meanwhile please do
# pip install --user youtube-dl ConfigArgParse progressbar2
2020-12-27 14:20:44 +01:00
# TODO Better logging (youtube-dl allow to pass loggers)
2019-04-30 08:22:27 +02:00
2020-12-27 14:20:44 +01:00
import sys
from typing import Dict, Set, Tuple
2019-04-30 08:22:27 +02:00
import urllib.request
import urllib.parse
import os
from xml.dom import minidom
import youtube_dl
import configargparse
2020-12-27 14:20:44 +01:00
def get_args() -> configargparse.Namespace:
    """
    Parse arguments from the command line, environment variables
    and the config file (in that order of precedence).

    Returns the parsed namespace with `videos` expanded to an absolute
    real path and `track` resolved relative to `videos` when it is not
    an absolute path.
    """
    defaultConfigPath = os.path.join(
        os.path.expanduser(os.getenv("XDG_CONFIG_PATH", "~/.config/")), "rssVideos"
    )
    parser = configargparse.ArgParser(
        description="Download videos linked in "
        + "a RSS feed (e.g. an unread feed from "
        + "an RSS aggregator)",
        default_config_files=[defaultConfigPath],
    )
    parser.add(
        "-c", "--config", required=False, is_config_file=True, help="Configuration file"
    )
    parser.add(
        "--feed",
        help="URL of the RSS feed (must be public for now)",
        env_var="RSS_VIDEOS_FEED",
        required=True,
    )
    parser.add(
        "--videos",
        help="Directory to store videos",
        env_var="RSS_VIDEOS_VIDEO_DIR",
        required=True,
    )
    parser.add(
        "-n",
        "--dryrun",
        help="Do not download the videos",
        action="store_const",
        const=True,
        default=False,
    )
    # TODO This feature might require additional documentation and an on/off switch
    parser.add(
        "--track",
        help="Directory where download videos are marked "
        + "to not download them after deletion.",
        env_var="RSS_VIDEOS_TRACK",
        required=False,
        default=".rssVideos",
    )
    parser.add(
        "--max-duration",
        help="Skip video longer than this amount of seconds",
        env_var="RSS_VIDEOS_MAX_DURATION",
        type=int,
        default=0,
    )
    parser.add(
        "--format",
        help="Use this format to download videos."
        + " See FORMAT SELECTION in youtube-dl(1)",
        env_var="RSS_VIDEOS_FORMAT",
        default="bestvideo+bestaudio/best",
    )
    parser.add(
        "--subtitles",
        help="Download all subtitles",
        env_var="RSS_VIDEOS_SUBTITLES",
        action="store_true",
    )

    args = parser.parse_args()
    # Normalize paths: videos is always absolute, track is resolved
    # relative to the videos directory when given as a relative path.
    args.videos = os.path.realpath(os.path.expanduser(args.videos))
    args.track = os.path.expanduser(args.track)
    if not os.path.isabs(args.track):
        args.track = os.path.realpath(os.path.join(args.videos, args.track))

    return args
2019-04-30 08:22:27 +02:00
2020-12-27 14:20:44 +01:00
def get_links(args: "configargparse.Namespace") -> Set[str]:
    """
    Fetch the feed XML from `args.feed` and return the set of
    <link> URLs found in its <item> elements.

    Items with a missing or empty <link> are reported and skipped.
    """
    links = set()

    with urllib.request.urlopen(args.feed) as request:
        with minidom.parse(request) as xmldoc:
            for item in xmldoc.getElementsByTagName("item"):
                try:
                    linkNode = item.getElementsByTagName("link")[0]
                    link: str = linkNode.childNodes[0].data
                    links.add(link)
                # Catch Exception (not BaseException) so Ctrl-C /
                # SystemExit are not swallowed per-item.
                except Exception as e:
                    print("Error while getting link from item:", e)
                    continue

    return links
2019-04-30 08:22:27 +02:00
2020-12-27 14:20:44 +01:00
def get_video_infos(
    args: "configargparse.Namespace", ydl_opts: Dict, links: Set[str]
) -> Dict[str, Dict]:
    """
    Filter out non-video links and store video download info
    keyed by the video's filename without extension.

    Videos longer than `args.max_duration` seconds are skipped
    when that option is > 0.
    """
    videosInfos = dict()

    # Simulate only: fetch metadata without downloading anything.
    dry_ydl_opts = ydl_opts.copy()
    dry_ydl_opts.update({"simulate": True, "quiet": True})
    with youtube_dl.YoutubeDL(dry_ydl_opts) as ydl:
        for link in links:
            print(f"Researching {link}...")
            try:
                infos = ydl.extract_info(link)
                if args.max_duration > 0 and infos["duration"] > args.max_duration:
                    print(
                        f"{infos['title']}: Skipping as longer than max duration: "
                        f"{infos['duration']} > {args.max_duration}"
                    )
                    continue
                filepath = ydl.prepare_filename(infos)
                filename, extension = os.path.splitext(filepath)
                videosInfos[filename] = infos
                print(f"{infos['title']}: Added")
            # Non-video links make extract_info raise; catch Exception
            # (not BaseException) so Ctrl-C still aborts the run.
            except Exception as e:
                print(e)
                continue

    return videosInfos
2019-04-30 08:22:27 +02:00
2020-12-27 14:20:44 +01:00
def get_downloaded_videos(
    args: "configargparse.Namespace", videosInfos: Dict[str, Dict]
) -> Tuple[Set[str], Set[str]]:
    """
    Read the videos directory content and classify it against the
    download list: returns (fully downloaded, partially downloaded)
    video names. Files unrelated to any wanted video are deleted.
    """
    videosDownloaded = set()
    videosPartiallyDownloaded = set()

    for filepath in os.listdir(args.videos):
        fullpath = os.path.join(args.videos, filepath)
        if not os.path.isfile(fullpath):
            continue
        filename, extension = os.path.splitext(filepath)
        for onlineFilename in videosInfos.keys():
            # Full name already there: completely downloaded
            # → remove from the download list
            if filename == onlineFilename:
                videosDownloaded.add(onlineFilename)
                break
            elif filename.startswith(onlineFilename):
                # Subtitle file (e.g. "name.en.vtt"): check the split-off
                # extension — the basename keeps the language suffix, so
                # the old `filename.endswith(".vtt")` could never match.
                # → ignore
                if extension == ".vtt":
                    break
                # Partial name already there: not completely downloaded
                # → keep on the download list
                videosPartiallyDownloaded.add(onlineFilename)
                break
        # Unrelated filename: delete (say which one, the original
        # f-string had no placeholder)
        else:
            print(f"Deleting: {filepath}")
            os.unlink(fullpath)

    return videosDownloaded, videosPartiallyDownloaded
def get_tracked_videos(args: configargparse.Namespace, known: Set[str]) -> Set[str]:
    """
    Return videos previously downloaded (=tracked) amongst the unread videos.
    This is stored in the tracking directory as empty extension-less files.
    Other tracking markers (e.g. for now read videos) are deleted.
    """
    tracked: Set[str] = set()
    for entry in os.listdir(args.track):
        marker = os.path.join(args.track, entry)
        # Markers are plain files named exactly after the video
        # (no extension), so the entry name is compared directly.
        if not os.path.isfile(marker):
            continue
        if entry in known:
            tracked.add(entry)
        else:
            # Stale marker: its video is no longer unread — drop it.
            os.unlink(marker)
    return tracked
2019-04-30 08:22:27 +02:00
2020-12-27 14:20:44 +01:00
def main() -> None:
    """
    Entry point: fetch the RSS feed, collect video metadata, reconcile
    the videos directory with the unread list, then download what is
    missing. Exits with status 1 if any download failed.
    """
    args = get_args()
    os.makedirs(args.videos, exist_ok=True)
    os.makedirs(args.track, exist_ok=True)
    ydl_opts = {"format": args.format, "allsubtitles": args.subtitles}
    print("→ Retrieveing RSS feed")
    links = get_links(args)
    print(f"→ Getting infos on {len(links)} unread articles")
    videosInfos = get_video_infos(args, ydl_opts, links)
    print(f"→ Deciding on what to do for {len(videosInfos)} videos")
    videosDownloaded, videosPartiallyDownloaded = get_downloaded_videos(
        args, videosInfos
    )
    videosTracked = get_tracked_videos(args, set(videosInfos.keys()))

    # Deciding for the rest based on the informations
    def markTracked(filename: str) -> None:
        # Create an empty marker file named after the video.
        # Bug fix: use the parameter, not the enclosing loop variable
        # `onlineFilename` (it only worked because every call site
        # happened to pass that very variable).
        markerPath = os.path.join(args.track, filename)
        open(markerPath, "a").close()

    videosToDownload: Set[str] = set()
    videosReads: Set[str] = set()
    for onlineFilename in videosInfos.keys():
        # If the video was once downloaded but manually deleted,
        # the marker should be left
        if onlineFilename in videosTracked:
            print(f"Should be marked as read: {onlineFilename}")
            # TODO Automatically do that one day maybe?
            # Need to login to the FreshRSS API and keep track of
            # the item id along the process
            videosReads.add(onlineFilename)
        elif onlineFilename in videosDownloaded:
            markTracked(onlineFilename)
            print(f"Already downloaded: {onlineFilename}")
        else:
            if onlineFilename in videosPartiallyDownloaded:
                print(f"Will be continued: {onlineFilename}")
            else:
                print(f"Will be downloaded: {onlineFilename}")
            videosToDownload.add(onlineFilename)

    # Download the missing videos
    print(f"→ Downloading {len(videosToDownload)} videos")
    os.chdir(args.videos)

    exit_code = 0
    if not args.dryrun:
        # TODO Progressbar one day maybe?
        # We have all the info we need to make a reliable one
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            for onlineFilename in videosToDownload:
                infos = videosInfos[onlineFilename]

                # Really download
                try:
                    ydl.process_ie_result(infos, True, {})
                    markTracked(onlineFilename)
                # Keep going on per-video failures but report them in
                # the exit code; Exception (not BaseException) so that
                # Ctrl-C still aborts the whole run.
                except Exception as e:
                    print(e)
                    exit_code = 1
                    continue

    sys.exit(exit_code)
if __name__ == "__main__":
main()