2019-04-30 08:22:27 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
"""
|
|
|
|
Script that downloads videos that are linked as articles
|
|
|
|
in an RSS feed.
|
|
|
|
The common use case would be a feed from an RSS aggregator
|
|
|
|
with the unread items (non-video links are ignored).
|
|
|
|
"""
|
|
|
|
|
|
|
|
# TODO Distribute this correctly; in the meanwhile please do
|
2021-10-17 14:29:25 +02:00
|
|
|
# pip install --user yt-dlp ConfigArgParse
|
2019-04-30 08:22:27 +02:00
|
|
|
|
2020-12-27 14:20:44 +01:00
|
|
|
# TODO Better logging (youtube-dl allow to pass loggers)
|
2019-04-30 08:22:27 +02:00
|
|
|
|
2020-12-27 14:20:44 +01:00
|
|
|
import sys
|
2019-04-30 08:22:27 +02:00
|
|
|
import urllib.request
|
|
|
|
import urllib.parse
|
|
|
|
import os
|
|
|
|
from xml.dom import minidom
|
2021-10-17 14:29:25 +02:00
|
|
|
import yt_dlp as youtube_dl
|
2019-04-30 08:22:27 +02:00
|
|
|
import configargparse
|
|
|
|
|
|
|
|
|
2020-12-27 14:20:44 +01:00
|
|
|
def get_args() -> configargparse.Namespace:
    """
    Parse command line arguments, environment variables and the optional
    configuration file into a single namespace.

    The videos directory is expanded to a real absolute path; a relative
    --track directory is resolved against the videos directory.
    """
    defaultConfigPath = os.path.join(
        os.path.expanduser(os.getenv("XDG_CONFIG_PATH", "~/.config/")), "rssVideos"
    )

    parser = configargparse.ArgParser(
        description="Download videos linked in "
        + "a RSS feed (e.g. an unread feed from "
        + "an RSS aggregator)",  # fixed: the parenthesis was never closed
        default_config_files=[defaultConfigPath],
    )
    parser.add(
        "-c", "--config", required=False, is_config_file=True, help="Configuration file"
    )
    parser.add(
        "--feed",
        help="URL of the RSS feed (must be public for now)",
        env_var="RSS_VIDEOS_FEED",
        required=True,
    )
    parser.add(
        "--videos",
        help="Directory to store videos",
        env_var="RSS_VIDEOS_VIDEO_DIR",
        required=True,
    )
    parser.add(
        "-n",
        "--dryrun",
        help="Do not download the videos",
        action="store_const",
        const=True,
        default=False,
    )
    # TODO This feature might require additional documentation and an on/off switch
    parser.add(
        "--track",
        help="Directory where download videos are marked "
        + "to not download them after deletion.",
        env_var="RSS_VIDEOS_TRACK",
        required=False,
        default=".rssVideos",
    )
    parser.add(
        "--max-duration",
        help="Skip video longer than this amount of seconds",
        env_var="RSS_VIDEOS_MAX_DURATION",
        type=int,
        default=0,
    )
    parser.add(
        "--format",
        help="Use this format to download videos."
        + " See FORMAT SELECTION in youtube-dl(1)",
        env_var="RSS_VIDEOS_FORMAT",
        default="bestvideo+bestaudio/best",
    )
    parser.add(
        "--subtitles",
        help="Download all subtitles",
        env_var="RSS_VIDEOS_SUBTITLES",
        action="store_true",
    )

    args = parser.parse_args()
    # Normalize: videos is always a real, absolute, user-expanded path
    args.videos = os.path.realpath(os.path.expanduser(args.videos))
    # A relative track directory lives inside the videos directory
    args.track = os.path.expanduser(args.track)
    if not os.path.isabs(args.track):
        args.track = os.path.realpath(os.path.join(args.videos, args.track))

    return args
|
2019-04-30 08:22:27 +02:00
|
|
|
|
|
|
|
|
2021-10-17 14:29:25 +02:00
|
|
|
def get_links(args: configargparse.Namespace) -> list[str]:
    """
    Read the feed XML and return the list of article links.

    Links are deduplicated while preserving feed order. Items without a
    usable <link> element are reported and skipped.
    """
    links: list[str] = list()
    with urllib.request.urlopen(args.feed) as request:
        with minidom.parse(request) as xmldoc:
            for item in xmldoc.getElementsByTagName("item"):
                try:
                    linkNode = item.getElementsByTagName("link")[0]
                    link: str = linkNode.childNodes[0].data
                    if link not in links:
                        links.append(link)
                # Exception, not BaseException: a malformed item should be
                # skipped, but KeyboardInterrupt/SystemExit must still
                # abort the whole run.
                except Exception as e:
                    print("Error while getting link from item:", e)
                    continue
    return links
|
2019-04-30 08:22:27 +02:00
|
|
|
|
|
|
|
|
2020-12-27 14:20:44 +01:00
|
|
|
def get_video_infos(
    args: configargparse.Namespace, ydl_opts: dict, links: list[str]
) -> dict[str, dict]:
    """
    Filter out non-video links and store video download info
    and associated filename.

    Returns a dict mapping the extension-less target filename to the
    youtube-dl info dict for that video. Videos longer than
    args.max_duration (when > 0) are skipped.
    """
    videosInfos = dict()

    # Probe the links without downloading anything
    dry_ydl_opts = ydl_opts.copy()
    dry_ydl_opts.update({"simulate": True, "quiet": True})
    with youtube_dl.YoutubeDL(dry_ydl_opts) as ydl:
        for link in links:
            print(f"Researching {link}...")
            try:
                infos = ydl.extract_info(link)
                # Some entries (e.g. live streams) expose no duration;
                # treat a missing duration as "no limit applies" instead
                # of crashing out of the try with a KeyError.
                duration = infos.get("duration")
                if args.max_duration > 0 and duration and duration > args.max_duration:
                    print(
                        f"{infos['title']}: Skipping as longer than max duration: "
                        f"{infos['duration']} > {args.max_duration}"
                    )
                    continue
                filepath = ydl.prepare_filename(infos)
                filename, extension = os.path.splitext(filepath)
                videosInfos[filename] = infos
                print(f"{infos['title']}: Added")

            # Exception, not BaseException: non-video links fail here and
            # are skipped, but Ctrl-C must still abort the run.
            except Exception as e:
                print(e)
                continue

    return videosInfos
|
2019-04-30 08:22:27 +02:00
|
|
|
|
|
|
|
|
2020-12-27 14:20:44 +01:00
|
|
|
def get_downloaded_videos(
    args: configargparse.Namespace, videosInfos: dict[str, dict]
) -> tuple[set[str], set[str]]:
    """
    Read the videos directory content, delete everything that's not a
    video on the download list or already downloaded.

    Returns (videosDownloaded, videosPartiallyDownloaded): sets of
    extension-less filenames matching the keys of videosInfos.
    """
    videosDownloaded = set()
    videosPartiallyDownloaded = set()

    for filepath in os.listdir(args.videos):
        fullpath = os.path.join(args.videos, filepath)
        # Only plain files are considered; directories are left alone
        if not os.path.isfile(fullpath):
            continue
        filename, extension = os.path.splitext(filepath)

        for onlineFilename in videosInfos.keys():
            # Full name already there: completely downloaded
            # → remove from the download list
            if filename == onlineFilename:
                videosDownloaded.add(onlineFilename)
                break
            elif filename.startswith(onlineFilename):
                # Subtitle file → ignore.
                # splitext puts the suffix in `extension`, so test that;
                # the previous `filename.endswith(".vtt")` could never
                # match and misclassified subtitles as partial downloads.
                if extension == ".vtt":
                    break

                # Partial name already there: not completely downloaded
                # → keep on the download list
                videosPartiallyDownloaded.add(onlineFilename)
                break
        # Unrelated filename: delete
        else:
            # Name the victim: the old message was a placeholder-less
            # f-string printing the literal "(unknown)"
            print(f"Deleting: {filepath}")
            os.unlink(fullpath)

    return videosDownloaded, videosPartiallyDownloaded
|
|
|
|
|
|
|
|
|
2021-10-17 14:29:25 +02:00
|
|
|
def get_tracked_videos(args: configargparse.Namespace, known: set[str]) -> set[str]:
    """
    Return videos previously downloaded (=tracked) amongst the unread videos.

    Markers live in the tracking directory as empty, extension-less files.
    Any marker that no longer matches an unread video is deleted.
    """
    still_relevant = set()

    for entry in os.listdir(args.track):
        marker_path = os.path.join(args.track, entry)
        # Skip anything that is not a plain file (e.g. sub-directories)
        if not os.path.isfile(marker_path):
            continue

        # Marker names carry no extension, so they compare directly
        # against the known (unread) video names
        if entry not in known:
            os.unlink(marker_path)
            continue

        still_relevant.add(entry)

    return still_relevant
|
2019-04-30 08:22:27 +02:00
|
|
|
|
|
|
|
|
2020-12-27 14:20:44 +01:00
|
|
|
def main() -> None:
    """
    Fetch the feed, decide which linked videos are new, already
    downloaded or previously read, then download the missing ones.

    Exits with status 1 if any download failed.
    """
    args = get_args()

    os.makedirs(args.videos, exist_ok=True)
    os.makedirs(args.track, exist_ok=True)
    ydl_opts = {"format": args.format, "allsubtitles": args.subtitles}

    print("→ Retrieving RSS feed")
    links = get_links(args)
    # Oldest first
    links = links[::-1]

    print(f"→ Getting infos on {len(links)} unread articles")
    videosInfos = get_video_infos(args, ydl_opts, links)

    print(f"→ Deciding on what to do for {len(videosInfos)} videos")
    videosDownloaded, videosPartiallyDownloaded = get_downloaded_videos(
        args, videosInfos
    )
    videosTracked = get_tracked_videos(args, set(videosInfos.keys()))

    # Deciding for the rest based on the informations

    def markTracked(filename: str) -> None:
        # Create an empty marker file so the video is not re-downloaded
        # after being watched and deleted.
        # Fixed: use the parameter instead of the `onlineFilename` loop
        # variable that leaked in from the enclosing scope.
        markerPath = os.path.join(args.track, filename)
        open(markerPath, "a").close()

    videosToDownload: set[str] = set()
    videosReads: set[str] = set()
    for onlineFilename in videosInfos.keys():
        # If the video was once downloaded but manually deleted,
        # the marker should be left
        if onlineFilename in videosTracked:
            print(f"Should be marked as read: {onlineFilename}")
            # TODO Automatically do that one day maybe?
            # Need to login to the FreshRSS API and keep track of
            # the item id along the process
            videosReads.add(onlineFilename)
        elif onlineFilename in videosDownloaded:
            markTracked(onlineFilename)
            print(f"Already downloaded: {onlineFilename}")
        else:
            if onlineFilename in videosPartiallyDownloaded:
                print(f"Will be continued: {onlineFilename}")
            else:
                print(f"Will be downloaded: {onlineFilename}")
            videosToDownload.add(onlineFilename)

    # Download the missing videos
    print(f"→ Downloading {len(videosToDownload)} videos")

    # yt-dlp writes to the current directory
    os.chdir(args.videos)

    exit_code = 0
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        for onlineFilename, infos in videosInfos.items():
            if onlineFilename not in videosToDownload:
                continue

            # Really download
            if args.dryrun:
                print(f"Would download {onlineFilename}")
            else:
                # Apparently that thing is transformed from a LazyList
                # somewhere in the normal yt_dlp process
                if isinstance(infos["thumbnails"], youtube_dl.utils.LazyList):
                    infos["thumbnails"] = infos["thumbnails"].exhaust()
                try:
                    ydl.process_ie_result(infos, True, {})

                    markTracked(onlineFilename)
                # Exception, not BaseException: a failed download marks
                # the run as failed, but Ctrl-C still aborts immediately.
                except Exception as e:
                    print(e)
                    exit_code = 1
                    continue

    sys.exit(exit_code)
|
|
|
|
|
|
|
|
|
|
|
|
# Entry point guard: only run when executed as a script, not on import.
if __name__ == "__main__":
    main()
|