rssVideos: Use yt-dlp and download oldest first

This commit is contained in:
Geoffrey Frogeye 2021-10-17 14:29:25 +02:00
parent cc79262336
commit 8743082b21

View file

@ -8,17 +8,16 @@ with the unread items (non-video links are ignored).
"""
# TODO Distribute this correctly; in the meantime please do
# pip install --user youtube-dl ConfigArgParse progressbar2
# pip install --user yt-dlp ConfigArgParse
# TODO Better logging (youtube-dl allows passing loggers)
import sys
from typing import Dict, Set, Tuple
import urllib.request
import urllib.parse
import os
from xml.dom import minidom
import youtube_dl
import yt_dlp as youtube_dl
import configargparse
@ -95,18 +94,19 @@ def get_args() -> configargparse.Namespace:
return args
def get_links(args: configargparse.Namespace) -> Set[str]:
def get_links(args: configargparse.Namespace) -> list[str]:
"""
Read the feed XML, get the links
"""
links = set()
links = list()
with urllib.request.urlopen(args.feed) as request:
with minidom.parse(request) as xmldoc:
for item in xmldoc.getElementsByTagName("item"):
try:
linkNode = item.getElementsByTagName("link")[0]
link: str = linkNode.childNodes[0].data
links.add(link)
if link not in links:
links.append(link)
except BaseException as e:
print("Error while getting link from item:", e)
continue
@ -114,8 +114,8 @@ def get_links(args: configargparse.Namespace) -> Set[str]:
def get_video_infos(
args: configargparse.Namespace, ydl_opts: Dict, links: Set[str]
) -> Dict[str, Dict]:
args: configargparse.Namespace, ydl_opts: dict, links: list[str]
) -> dict[str, dict]:
"""
Filter out non-video links and store video download info
and associated filename
@ -148,8 +148,8 @@ def get_video_infos(
def get_downloaded_videos(
args: configargparse.Namespace, videosInfos: Dict[str, Dict]
) -> Tuple[Set[str], Set[str]]:
args: configargparse.Namespace, videosInfos: dict[str, dict]
) -> tuple[set[str], set[str]]:
videosDownloaded = set()
videosPartiallyDownloaded = set()
"""
@ -187,7 +187,7 @@ def get_downloaded_videos(
return videosDownloaded, videosPartiallyDownloaded
def get_tracked_videos(args: configargparse.Namespace, known: Set[str]) -> Set[str]:
def get_tracked_videos(args: configargparse.Namespace, known: set[str]) -> set[str]:
"""
Return videos previously downloaded (=tracked) amongst the unread videos.
This is stored in the tracking directory as empty extension-less files.
@ -220,6 +220,8 @@ def main() -> None:
print("→ Retrieveing RSS feed")
links = get_links(args)
# Oldest first
links = links[::-1]
print(f"→ Getting infos on {len(links)} unread articles")
videosInfos = get_video_infos(args, ydl_opts, links)
@ -236,8 +238,8 @@ def main() -> None:
markerPath = os.path.join(args.track, onlineFilename)
open(markerPath, "a").close()
videosToDownload: Set[str] = set()
videosReads: Set[str] = set()
videosToDownload: set[str] = set()
videosReads: set[str] = set()
for onlineFilename in videosInfos.keys():
# If the video was once downloaded but manually deleted,
# the marker should be left
@ -263,14 +265,19 @@ def main() -> None:
os.chdir(args.videos)
exit_code = 0
if not args.dryrun:
# TODO Progressbar one day maybe?
# We have all the info we need to make a reliable one
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
for onlineFilename in videosToDownload:
infos = videosInfos[onlineFilename]
for onlineFilename, infos in videosInfos.items():
if onlineFilename not in videosToDownload:
continue
# Really download
if args.dryrun:
print(f"Would download {onlineFilename}")
else:
# Apparently that thing is transformed from a LazyList
# somewhere in the normal yt_dlp process
if isinstance(infos["thumbnails"], youtube_dl.utils.LazyList):
infos["thumbnails"] = infos["thumbnails"].exhaust()
try:
ydl.process_ie_result(infos, True, {})