dotfiles/config/scripts/rssVideos

#!/usr/bin/env python3
"""
Script that downloads videos linked as articles in an RSS feed.
The common use case is a feed of unread items from an RSS
aggregator (non-video links are ignored).
"""
import enum
import functools
import logging
import os
import pickle
import random
import re
import subprocess
import sys
import time
import typing
import urllib.parse
import urllib.request
import urllib.error
from xml.dom import minidom
import coloredlogs
import configargparse
import yt_dlp
log = logging.getLogger(__name__)
# TODO Lockfile, or a way to parallel watch and download
# TODO Save ytdl infos and view info separately
def configure_logging(args: configargparse.Namespace) -> None:
# Configure logging
if args.verbosity:
coloredlogs.install(
level=args.verbosity,
)
else:
coloredlogs.install(
fmt="%(message)s",
logger=log,
)
class SaveInfoPP(yt_dlp.postprocessor.common.PostProcessor):
"""
    yt_dlp.process_ie_result() doesn't return a completely updated info dict;
    notably, the extension is still the one from before yt-dlp realizes the
    files cannot be merged. So we use this PostProcessor to catch the info
    dict in its final form and save what we need from it (it's not
    serializable in this state).
"""
def __init__(self, rvelement: "RVElement") -> None:
self.rvelement = rvelement
super().__init__()
def run(self, info: dict) -> tuple[list, dict]:
self.rvelement.update_post_download(info)
return [], info
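# Durations are written "<number>[s|m|h]", seconds being the default unit:
# parse_duration("90") == parse_duration("90s") == 90,
# parse_duration("5m") == 300, parse_duration("2h") == 7200.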
def parse_duration(string: str) -> int:
DURATION_MULTIPLIERS = {"s": 1, "m": 60, "h": 3600, "": 1}
mult_index = string[-1].lower()
if mult_index.isdigit():
mult_index = ""
else:
string = string[:-1]
try:
multiplier = DURATION_MULTIPLIERS[mult_index]
    except KeyError:
raise ValueError(f"Unknown duration multiplier: {mult_index}")
return int(string) * multiplier
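# A duration filter is a duration prefixed with an optional comparator:
# "+5m" (or ">5m") matches durations over 5 minutes, "-5m" (or "<5m") under,
# "=300" exactly 300 seconds, and a bare "5m" means at most 5 minutes.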
def compare_duration(compstr: str) -> typing.Callable[[int], bool]:
DURATION_COMPARATORS = {
"<": int.__lt__,
"-": int.__lt__,
">": int.__gt__,
"+": int.__gt__,
"=": int.__eq__,
"": int.__le__,
}
comp_index = compstr[0]
if comp_index.isdigit():
comp_index = ""
else:
compstr = compstr[1:]
try:
comparator = DURATION_COMPARATORS[comp_index]
    except KeyError:
raise ValueError(f"Unknown duration comparator: {comp_index}")
duration = parse_duration(compstr)
return lambda d: comparator(d, duration)
def format_duration(duration: int) -> str:
return time.strftime("%H:%M:%S", time.gmtime(duration))
class RVElement:
parent: "RVDatabase"
item: minidom.Element
downloaded_filepath: typing.Optional[str]
watched: bool
def __init__(self, parent: "RVDatabase", item: minidom.Element) -> None:
self.parent = parent
self.item = item
self.downloaded_filepath = None
self.watched = False
def get_tag_data(self, tag_name: str) -> str:
nodes = self.item.getElementsByTagName(tag_name)
if len(nodes) != 1:
raise KeyError(f"Exepected 1 tag `{tag_name}`, got {len(nodes)}.")
children = nodes[0].childNodes
if len(children) != 1:
raise KeyError(
f"Exepected 1 children for tag `{tag_name}`, got {len(children)}."
)
return children[0].data
@property
def title(self) -> str:
return self.get_tag_data("title")
@property
def link(self) -> str:
return self.get_tag_data("link")
@property
def creator(self) -> typing.Optional[str]:
try:
return self.get_tag_data("dc:creator")
except KeyError:
return None
@property
def description(self) -> str:
# TODO Testing
return self.get_tag_data("description")
@property
def date(self) -> str:
# TODO datetime format
return self.get_tag_data("pubDate")
@property
def guid(self) -> int:
return int(self.get_tag_data("guid"))
@property
def is_researched(self) -> bool:
return "ytdl_infos" in self.__dict__
def salvage_cache(self, cache: "RVElement") -> None:
if not self.parent.args.research and cache.is_researched:
self.__dict__["ytdl_infos"] = cache.__dict__["ytdl_infos"]
log.debug(f"From cache: {self}")
if cache.downloaded_filepath:
self.downloaded_filepath = cache.downloaded_filepath
if cache.watched:
self.watched = True
    def __str__(self) -> str:
        text = f"{self.guid}: {self.creator if self.creator else '?'} {self.title}"
        if self.is_researched:
            if self.is_video:
                text += f" ({format_duration(self.duration)})"
            else:
                text += " (N/A)"
        else:
            text += " (?)"
        text += f" {self.link}"
        return text
@property
def downloaded(self) -> bool:
if not self.is_researched:
return False
return os.path.isfile(self.filepath)
@functools.cached_property
def ytdl_infos(self) -> typing.Optional[dict]:
log.info(f"Researching: {self}")
try:
infos = self.parent.ytdl_dry.extract_info(self.link, download=False)
except KeyboardInterrupt as e:
raise e
except yt_dlp.utils.DownloadError as e:
# TODO Still raise in case of temporary network issue
log.warning(e)
infos = None
if infos:
infos = self.parent.ytdl_dry.sanitize_info(infos)
# Save database once it's been computed
self.__dict__["ytdl_infos"] = infos
self.parent.save()
return infos
@property
def duration(self) -> int:
assert self.is_video
assert self.ytdl_infos
return self.ytdl_infos["duration"]
@property
def is_video(self) -> bool:
# Duration might be missing in playlists and stuff
return self.ytdl_infos is not None and "duration" in self.ytdl_infos
@property
def filepath(self) -> str:
assert self.is_video
if self.downloaded_filepath:
return self.downloaded_filepath
return self.parent.ytdl_dry.prepare_filename(self.ytdl_infos)
@property
def filename(self) -> str:
assert self.is_video
return os.path.splitext(self.filepath)[0]
def download(self) -> None:
assert self.is_video
log.info(f"Downloading: {self}")
if not self.parent.args.dryrun:
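            # Reuse the info dict fetched during research rather than
            # re-extracting it; SaveInfoPP captures its final form
            # (see its docstring above).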
with yt_dlp.YoutubeDL(self.parent.ytdl_opts) as ydl:
ydl.add_post_processor(SaveInfoPP(self))
ydl.process_ie_result(self.ytdl_infos, download=True)
self.parent.save()
def update_post_download(self, info: dict) -> None:
self.downloaded_filepath = self.parent.ytdl_dry.prepare_filename(info)
@property
def was_downloaded(self) -> bool:
return self.downloaded_filepath is not None
def preload(self) -> None:
assert self.is_video
if self.downloaded:
log.debug(f"Currently downloaded: {self}")
return
if self.was_downloaded:
log.debug(f"Downloaded previously: {self}")
return
self.download()
def matches_filter(self, args: configargparse.Namespace) -> bool:
# Inexpensive filters
if args.seen != "any" and (args.seen == "seen") != self.watched:
log.debug(f"Not {args.seen}: {self}")
return False
if args.title and not re.search(args.title, self.title):
log.debug(f"Title not matching {args.title}: {self}")
return False
if args.guid and not re.search(args.guid, str(self.guid)):
log.debug(f"Guid not matching {args.guid}: {self}")
return False
if args.link and not re.search(args.link, self.link):
log.debug(f"Link not matching {args.link}: {self}")
return False
if args.creator and (
not self.creator or not re.search(args.creator, self.creator)
):
log.debug(f"Creator not matching {args.creator}: {self}")
return False
# Expensive filters
if not self.is_video:
log.debug(f"Not a video: {self}")
return False
if args.duration and not compare_duration(args.duration)(self.duration):
log.debug(
f"Duration {self.duration} not matching {args.duration}: {self}"
)
return False
return True
def watch(self) -> None:
if not self.downloaded:
self.download()
cmd = ["mpv", self.filepath]
log.debug(f"Running {cmd}")
if not self.parent.args.dryrun:
proc = subprocess.run(cmd)
proc.check_returncode()
self.watched = True
self.parent.save()
def clean(self) -> None:
assert self.is_video
log.info(f"Removing gone video: {self.filename}*")
for file in os.listdir():
if file.startswith(self.filename):
log.debug(f"Removing file: {file}")
if not self.parent.args.dryrun:
os.unlink(file)
class RVDatabase:
SAVE_FILE = ".cache.p"
args: configargparse.Namespace
elements: list[RVElement]
def __init__(self, args: configargparse.Namespace) -> None:
self.args = args
def save(self) -> None:
log.debug("Saving cache")
if self.args.dryrun:
return
with open(self.SAVE_FILE, "wb") as save_file:
pickle.dump(self, save_file)
@classmethod
def load(cls) -> typing.Optional["RVDatabase"]:
try:
with open(cls.SAVE_FILE, "rb") as save_file:
return pickle.load(save_file)
except (TypeError, AttributeError, EOFError):
log.warning("Corrupt / outdated cache, it will be rebuilt.")
except FileNotFoundError:
pass
return None
def salvage_cache(self, cache: "RVDatabase") -> None:
log.debug(f"Salvaging cache")
cache_els = dict()
for cache_el in cache.elements:
cache_els[cache_el.guid] = cache_el
for el in self.elements:
if el.guid in cache_els:
el.salvage_cache(cache_els[el.guid])
def clean_cache(self, cache: "RVDatabase") -> None:
log.debug(f"Cleaning cache")
self_els = dict()
for self_el in self.elements:
self_els[self_el.guid] = self_el
for el in cache.elements:
if el.guid not in self_els:
if el.is_researched and el.is_video:
el.clean()
def import_cache(self, cache: "RVDatabase") -> None:
log.debug(f"Importing cache")
self.feed_xml = cache.feed_xml
self.read_feed()
@functools.cached_property
def feed_xml(self) -> minidom.Document:
log.info("Fetching RSS feed")
with urllib.request.urlopen(self.args.feed) as request:
return minidom.parse(request)
def read_feed(self) -> None:
self.elements = []
for item in self.feed_xml.getElementsByTagName("item"):
element = RVElement(self, item)
self.elements.insert(0, element)
log.debug(f"Known: {element}")
def clean(self) -> None:
log.debug("Cleaning")
filenames = set()
for element in self.elements:
if element.is_video:
filenames.add(element.filename)
for file in os.listdir():
if file == RVDatabase.SAVE_FILE:
continue
if not os.path.isfile(file):
continue
for filename in filenames:
if file.startswith(filename):
break
else:
log.info(f"Removing unknown file: {file}")
if not self.args.dryrun:
os.unlink(file)
@property
def all_researched(self) -> bool:
for element in self.elements:
if not element.is_researched:
return False
return True
def attempt_clean(self) -> None:
if self.all_researched:
self.clean()
@property
def ytdl_opts(self) -> dict:
return {"format": self.args.format, "allsubtitles": self.args.subtitles}
@property
def ytdl_dry_opts(self) -> dict:
opts = self.ytdl_opts.copy()
opts.update({"quiet": True})
return opts
@property
def ytdl_dry(self) -> yt_dlp.YoutubeDL:
return yt_dlp.YoutubeDL(self.ytdl_dry_opts)
def filter(self, args: configargparse.Namespace) -> typing.Iterable[RVElement]:
elements: typing.Iterable[RVElement]
# Inexpensive sort
if args.order == "new":
elements = reversed(self.elements)
elif args.order == "title":
elements = sorted(self.elements, key=lambda el: el.title)
elif args.order == "creator":
elements = sorted(self.elements, key=lambda el: el.creator or "")
elif args.order == "link":
elements = sorted(self.elements, key=lambda el: el.link)
elif args.order == "random":
elements_random = self.elements.copy()
random.shuffle(elements_random)
elements = elements_random
else:
elements = self.elements
# Possibly expensive filtering
elements = filter(lambda el: el.matches_filter(args), elements)
# Expensive sort
if args.order == "short":
elements = sorted(
elements, key=lambda el: el.duration if el.is_video else 0
)
elif args.order == "long":
elements = sorted(
elements, key=lambda el: el.duration if el.is_video else 0, reverse=True
)
# Post sorting filtering
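        # Greedy selection: repeatedly take the first remaining video that
        # still fits in the time budget, until nothing fits.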
if args.total_duration:
rem = parse_duration(args.total_duration)
old_els = list(elements)
elements = list()
while rem > 0:
for el in old_els:
if el.duration < rem:
elements.append(el)
rem -= el.duration
old_els.remove(el)
break
else:
break
return elements
def get_args() -> configargparse.Namespace:
defaultConfigPath = os.path.join(
os.path.expanduser(os.getenv("XDG_CONFIG_PATH", "~/.config/")), "rssVideos"
)
parser = configargparse.ArgParser(
description="Download videos linked in "
+ "a RSS feed (e.g. an unread feed from "
+ "an RSS aggregator",
default_config_files=[defaultConfigPath],
)
# Runtime settings
parser.add_argument(
"-v",
"--verbosity",
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
default=None,
help="Verbosity of log messages",
)
parser.add(
"-c", "--config", required=False, is_config_file=True, help="Configuration file"
)
parser.add(
"-n",
"--dryrun",
help="Only pretend to do actions",
action="store_const",
const=True,
default=False,
)
# Input/Output
parser.add(
"--feed",
help="URL of the RSS feed (must be public for now)",
env_var="RSS_VIDEOS_FEED",
required=True,
)
parser.add(
"--research",
help="Fetch video info again",
action="store_true",
)
parser.add(
"--no-refresh",
dest="refresh",
help="Don't fetch feed",
action="store_false",
)
parser.add(
"--videos",
help="Directory to store videos",
env_var="RSS_VIDEOS_VIDEO_DIR",
required=True,
)
# Which videos
parser.add(
"--order",
choices=("old", "new", "title", "creator", "link", "short", "long", "random"),
default="old",
help="Sorting mechanism",
)
parser.add("--guid", help="Regex to filter guid")
parser.add("--creator", help="Regex to filter by creator")
parser.add("--title", help="Regex to filter by title")
parser.add("--link", help="Regex to filter by link")
parser.add("--duration", help="Comparative to filter by duration")
parser.add(
"--seen",
choices=("seen", "unseen", "any"),
default="unseen",
help="Only include seen/unseen/any videos",
)
parser.add(
"--total-duration",
help="Use videos that fit under the total given",
)
    # TODO Environment variables
parser.add(
"--max-duration",
help="(Deprecated, use --duration instead)",
env_var="RSS_VIDEOS_MAX_DURATION",
type=int,
default=0,
)
# TODO Allow to ask
# How to download
parser.add(
"--format",
help="Use this format to download videos."
+ " See FORMAT SELECTION in youtube-dl(1)",
env_var="RSS_VIDEOS_FORMAT",
default="bestvideo+bestaudio/best",
)
parser.add(
"--subtitles",
help="Download all subtitles",
env_var="RSS_VIDEOS_SUBTITLES",
action="store_true",
)
parser.add(
"action",
nargs="?",
choices=(
"download",
"list",
"watch",
"binge",
"clean",
"seen",
"unseen",
),
default="download",
)
args = parser.parse_args()
args.videos = os.path.realpath(os.path.expanduser(args.videos))
if not args.duration and args.max_duration:
args.duration = str(args.max_duration)
return args
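# A minimal example configuration file (~/.config/rssVideos unless
# $XDG_CONFIG_PATH points elsewhere), with hypothetical values:
#   feed = https://rss.example.org/unread.rss
#   videos = ~/Videos/rss
#   format = bestvideo[height<=?720]+bestaudio/best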
def main() -> None:
args = get_args()
configure_logging(args)
os.makedirs(args.videos, exist_ok=True)
os.chdir(args.videos)
database = RVDatabase(args)
cache = RVDatabase.load()
feed_fetched = False
if args.refresh:
try:
database.read_feed()
feed_fetched = True
except urllib.error.URLError as err:
if args.action == "download":
raise RuntimeError("Couldn't fetch feed, refusing to download")
# This is a quirky failsafe in case of no internet connection,
# so the script doesn't go noting that no element is a video.
if not feed_fetched:
if cache:
log.warning("Using cached feed.")
database.import_cache(cache)
else:
raise FileNotFoundError("Feed not fetched and no cached feed.")
if cache:
database.salvage_cache(cache)
database.clean_cache(cache)
database.save()
log.debug(f"Running action")
if args.action == "clean":
database.clean()
else:
duration = 0
for element in database.filter(args):
if args.action == "download":
element.preload()
elif args.action == "list":
print(element)
elif args.action in ("watch", "binge"):
element.watch()
if args.action == "watch":
break
elif args.action == "seen":
if not element.watched:
log.info(f"Maked as seen: {element}")
element.watched = True
elif args.action == "unseen":
if element.watched:
log.info(f"Maked as unseen: {element}")
element.watched = False
else:
raise NotImplementedError(f"Unimplemented action: {args.action}")
duration += element.duration if element.is_video else 0
log.info(f"Total duration: {format_duration(duration)}")
database.attempt_clean()
database.save()
if __name__ == "__main__":
main()