dotfiles/hm/scripts/rssVideos

713 lines
22 KiB
Plaintext
Raw Normal View History

#!/usr/bin/env nix-shell
#! nix-shell -i python3
#! nix-shell -p python3 python3Packages.coloredlogs python3Packages.configargparse python3Packages.filelock python3Packages.requests python3Packages.yt-dlp ffmpeg
# Also needs mpv but if I put it there it's not using the configured one
2019-04-30 08:22:27 +02:00
2019-04-30 08:22:27 +02:00
"""
Script that download videos that are linked as an article
in a RSS feed.
The common use case would be a feed from an RSS aggregator
with the unread items (non-video links are ignored).
"""
2021-12-29 14:40:00 +01:00
import datetime
import functools
import logging
import os
import pickle
2021-12-17 23:16:32 +01:00
import random
2021-12-17 22:13:46 +01:00
import re
2021-12-17 23:16:32 +01:00
import subprocess
2023-11-23 22:59:09 +01:00
import sys
import time
import typing
import coloredlogs
2019-04-30 08:22:27 +02:00
import configargparse
2023-11-23 22:59:09 +01:00
import filelock
import requests
import yt_dlp
log = logging.getLogger(__name__)
2021-12-28 12:35:08 +01:00
def configure_logging(args: configargparse.Namespace) -> None:
    """Install coloredlogs handlers.

    With an explicit --verbosity, everything is logged at that level using
    the default format; otherwise only this script's own logger emits,
    as bare messages.
    """
    if args.verbosity:
        coloredlogs.install(level=args.verbosity)
    else:
        coloredlogs.install(fmt="%(message)s", logger=log)
class SaveInfoPP(yt_dlp.postprocessor.common.PostProcessor):
    """Post-processor that hands the final info dict back to an RVElement.

    yt_dlp.process_ie_result() doesn't return a completely updated info dict,
    notably the extension is still the one before it realizes the files cannot
    be merged. So we use this PostProcessor to catch the info dict in its final
    form and save what we need from it (it's not serializable in this state).
    """

    def __init__(self, rvelement: "RVElement") -> None:
        super().__init__()
        self.rvelement = rvelement

    def run(self, info: dict) -> tuple[list, dict]:
        # Record the definitive file path; delete nothing, change nothing.
        self.rvelement.update_post_download(info)
        return [], info
2021-12-28 12:35:08 +01:00
def parse_duration(string: str) -> int:
    """Parse a duration like "90", "90s", "5m" or "2h" into seconds.

    A trailing unit letter (s/m/h, case-insensitive) selects the multiplier;
    a plain number means seconds.

    Raises ValueError for an unknown unit letter.
    """
    DURATION_MULTIPLIERS = {"s": 1, "m": 60, "h": 3600, "": 1}
    mult_index = string[-1].lower()
    if mult_index.isdigit():
        mult_index = ""
    else:
        string = string[:-1]
    try:
        multiplier = DURATION_MULTIPLIERS[mult_index]
    except KeyError:
        # Bug fix: a failed dict lookup raises KeyError, not IndexError,
        # so the original clause never caught anything.
        raise ValueError(f"Unknown duration multiplier: {mult_index}")
    return int(string) * multiplier


def compare_duration(compstr: str) -> typing.Callable[[int], bool]:
    """Build a predicate from a comparative duration spec.

    The spec is an optional comparator ("<"/"-" less-than, ">"/"+"
    greater-than, "=" equal, none means less-or-equal) followed by a
    duration accepted by parse_duration, e.g. "+5m" or "<300".

    Raises ValueError for an unknown comparator.
    """
    DURATION_COMPARATORS = {
        "<": int.__lt__,
        "-": int.__lt__,
        ">": int.__gt__,
        "+": int.__gt__,
        "=": int.__eq__,
        "": int.__le__,
    }
    comp_index = compstr[0]
    if comp_index.isdigit():
        comp_index = ""
    else:
        compstr = compstr[1:]
    try:
        comparator = DURATION_COMPARATORS[comp_index]
    except KeyError:
        # Bug fix: dict lookups raise KeyError, not IndexError.
        raise ValueError(f"Unknown duration comparator: {comp_index}")
    duration = parse_duration(compstr)
    return lambda d: comparator(d, duration)
2021-12-28 12:35:08 +01:00
def format_duration(duration: int) -> str:
    """Render a duration in seconds as HH:MM:SS (wraps past 24 hours)."""
    parts = time.gmtime(duration)
    return f"{parts.tm_hour:02d}:{parts.tm_min:02d}:{parts.tm_sec:02d}"
class RVElement:
    """One article of the RSS feed and the video (maybe) linked from it.

    All state is persisted in the parent database's METADATA_FOLDER as
    pickle files named after the short item id: the raw feed item
    (``.item``), the yt-dlp research result (``.ytdl``) and the path of
    the downloaded file (``.path``).
    """

    parent: "RVDatabase"
    # Raw Google-Reader-API item dict as fetched from the aggregator
    item: dict

    # Cached yt-dlp research older than this is discarded before a download
    RERESEARCH_AFTER = datetime.timedelta(hours=1)

    def __init__(self, parent: "RVDatabase", item: dict) -> None:
        self.parent = parent
        self.item = item

    @property
    def id(self) -> str:
        """Full item identifier as given by the aggregator."""
        return self.item["id"]

    @property
    def sid(self) -> str:
        """Short id (last path component), used as metadata file basename."""
        return self.id.split("/")[-1]

    def metafile(self, extension: str) -> str:
        """Path of this element's metadata file with the given extension."""
        return os.path.join(self.parent.METADATA_FOLDER, f"{self.sid}.{extension}")

    def metafile_read(self, extension: str) -> typing.Any:
        """Unpickle this element's metadata file with the given extension."""
        return self.parent.metafile_read(f"{self.sid}.{extension}")

    def metafile_write(self, extension: str, data: typing.Any) -> None:
        """Pickle data into this element's metadata file (honours dry-run)."""
        return self.parent.metafile_write(f"{self.sid}.{extension}", data)

    def save(self) -> None:
        """Persist the raw feed item so it can be reloaded from cache."""
        self.metafile_write("item", self.item)

    @property
    def title(self) -> str:
        return self.item["title"]

    @property
    def link(self) -> str:
        return self.item["canonical"][0]["href"]

    @property
    def creator(self) -> str:
        return self.item["origin"]["title"]

    @property
    def date(self) -> datetime.datetime:
        # Most precise source first: µs timestamp, then crawl time in ms,
        # then the bare "published" field (seconds)
        timestamp = (
            int(self.item.get("timestampUsec", "0")) / 1000000
            or int(self.item.get("crawlTimeMsec", "0")) / 1000
            or self.item["published"]
        )
        return datetime.datetime.fromtimestamp(timestamp)

    @property
    def is_researched(self) -> bool:
        """Whether yt-dlp info has already been fetched and cached on disk."""
        metafile = self.metafile("ytdl")
        return os.path.isfile(metafile)

    def __str__(self) -> str:
        # Fix: local renamed from `str`, which shadowed the builtin
        text = f"{self.date.strftime('%y-%m-%d %H:%M')} ("
        if self.is_researched:
            if self.is_video:
                text += format_duration(self.duration)
            else:
                text += "--:--:--"
        else:
            text += "??:??:??"
        text += (
            f") {self.creator if self.creator else '?'} "
            f" {self.title} "
            f" {self.link}"
        )
        return text

    @property
    def downloaded(self) -> bool:
        """Whether the video file is currently present on disk."""
        if not self.is_researched:
            return False
        return os.path.isfile(self.filepath)

    @functools.cached_property
    def ytdl_infos(self) -> typing.Optional[dict]:
        """yt-dlp information for the linked page, None if research failed.

        Read from the on-disk cache when possible; a missing or unreadable
        cache entry triggers a fresh research which is then cached.
        """
        try:
            return self.metafile_read("ytdl")
        except (FileNotFoundError, TypeError, AttributeError, EOFError):
            infos = self._ytdl_infos()
            self.metafile_write("ytdl", infos)
            return infos

    def _ytdl_infos(self) -> typing.Optional[dict]:
        """Actually research the link with yt-dlp (no download)."""
        log.info(f"Researching: {self}")
        try:
            infos = self.parent.ytdl_dry.extract_info(self.link, download=False)
        except KeyboardInterrupt:
            # Fix: bare raise preserves the original traceback
            raise
        except yt_dlp.utils.DownloadError as e:
            # TODO Still raise in case of temporary network issue
            log.warning(e)
            infos = None
        if infos:
            # Make the info dict picklable
            infos = self.parent.ytdl_dry.sanitize_info(infos)
        return infos

    @property
    def duration(self) -> int:
        """Video duration in seconds; only valid for researched videos."""
        assert self.is_video
        assert self.ytdl_infos
        return int(self.ytdl_infos["duration"])

    @property
    def is_video(self) -> bool:
        # Duration might be missing in playlists and stuff
        return self.ytdl_infos is not None and "duration" in self.ytdl_infos

    @functools.cached_property
    def downloaded_filepath(self) -> typing.Optional[str]:
        """Recorded path of the downloaded file, None if never downloaded."""
        try:
            return self.metafile_read("path")
        except FileNotFoundError:
            return None

    @property
    def was_downloaded(self) -> bool:
        """Whether a download completed at some point (file may be gone)."""
        metafile = self.metafile("path")
        return os.path.exists(metafile)

    @property
    def filepath(self) -> str:
        """Recorded path of the video file, or the one yt-dlp would use."""
        assert self.is_video
        if self.downloaded_filepath:
            return self.downloaded_filepath
        return self.parent.ytdl_dry.prepare_filename(self.ytdl_infos)

    @property
    def basename(self) -> str:
        """Video file path without its extension."""
        assert self.is_video
        return os.path.splitext(self.filepath)[0]

    def expire_info(self) -> None:
        """Drop cached yt-dlp info if it is older than RERESEARCH_AFTER."""
        metafile = self.metafile("ytdl")
        if os.path.isfile(metafile):
            stat = os.stat(metafile)
            mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
            diff = datetime.datetime.now() - mtime
            if diff > self.RERESEARCH_AFTER:
                os.unlink(metafile)
                # Also invalidate the in-memory cached_property
                del self.ytdl_infos

    def download(self) -> None:
        """Download the video (idempotent, lock-protected, honours dry-run)."""
        assert self.is_video
        if self.downloaded:
            return
        self.expire_info()
        log.info(f"Downloading: {self}")
        lockfile = self.metafile("lock")
        with filelock.FileLock(lockfile):
            if not self.parent.args.dryrun:
                with yt_dlp.YoutubeDL(self.parent.ytdl_opts) as ydl:
                    # SaveInfoPP records the final file path for us
                    ydl.add_post_processor(SaveInfoPP(self))
                    ydl.process_ie_result(self.ytdl_infos, download=True)

    def update_post_download(self, info: dict) -> None:
        """Record the final file path (called by SaveInfoPP after download)."""
        # Overrides the cached_property value on the instance
        self.downloaded_filepath = self.parent.ytdl_dry.prepare_filename(info)
        assert self.downloaded_filepath
        assert self.downloaded_filepath.startswith(self.basename)
        self.metafile_write("path", self.downloaded_filepath)

    @property
    def watched(self) -> bool:
        """Downloaded once but the file is now gone: considered watched."""
        if not self.is_researched:
            return False
        return self.was_downloaded and not self.downloaded

    def matches_filter(self, args: configargparse.Namespace) -> bool:
        """Whether this element passes the user-requested filters.

        Metadata-only filters run first; filters needing yt-dlp research
        (is_video, duration) come last since they may trigger a fetch.
        """
        # Inexpensive filters
        if args.seen != "any" and (args.seen == "seen") != self.watched:
            log.debug(f"Not {args.seen}: {self}")
            return False
        if args.title and not re.search(args.title, self.title):
            log.debug(f"Title not matching {args.title}: {self}")
            return False
        if args.link and not re.search(args.link, self.link):
            log.debug(f"Link not matching {args.link}: {self}")
            return False
        if args.creator and (
            not self.creator or not re.search(args.creator, self.creator)
        ):
            log.debug(f"Creator not matching {args.creator}: {self}")
            return False
        # Expensive filters
        if not self.is_video:
            log.debug(f"Not a video: {self}")
            return False
        if args.duration and not compare_duration(args.duration)(self.duration):
            log.debug(f"Duration {self.duration} not matching {args.duration}: {self}")
            return False
        return True

    def watch(self) -> None:
        """Download, play with mpv, then delete the file and mark read."""
        self.download()

        cmd = ["mpv", self.filepath]
        log.debug(f"Running {cmd}")
        if not self.parent.args.dryrun:
            proc = subprocess.run(cmd)
            proc.check_returncode()

        self.undownload()
        self.try_mark_read()

    def clean_file(self, folder: str, basename: str) -> None:
        """Remove every file in folder whose name starts with basename."""
        for file in os.listdir(folder):
            if file.startswith(basename):
                path = os.path.join(folder, file)
                log.debug(f"Removing file: {path}")
                if not self.parent.args.dryrun:
                    os.unlink(path)

    def undownload(self) -> None:
        """Delete the video file(s) from the current directory."""
        assert self.is_video
        log.info(f"Removing gone video: {self.basename}*")
        self.clean_file(".", self.basename)

    def clean(self) -> None:
        """Delete the video (if any) and all metadata of this element."""
        if self.is_researched and self.is_video:
            self.undownload()
        log.info(f"Removing gone metadata: {self.sid}*")
        self.clean_file(self.parent.METADATA_FOLDER, self.sid)

    def mark_read(self) -> None:
        """Tell the aggregator this item is read, then clean local files.

        Raises RuntimeError if the aggregator does not acknowledge.
        """
        log.debug(f"Marking {self} read")
        if self.parent.args.dryrun:
            return
        r = requests.post(
            f"{self.parent.args.url}/reader/api/0/edit-tag",
            data={
                "i": self.id,
                "a": "user/-/state/com.google/read",
                "ac": "edit",
                "token": self.parent.feed_token,
            },
            headers=self.parent.auth_headers,
        )
        r.raise_for_status()
        if r.text.strip() != "OK":
            raise RuntimeError(f"Couldn't mark {self} as read: {r.text}")
        log.info(f"Marked {self} as read")
        self.clean()

    def try_mark_read(self) -> None:
        """mark_read, but tolerate the aggregator being unreachable."""
        try:
            self.mark_read()
        except requests.ConnectionError:
            log.warning(f"Couldn't mark {self} as read")
class RVDatabase:
    """Set of feed elements, synced with the aggregator and cached on disk.

    The cache lives in METADATA_FOLDER inside the videos directory; files
    whose name starts with a dot (auth headers, …) survive cleaning.
    """

    METADATA_FOLDER = ".metadata"

    args: configargparse.Namespace
    elements: list[RVElement]

    def __init__(self, args: configargparse.Namespace) -> None:
        self.args = args

    def metafile_read(self, name: str) -> typing.Any:
        """Unpickle the named metadata file."""
        path = os.path.join(self.METADATA_FOLDER, name)
        log.debug(f"Reading {path}")
        with open(path, "rb") as mf:
            return pickle.load(mf)

    def metafile_write(self, name: str, data: typing.Any) -> None:
        """Pickle data into the named metadata file (skipped in dry-run)."""
        path = os.path.join(self.METADATA_FOLDER, name)
        log.debug(f"Writing {path}")
        if not self.args.dryrun:
            with open(path, "wb") as mf:
                pickle.dump(data, mf)

    def clean_cache(self, cache: "RVDatabase") -> None:
        """Clean on-disk data of cached elements gone from this (fresh) feed."""
        log.debug("Cleaning cache")
        fresh_ids = {el.id for el in self.elements}
        for el in cache.elements:
            if el.id not in fresh_ids:
                el.clean()

    def _auth_headers(self) -> dict[str, str]:
        """Log in to the aggregator and build the GoogleLogin auth header."""
        r = requests.get(
            f"{self.args.url}/accounts/ClientLogin",
            params={"Email": self.args.email, "Passwd": self.args.passwd},
        )
        r.raise_for_status()
        for line in r.text.split("\n"):
            if line.lower().startswith("auth="):
                # The value may itself contain '=' signs
                val = "=".join(line.split("=")[1:])
                return {"Authorization": f"GoogleLogin auth={val}"}
        raise RuntimeError("Couldn't find auth= key")

    @functools.cached_property
    def auth_headers(self) -> dict[str, str]:
        """Auth headers, cached on disk (dotfile: survives clean())."""
        try:
            return self.metafile_read(".auth_headers")
        except FileNotFoundError:
            headers = self._auth_headers()
            self.metafile_write(".auth_headers", headers)
            return headers

    def fetch_feed_elements(self) -> typing.Generator[dict, None, None]:
        """Yield all unread items from the aggregator, following pagination."""
        log.info("Fetching RSS feed")
        continuation: typing.Optional[str] = None
        with requests.Session() as s:

            def next_page() -> typing.Generator[dict, None, None]:
                nonlocal continuation
                r = s.get(
                    f"{self.args.url}/reader/api/0/stream/contents",
                    params={
                        "xt": "user/-/state/com.google/read",
                        "c": continuation,
                    },
                    headers=self.auth_headers,
                )
                r.raise_for_status()
                json = r.json()
                yield from json["items"]
                continuation = json.get("continuation")

            yield from next_page()
            while continuation:
                yield from next_page()

    def fetch_cache_elements(self) -> typing.Generator[dict, None, None]:
        """Yield all raw items previously saved in the metadata folder."""
        log.info("Fetching from cache")
        for file in os.listdir(self.METADATA_FOLDER):
            if not file.endswith(".item"):
                continue
            yield self.metafile_read(file)

    def build_list(self, items: typing.Iterable[dict], save: bool = False) -> None:
        """Build self.elements from raw items (insertion order reversed)."""
        self.elements = []
        for item in items:
            element = RVElement(self, item)
            self.elements.insert(0, element)
            log.debug(f"Known: {element}")
            if save:
                element.save()

    def read_feed(self) -> None:
        """Populate elements from the aggregator, saving items to cache."""
        self.build_list(self.fetch_feed_elements(), save=True)

    def read_cache(self) -> None:
        """Populate elements from the on-disk cache."""
        self.build_list(self.fetch_cache_elements())

    def clean_folder(self, folder: str, basenames: set[str]) -> None:
        """Remove files in folder matching no known basename (dotfiles kept)."""
        for file in os.listdir(folder):
            path = os.path.join(folder, file)
            if not os.path.isfile(path) or file[0] == ".":
                continue
            for basename in basenames:
                if file.startswith(basename):
                    break
            else:
                log.info(f"Removing unknown file: {path}")
                if not self.args.dryrun:
                    os.unlink(path)

    def clean(self) -> None:
        """Remove stray video and metadata files not owned by any element."""
        log.debug("Cleaning")
        filenames = {el.basename for el in self.elements if el.is_video}
        self.clean_folder(".", filenames)
        ids = {el.sid for el in self.elements}
        self.clean_folder(self.METADATA_FOLDER, ids)

    @property
    def ytdl_opts(self) -> dict:
        """yt-dlp options from user/system config, ignoring our own argv."""
        prev_argv = sys.argv
        sys.argv = ["yt-dlp"]
        _, _, _, ydl_opts = yt_dlp.parse_options()
        sys.argv = prev_argv
        return ydl_opts

    @property
    def ytdl_dry_opts(self) -> dict:
        """Same as ytdl_opts but quiet, for research-only operations."""
        opts = self.ytdl_opts.copy()
        opts.update({"quiet": True})
        return opts

    @property
    def ytdl_dry(self) -> yt_dlp.YoutubeDL:
        """A quiet YoutubeDL instance for research-only operations."""
        return yt_dlp.YoutubeDL(self.ytdl_dry_opts)

    def filter(self, args: configargparse.Namespace) -> typing.Iterable[RVElement]:
        """Return elements matching args, ordered per args.order.

        Cheap sorts run before filtering; duration-based sorts
        ("short"/"long") run after, since they need yt-dlp research.
        """
        elements_src = self.elements.copy()
        # Bug fix: default to the copied list so the duration-based orders
        # ("short"/"long"), which sort only after filtering, no longer hit
        # an unbound `elements` at the filtering step below.
        elements: typing.Iterable[RVElement] = elements_src
        # Inexpensive sort
        if args.order == "new":
            elements = sorted(elements_src, key=lambda el: el.date, reverse=True)
        elif args.order == "old":
            elements = sorted(elements_src, key=lambda el: el.date)
        elif args.order == "title":
            elements = sorted(elements_src, key=lambda el: el.title)
        elif args.order == "creator":
            elements = sorted(elements_src, key=lambda el: el.creator or "")
        elif args.order == "link":
            elements = sorted(elements_src, key=lambda el: el.link)
        elif args.order == "random":
            random.shuffle(elements_src)
        # Possibly expensive filtering
        elements = (el for el in elements if el.matches_filter(args))
        # Expensive sort
        if args.order == "short":
            elements = sorted(
                elements, key=lambda el: el.duration if el.is_video else 0
            )
        elif args.order == "long":
            elements = sorted(
                elements, key=lambda el: el.duration if el.is_video else 0, reverse=True
            )
        # Post sorting filtering: greedy first-fit under the duration budget
        if args.total_duration:
            rem = parse_duration(args.total_duration)
            old_els = list(elements)
            elements = list()
            while rem > 0:
                for el in old_els:
                    if el.duration < rem:
                        elements.append(el)
                        rem -= el.duration
                        old_els.remove(el)
                        break
                else:
                    break
        return elements

    @functools.cached_property
    def feed_token(self) -> str:
        """Edit token required by the aggregator for write operations."""
        r = requests.get(
            f"{self.args.url}/reader/api/0/token",
            headers=self.auth_headers,
        )
        r.raise_for_status()
        return r.text.strip()

    def try_mark_watched_read(self) -> None:
        """Mark every watched element as read (best effort)."""
        for element in self.elements:
            if element.watched:
                element.try_mark_read()
2019-04-30 08:22:27 +02:00
2020-12-27 14:20:44 +01:00
def get_args() -> configargparse.Namespace:
    """Parse options from the CLI, the config file and environment variables."""
    default_config_path = os.path.join(
        os.path.expanduser(os.getenv("XDG_CONFIG_PATH", "~/.config/")), "rssVideos"
    )
    parser = configargparse.ArgParser(
        description="Download videos in unread articles from a feed aggregator",
        default_config_files=[default_config_path],
    )

    # Runtime settings
    parser.add_argument(
        "-v",
        "--verbosity",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        default=None,
        help="Verbosity of log messages",
    )
    parser.add(
        "-c", "--config", required=False, is_config_file=True, help="Configuration file"
    )
    parser.add(
        "-n",
        "--dryrun",
        help="Only pretend to do actions",
        action="store_const",
        const=True,
        default=False,
    )

    # Input/Output
    parser.add(
        "--url",
        help="URL of the Google Reader API of the aggregator",
        env_var="RSS_VIDEOS_URL",
        required=True,
    )
    parser.add(
        "--email",
        help="E-mail / user to connect to the aggregator",
        env_var="RSS_VIDEOS_EMAIL",
        required=True,
    )
    parser.add(
        "--passwd",
        help="Password to connect to the aggregator",
        env_var="RSS_VIDEOS_PASSWD",
        required=True,
    )
    parser.add("--no-refresh", dest="refresh", help="Don't fetch feed", action="store_false")
    parser.add(
        "--videos",
        help="Directory to store videos",
        env_var="RSS_VIDEOS_VIDEO_DIR",
        required=True,
    )

    # Which videos to consider
    parser.add(
        "--order",
        choices=("old", "new", "title", "creator", "link", "short", "long", "random"),
        default="old",
        help="Sorting mechanism",
    )
    parser.add("--creator", help="Regex to filter by creator")
    parser.add("--title", help="Regex to filter by title")
    parser.add("--link", help="Regex to filter by link")
    parser.add("--duration", help="Comparative to filter by duration")
    # TODO Date selector
    parser.add(
        "--seen",
        choices=("seen", "unseen", "any"),
        default="unseen",
        help="Only include seen/unseen/any videos",
    )
    parser.add("--total-duration", help="Use videos that fit under the total given")
    # TODO Envrionment variables
    # TODO Allow to ask
    parser.add(
        "action",
        nargs="?",
        choices=("download", "list", "watch", "binge"),
        default="download",
    )

    args = parser.parse_args()
    # Normalize the videos directory to an absolute, symlink-free path
    args.videos = os.path.realpath(os.path.expanduser(args.videos))
    return args
2019-04-30 08:22:27 +02:00
2021-12-28 21:39:10 +01:00
def get_database(args: configargparse.Namespace) -> RVDatabase:
    """Load elements from the on-disk cache, refreshed from the feed unless
    --no-refresh was given."""
    cached = RVDatabase(args)
    cached.read_cache()
    if args.refresh:
        refreshed = RVDatabase(args)
        refreshed.read_feed()
        # Drop on-disk leftovers of items that vanished from the feed
        refreshed.clean_cache(cached)
        return refreshed
    return cached
2021-12-28 21:39:10 +01:00
def main() -> None:
    """Entry point: prepare directories, load the database, run the action."""
    args = get_args()
    configure_logging(args)

    metadata_dir = os.path.join(args.videos, RVDatabase.METADATA_FOLDER)
    # Fix: loop variable renamed from `dir`, which shadowed the builtin
    for directory in (args.videos, metadata_dir):
        os.makedirs(directory, exist_ok=True)
    os.chdir(args.videos)

    database = get_database(args)

    log.debug("Running action")
    duration = 0
    for element in database.filter(args):
        duration += element.duration if element.is_video else 0
        if args.action == "download":
            element.download()
        elif args.action == "list":
            print(element)
        elif args.action in ("watch", "binge"):
            element.watch()
            # "watch" plays a single video, "binge" keeps going
            if args.action == "watch":
                break
        else:
            raise NotImplementedError(f"Unimplemented action: {args.action}")
    log.info(f"Total duration: {format_duration(duration)}")

    database.try_mark_watched_read()
    database.clean()
2021-12-17 23:16:32 +01:00
2020-12-27 14:20:44 +01:00
# Script entry point
if __name__ == "__main__":
    main()