2019-04-30 08:22:27 +02:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
2021-12-10 22:59:39 +01:00
|
|
|
|
|
2019-04-30 08:22:27 +02:00
|
|
|
|
"""
|
|
|
|
|
Script that downloads videos that are linked as articles
|
|
|
|
|
in an RSS feed.
|
|
|
|
|
The common use case would be a feed from an RSS aggregator
|
|
|
|
|
with the unread items (non-video links are ignored).
|
|
|
|
|
"""
|
|
|
|
|
|
2021-12-29 14:40:00 +01:00
|
|
|
|
import datetime
|
2022-03-23 18:54:05 +01:00
|
|
|
|
import filelock
|
2021-12-10 22:59:39 +01:00
|
|
|
|
import functools
|
|
|
|
|
import logging
|
|
|
|
|
import os
|
|
|
|
|
import pickle
|
2021-12-17 23:16:32 +01:00
|
|
|
|
import random
|
2021-12-28 12:35:08 +01:00
|
|
|
|
import requests
|
2021-12-17 22:13:46 +01:00
|
|
|
|
import re
|
2021-12-17 23:16:32 +01:00
|
|
|
|
import subprocess
|
2021-12-19 11:45:41 +01:00
|
|
|
|
import time
|
2021-12-10 22:59:39 +01:00
|
|
|
|
import typing
|
2022-04-10 09:58:06 +02:00
|
|
|
|
import sys
|
2021-12-10 22:59:39 +01:00
|
|
|
|
|
|
|
|
|
import coloredlogs
|
2019-04-30 08:22:27 +02:00
|
|
|
|
import configargparse
|
2021-12-19 15:10:16 +01:00
|
|
|
|
import yt_dlp
|
2021-12-10 22:59:39 +01:00
|
|
|
|
|
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
2021-12-28 12:35:08 +01:00
|
|
|
|
|
2021-12-10 22:59:39 +01:00
|
|
|
|
def configure_logging(args: configargparse.Namespace) -> None:
    """
    Install the coloredlogs handler according to the requested verbosity.

    Without an explicit verbosity, only this module's logger is configured
    and messages are printed bare. With one, coloredlogs is installed with
    its default format at the requested level.
    """
    if not args.verbosity:
        # No verbosity asked: bare messages, limited to our own logger
        coloredlogs.install(fmt="%(message)s", logger=log)
        return

    coloredlogs.install(level=args.verbosity)
|
|
|
|
|
|
2021-12-19 15:10:16 +01:00
|
|
|
|
|
|
|
|
|
class SaveInfoPP(yt_dlp.postprocessor.common.PostProcessor):
    """
    Post-processor reporting the final info dict to the element being downloaded.

    yt_dlp.process_ie_result() doesn't return a completely updated info dict:
    notably, the extension is still the one from before it realizes the files
    cannot be merged. Running as a post-processor lets us see the info dict in
    its final form and keep what we need from it (it's not serializable in
    this state).
    """

    def __init__(self, rvelement: "RVElement") -> None:
        super().__init__()
        self.rvelement = rvelement

    def run(self, info: dict) -> tuple[list, dict]:
        """
        Forward the info dict to the element, and pass it on unchanged.

        NOTE(review): the empty list is presumably the "files to delete" part
        of yt-dlp's post-processor contract — confirm against yt-dlp.
        """
        element = self.rvelement
        element.update_post_download(info)
        return [], info
|
|
|
|
|
|
2021-12-28 12:35:08 +01:00
|
|
|
|
|
2021-12-19 23:13:41 +01:00
|
|
|
|
def parse_duration(string: str) -> int:
    """
    Convert a textual duration to a number of seconds.

    The string is an integer optionally followed by a one-letter unit
    (case-insensitive): ``s`` for seconds, ``m`` for minutes, ``h`` for hours.
    Without a unit, the number is taken as seconds.

    Raises ValueError if the string is empty, if the unit is unknown or if
    the numeric part is not an integer.
    """
    DURATION_MULTIPLIERS = {"s": 1, "m": 60, "h": 3600, "": 1}

    if not string:
        raise ValueError("Empty duration")

    mult_index = string[-1].lower()
    if mult_index.isdigit():
        # No unit given: the whole string is the number
        mult_index = ""
    else:
        string = string[:-1]
    try:
        multiplier = DURATION_MULTIPLIERS[mult_index]
    except KeyError:
        # A dict lookup fails with KeyError, not IndexError
        raise ValueError(f"Unknown duration multiplier: {mult_index}") from None

    return int(string) * multiplier
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def compare_duration(compstr: str) -> typing.Callable[[int], bool]:
    """
    Build a predicate on durations from a comparison string.

    The string is a duration (see parse_duration) optionally preceded by a
    one-character comparator: ``<`` or ``-`` for strictly shorter, ``>`` or
    ``+`` for strictly longer, ``=`` for equal. Without a comparator, the
    predicate accepts durations shorter than or equal to the given one.

    Returns a function taking a duration in seconds and telling whether it
    satisfies the comparison.

    Raises ValueError if the string is empty, if the comparator is unknown or
    if the duration cannot be parsed.
    """
    # The operator functions behave like the int dunders on ints, but also
    # accept the float durations some sources report.
    import operator

    DURATION_COMPARATORS = {
        "<": operator.lt,
        "-": operator.lt,
        ">": operator.gt,
        "+": operator.gt,
        "=": operator.eq,
        "": operator.le,
    }

    if not compstr:
        raise ValueError("Empty duration comparison")

    comp_index = compstr[0]
    if comp_index.isdigit():
        # No comparator given: the whole string is the duration
        comp_index = ""
    else:
        compstr = compstr[1:]
    try:
        comparator = DURATION_COMPARATORS[comp_index]
    except KeyError:
        # A dict lookup fails with KeyError, not IndexError
        raise ValueError(f"Unknown duration comparator: {comp_index}") from None

    duration = parse_duration(compstr)

    return lambda d: comparator(d, duration)
|
2021-12-19 15:10:16 +01:00
|
|
|
|
|
2021-12-28 12:35:08 +01:00
|
|
|
|
|
2021-12-19 15:10:16 +01:00
|
|
|
|
def format_duration(duration: int) -> str:
    """
    Format a number of seconds as ``HH:MM:SS``.

    Hours are not wrapped at 24, so totals longer than a day stay correct
    (e.g. 90000 seconds gives ``25:00:00``). Fractional seconds are dropped.
    """
    minutes, seconds = divmod(int(duration), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
|
|
|
|
|
|
2021-12-12 13:40:24 +01:00
|
|
|
|
|
2021-12-10 22:59:39 +01:00
|
|
|
|
class RVElement:
    """
    An article of the feed, which may or may not link to a video.

    Wraps the raw item dict of the article and gives access to the metadata
    kept for it in the parent's metadata folder, in files named
    ``<sid>.<extension>``: the item itself (``item``), the yt-dlp research
    result (``ytdl``), the path of the downloaded file (``path``) and the
    download lock (``lock``).
    """

    parent: "RVDatabase"
    item: dict

    # Age after which the cached yt-dlp information is dropped before a download
    RERESEARCH_AFTER = datetime.timedelta(hours=1)

    def __init__(self, parent: "RVDatabase", item: dict) -> None:
        self.parent = parent
        self.item = item

    @property
    def id(self) -> str:
        """Identifier of the item, as found in the item dict."""
        return self.item["id"]

    @property
    def sid(self) -> str:
        """Short identifier: the part of the id after its last ``/``."""
        return self.id.split("/")[-1]

    def metafile(self, extension: str) -> str:
        """Return the path of this element's metadata file for the extension."""
        return os.path.join(self.parent.METADATA_FOLDER, f"{self.sid}.{extension}")

    def metafile_read(self, extension: str) -> typing.Any:
        """Load the content of this element's metadata file for the extension."""
        return self.parent.metafile_read(f"{self.sid}.{extension}")

    def metafile_write(self, extension: str, data: typing.Any) -> None:
        """Store data in this element's metadata file for the extension."""
        return self.parent.metafile_write(f"{self.sid}.{extension}", data)

    def save(self) -> None:
        """Store the item dict in the metadata folder."""
        self.metafile_write("item", self.item)

    @property
    def title(self) -> str:
        return self.item["title"]

    @property
    def link(self) -> str:
        return self.item["canonical"][0]["href"]

    @property
    def creator(self) -> str:
        return self.item["origin"]["title"]

    @property
    def date(self) -> datetime.datetime:
        """
        Date of the item, as a naive local datetime.

        Uses the first non-zero value among ``timestampUsec``,
        ``crawlTimeMsec`` and ``published``.
        """
        timestamp = (
            int(self.item.get("timestampUsec", "0")) / 1000000
            or int(self.item.get("crawlTimeMsec", "0")) / 1000
            or self.item["published"]
        )
        return datetime.datetime.fromtimestamp(timestamp)

    @property
    def is_researched(self) -> bool:
        """Whether a yt-dlp research result is stored for this element."""
        metafile = self.metafile("ytdl")
        return os.path.isfile(metafile)

    def __str__(self) -> str:
        # Reading the duration of an element that isn't researched yet would
        # trigger the research, so it is only shown when already known.
        if not self.is_researched:
            duration = "??:??:??"
        elif self.is_video:
            duration = format_duration(self.duration)
        else:
            duration = "--:--:--"
        return (
            f"{self.date.strftime('%y-%m-%d %H:%M')} ({duration}) "
            f"{self.creator or '?'} "
            f"– {self.title} "
            f"– {self.link}"
        )

    @property
    def downloaded(self) -> bool:
        """Whether the video file is currently present on disk."""
        if not self.is_researched:
            return False
        return os.path.isfile(self.filepath)

    @functools.cached_property
    def ytdl_infos(self) -> typing.Optional[dict]:
        """
        Information yt-dlp has about the link, None if it found nothing.

        Read from the metadata folder if possible, otherwise researched and
        stored there.
        """
        try:
            return self.metafile_read("ytdl")
        except (FileNotFoundError, TypeError, AttributeError, EOFError):
            infos = self._ytdl_infos()
            self.metafile_write("ytdl", infos)
            return infos

    def _ytdl_infos(self) -> typing.Optional[dict]:
        """Ask yt-dlp about the link, without downloading anything."""
        log.info(f"Researching: {self}")
        try:
            infos = self.parent.ytdl_dry.extract_info(self.link, download=False)
        except yt_dlp.utils.DownloadError as e:
            # Other exceptions, KeyboardInterrupt included, propagate as is.
            # TODO Still raise in case of temporary network issue
            log.warning(e)
            return None
        if infos:
            infos = self.parent.ytdl_dry.sanitize_info(infos)
        return infos

    @property
    def duration(self) -> int:
        assert self.is_video
        assert self.ytdl_infos
        return self.ytdl_infos["duration"]

    @property
    def is_video(self) -> bool:
        # Duration might be missing in playlists and stuff
        return self.ytdl_infos is not None and "duration" in self.ytdl_infos

    @functools.cached_property
    def downloaded_filepath(self) -> typing.Optional[str]:
        """Path stored after a download, None if there was none."""
        try:
            return self.metafile_read("path")
        except FileNotFoundError:
            return None

    @property
    def was_downloaded(self) -> bool:
        """Whether a download path is stored for this element."""
        metafile = self.metafile("path")
        return os.path.exists(metafile)

    @property
    def filepath(self) -> str:
        """Path of the video file: the stored one, or else the one yt-dlp plans."""
        assert self.is_video
        if self.downloaded_filepath:
            return self.downloaded_filepath
        return self.parent.ytdl_dry.prepare_filename(self.ytdl_infos)

    @property
    def basename(self) -> str:
        """Path of the video file, without its extension."""
        assert self.is_video
        return os.path.splitext(self.filepath)[0]

    def expire_info(self) -> None:
        """
        Forget the yt-dlp information if it is older than RERESEARCH_AFTER.

        The stored file is removed (unless in dry-run mode) and the value
        cached in memory is dropped, so the next access to ytdl_infos reads
        or researches it again.
        """
        metafile = self.metafile("ytdl")
        if not os.path.isfile(metafile):
            return
        mtime = datetime.datetime.fromtimestamp(os.stat(metafile).st_mtime)
        if datetime.datetime.now() - mtime <= self.RERESEARCH_AFTER:
            return
        if not self.parent.args.dryrun:
            os.unlink(metafile)
        # The cached property may never have been computed, in which case a
        # plain `del self.ytdl_infos` would raise AttributeError.
        self.__dict__.pop("ytdl_infos", None)

    def download(self) -> None:
        """Download the video, unless it is already there."""
        assert self.is_video
        if self.downloaded:
            return
        self.expire_info()
        log.info(f"Downloading: {self}")
        lockfile = self.metafile("lock")
        with filelock.FileLock(lockfile):
            if not self.parent.args.dryrun:
                with yt_dlp.YoutubeDL(self.parent.ytdl_opts) as ydl:
                    # SaveInfoPP calls update_post_download() with the final info
                    ydl.add_post_processor(SaveInfoPP(self))
                    ydl.process_ie_result(self.ytdl_infos, download=True)

    def update_post_download(self, info: dict) -> None:
        """Store the path of the file that was just downloaded."""
        self.downloaded_filepath = self.parent.ytdl_dry.prepare_filename(info)
        assert self.downloaded_filepath
        assert self.downloaded_filepath.startswith(self.basename)
        self.metafile_write("path", self.downloaded_filepath)

    @property
    def watched(self) -> bool:
        """Whether the video was downloaded at some point and is now gone."""
        if not self.is_researched:
            return False
        return self.was_downloaded and not self.downloaded

    def matches_filter(self, args: "configargparse.Namespace") -> bool:
        """
        Tell whether the element passes the filters given in the arguments.

        Filters on the seen state, title, link and creator are checked first.
        Being a video and the duration are checked last, as they may require
        researching the element.
        """
        # Inexpensive filters
        if args.seen != "any" and (args.seen == "seen") != self.watched:
            log.debug(f"Not {args.seen}: {self}")
            return False
        if args.title and not re.search(args.title, self.title):
            log.debug(f"Title not matching {args.title}: {self}")
            return False
        if args.link and not re.search(args.link, self.link):
            log.debug(f"Link not matching {args.link}: {self}")
            return False
        if args.creator and (
            not self.creator or not re.search(args.creator, self.creator)
        ):
            log.debug(f"Creator not matching {args.creator}: {self}")
            return False

        # Expensive filters
        if not self.is_video:
            log.debug(f"Not a video: {self}")
            return False
        if args.duration and not compare_duration(args.duration)(self.duration):
            log.debug(f"Duration {self.duration} not matching {args.duration}: {self}")
            return False

        return True

    def watch(self) -> None:
        """Download the video if needed, play it with mpv, then remove it."""
        self.download()

        cmd = ["mpv", self.filepath]
        log.debug(f"Running {cmd}")
        if not self.parent.args.dryrun:
            # Raises CalledProcessError if the player exits with an error
            subprocess.run(cmd, check=True)

        self.undownload()
        self.try_mark_read()

    def clean_file(self, folder: str, basename: str) -> None:
        """Remove the files of the folder whose name starts with basename."""
        for file in os.listdir(folder):
            if not file.startswith(basename):
                continue
            path = os.path.join(folder, file)
            log.debug(f"Removing file: {path}")
            if not self.parent.args.dryrun:
                os.unlink(path)

    def undownload(self) -> None:
        """Remove the video and the other files sharing its basename."""
        assert self.is_video
        log.info(f"Removing gone video: {self.basename}*")
        self.clean_file(".", self.basename)

    def clean(self) -> None:
        """Remove the video files, if any, and the metadata of this element."""
        if self.is_researched and self.is_video:
            self.undownload()
        log.info(f"Removing gone metadata: {self.sid}*")
        self.clean_file(self.parent.METADATA_FOLDER, self.sid)

    def mark_read(self) -> None:
        """
        Mark the item as read on the aggregator, then clean it locally.

        Raises the requests exceptions on network or HTTP errors, and
        RuntimeError if the aggregator doesn't answer ``OK``.
        """
        log.debug(f"Marking {self} read")
        if self.parent.args.dryrun:
            return
        r = requests.post(
            f"{self.parent.args.url}/reader/api/0/edit-tag",
            data={
                "i": self.id,
                "a": "user/-/state/com.google/read",
                "ac": "edit",
                "token": self.parent.feed_token,
            },
            headers=self.parent.auth_headers,
        )
        r.raise_for_status()
        if r.text.strip() != "OK":
            raise RuntimeError(f"Couldn't mark {self} as read: {r.text}")
        log.info(f"Marked {self} as read")
        self.clean()

    def try_mark_read(self) -> None:
        """Mark the item as read, only warning if the connection fails."""
        try:
            self.mark_read()
        except requests.ConnectionError:
            log.warning(f"Couldn't mark {self} as read")
|
|
|
|
|
|
2021-12-10 22:59:39 +01:00
|
|
|
|
|
|
|
|
|
class RVDatabase:
    """
    The list of feed elements, and the services they share.

    Elements come either from the aggregator (read_feed) or from the items
    stored in the metadata folder (read_cache). The database also gives
    access to the metadata files, the aggregator credentials and the yt-dlp
    options. Paths are relative: the metadata folder is looked up in the
    current directory.
    """

    METADATA_FOLDER = ".metadata"

    args: "configargparse.Namespace"
    elements: "list[RVElement]"

    def __init__(self, args: "configargparse.Namespace") -> None:
        self.args = args

    def metafile_read(self, name: str) -> typing.Any:
        """
        Load the object stored in the named file of the metadata folder.

        Raises FileNotFoundError if the file doesn't exist.
        """
        path = os.path.join(self.METADATA_FOLDER, name)
        log.debug(f"Reading {path}")
        with open(path, "rb") as mf:
            # NOTE(review): pickle executes code from the file it loads; fine
            # as long as the metadata folder is only written by this program.
            return pickle.load(mf)

    def metafile_write(self, name: str, data: typing.Any) -> None:
        """Store the object in the named file of the metadata folder (not in dry-run mode)."""
        path = os.path.join(self.METADATA_FOLDER, name)
        log.debug(f"Writing {path}")
        if not self.args.dryrun:
            with open(path, "wb") as mf:
                pickle.dump(data, mf)

    def clean_cache(self, cache: "RVDatabase") -> None:
        """Clean the elements of the given database that are not in this one."""
        log.debug("Cleaning cache")
        fresh_ids = {el.id for el in self.elements}
        for el in cache.elements:
            if el.id not in fresh_ids:
                el.clean()

    def _auth_headers(self) -> dict[str, str]:
        """
        Log in to the aggregator and return the resulting authorization header.

        Raises RuntimeError if the answer contains no ``auth=`` line.
        """
        r = requests.get(
            f"{self.args.url}/accounts/ClientLogin",
            params={"Email": self.args.email, "Passwd": self.args.passwd},
        )
        r.raise_for_status()
        for line in r.text.split("\n"):
            if line.lower().startswith("auth="):
                # Everything after the first "=", the value may contain more
                val = line.partition("=")[2]
                return {"Authorization": f"GoogleLogin auth={val}"}
        raise RuntimeError("Couldn't find auth= key")

    @functools.cached_property
    def auth_headers(self) -> dict[str, str]:
        """Authorization header, read from the metadata folder or else requested and stored there."""
        try:
            return self.metafile_read(".auth_headers")
        except FileNotFoundError:
            headers = self._auth_headers()
            self.metafile_write(".auth_headers", headers)
            return headers

    def fetch_feed_elements(self) -> typing.Generator[dict, None, None]:
        """Yield the unread items of the aggregator, following continuations."""
        log.info("Fetching RSS feed")
        continuation: typing.Optional[str] = None
        with requests.Session() as s:
            while True:
                r = s.get(
                    f"{self.args.url}/reader/api/0/stream/contents",
                    params={
                        "xt": "user/-/state/com.google/read",
                        "c": continuation,
                    },
                    headers=self.auth_headers,
                )
                r.raise_for_status()
                page = r.json()
                yield from page["items"]
                continuation = page.get("continuation")
                if not continuation:
                    break

    def fetch_cache_elements(self) -> typing.Generator[dict, None, None]:
        """Yield the items stored in the metadata folder."""
        log.info("Fetching from cache")
        for file in os.listdir(self.METADATA_FOLDER):
            if not file.endswith(".item"):
                continue
            yield self.metafile_read(file)

    def build_list(self, items: typing.Iterable[dict], save: bool = False) -> None:
        """
        Fill the list of elements from the items, in reverse order.

        With save, each item is also stored in the metadata folder.
        """
        self.elements = []
        for item in items:
            element = RVElement(self, item)
            self.elements.append(element)
            log.debug(f"Known: {element}")
            if save:
                element.save()
        # Same result as inserting each element at the front, in linear time
        self.elements.reverse()

    def read_feed(self) -> None:
        """Build the list of elements from the aggregator, storing the items."""
        self.build_list(self.fetch_feed_elements(), save=True)

    def read_cache(self) -> None:
        """Build the list of elements from the stored items."""
        self.build_list(self.fetch_cache_elements())

    def clean_folder(self, folder: str, basenames: set[str]) -> None:
        """
        Remove the files of the folder whose name starts with none of the basenames.

        Only regular files are considered, and hidden ones are left alone.
        """
        known = tuple(basenames)
        for file in os.listdir(folder):
            path = os.path.join(folder, file)
            if not os.path.isfile(path) or file.startswith("."):
                continue
            if file.startswith(known):
                continue
            log.info(f"Removing unknown file: {path}")
            if not self.args.dryrun:
                os.unlink(path)

    def clean(self) -> None:
        """Remove the videos and metadata files that belong to no known element."""
        log.debug("Cleaning")
        filenames = {el.basename for el in self.elements if el.is_video}
        self.clean_folder(".", filenames)
        ids = {el.sid for el in self.elements}
        self.clean_folder(self.METADATA_FOLDER, ids)

    @property
    def ytdl_opts(self) -> dict:
        """Options yt-dlp computes when run without command line arguments."""
        # Get user/system options
        # parse_options() is run with sys.argv emptied, hiding our own arguments
        prev_argv = sys.argv
        sys.argv = ["yt-dlp"]
        try:
            _, _, _, ydl_opts = yt_dlp.parse_options()
        finally:
            # Restore our arguments even if yt-dlp fails to parse its options
            sys.argv = prev_argv
        return ydl_opts

    @property
    def ytdl_dry_opts(self) -> dict:
        """Same options as ytdl_opts, with quiet enabled."""
        opts = self.ytdl_opts.copy()
        opts.update({"quiet": True})
        return opts

    @property
    def ytdl_dry(self) -> "yt_dlp.YoutubeDL":
        """A new yt-dlp instance using ytdl_dry_opts."""
        return yt_dlp.YoutubeDL(self.ytdl_dry_opts)

    def filter(self, args: "configargparse.Namespace") -> "typing.Iterable[RVElement]":
        """
        Return the elements matching the arguments, in the requested order.

        Sorting that needs no research is done before filtering, so that the
        possibly expensive filtering happens lazily in the right order.
        Sorting by duration is done after it. With a total duration, elements
        are picked in order as long as they fit in the remaining time.
        """
        elements_src = self.elements.copy()
        elements: typing.Iterable[RVElement]

        # Inexpensive sort
        if args.order == "new":
            elements = sorted(elements_src, key=lambda el: el.date, reverse=True)
        elif args.order == "old":
            elements = sorted(elements_src, key=lambda el: el.date)
        elif args.order == "title":
            elements = sorted(elements_src, key=lambda el: el.title)
        elif args.order == "creator":
            elements = sorted(elements_src, key=lambda el: el.creator or "")
        elif args.order == "link":
            elements = sorted(elements_src, key=lambda el: el.link)
        elif args.order == "random":
            elements = elements_src
            random.shuffle(elements)
        else:
            # "short" and "long" are sorted after filtering: start from the
            # plain list, otherwise `elements` would be unbound below.
            elements = elements_src

        # Possibly expensive filtering
        elements = filter(lambda el: el.matches_filter(args), elements)

        # Expensive sort
        if args.order == "short":
            elements = sorted(
                elements, key=lambda el: el.duration if el.is_video else 0
            )
        elif args.order == "long":
            elements = sorted(
                elements, key=lambda el: el.duration if el.is_video else 0, reverse=True
            )

        # Post sorting filtering
        if args.total_duration:
            remaining = parse_duration(args.total_duration)
            selected = []
            # The remaining time only shrinks, so an element that doesn't fit
            # now never will: one pass is enough.
            for el in elements:
                if el.duration < remaining:
                    selected.append(el)
                    remaining -= el.duration
            elements = selected

        return elements

    @functools.cached_property
    def feed_token(self) -> str:
        """Token requested from the aggregator, sent along when marking items as read."""
        r = requests.get(
            f"{self.args.url}/reader/api/0/token",
            headers=self.auth_headers,
        )
        r.raise_for_status()
        return r.text.strip()

    def try_mark_watched_read(self) -> None:
        """Try to mark every watched element as read."""
        for element in self.elements:
            if element.watched:
                element.try_mark_read()
|
|
|
|
|
|
2019-04-30 08:22:27 +02:00
|
|
|
|
|
2020-12-27 14:20:44 +01:00
|
|
|
|
def get_args() -> configargparse.Namespace:
    """
    Parse the command line, environment and configuration files.

    The default configuration file is ``rssVideos`` in the user configuration
    directory. The returned namespace has ``videos`` turned into an absolute
    path with ``~`` expanded and symlinks resolved.
    """
    # The XDG Base Directory specification names the variable XDG_CONFIG_HOME.
    # XDG_CONFIG_PATH, which was read historically, is still honoured so that
    # existing setups keep working. Files are parsed in order.
    config_dirs = (
        os.path.expanduser(os.getenv("XDG_CONFIG_PATH", "~/.config/")),
        os.path.expanduser(os.getenv("XDG_CONFIG_HOME") or "~/.config/"),
    )
    # dict.fromkeys removes duplicates while keeping the order
    default_config_paths = list(
        dict.fromkeys(os.path.join(d, "rssVideos") for d in config_dirs)
    )

    parser = configargparse.ArgParser(
        description="Download videos in unread articles from a feed aggregator",
        default_config_files=default_config_paths,
    )

    # Runtime settings
    parser.add_argument(
        "-v",
        "--verbosity",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        default=None,
        help="Verbosity of log messages",
    )
    parser.add(
        "-c", "--config", required=False, is_config_file=True, help="Configuration file"
    )
    parser.add(
        "-n",
        "--dryrun",
        help="Only pretend to do actions",
        action="store_true",
    )

    # Input/Output
    parser.add(
        "--url",
        help="URL of the Google Reader API of the aggregator",
        env_var="RSS_VIDEOS_URL",
        required=True,
    )
    parser.add(
        "--email",
        help="E-mail / user to connect to the aggregator",
        env_var="RSS_VIDEOS_EMAIL",
        required=True,
    )
    parser.add(
        "--passwd",
        help="Password to connect to the aggregator",
        env_var="RSS_VIDEOS_PASSWD",
        required=True,
    )
    parser.add(
        "--no-refresh",
        dest="refresh",
        help="Don't fetch feed",
        action="store_false",
    )
    parser.add(
        "--videos",
        help="Directory to store videos",
        env_var="RSS_VIDEOS_VIDEO_DIR",
        required=True,
    )

    # Which videos
    parser.add(
        "--order",
        choices=("old", "new", "title", "creator", "link", "short", "long", "random"),
        default="old",
        help="Sorting mechanism",
    )
    parser.add("--creator", help="Regex to filter by creator")
    parser.add("--title", help="Regex to filter by title")
    parser.add("--link", help="Regex to filter by link")
    parser.add("--duration", help="Comparative to filter by duration")
    # TODO Date selector
    parser.add(
        "--seen",
        choices=("seen", "unseen", "any"),
        default="unseen",
        help="Only include seen/unseen/any videos",
    )
    parser.add(
        "--total-duration",
        help="Use videos that fit under the total given",
    )
    # TODO Environment variables
    # TODO Allow to ask

    parser.add(
        "action",
        nargs="?",
        choices=(
            "download",
            "list",
            "watch",
            "binge",
        ),
        default="download",
    )

    args = parser.parse_args()
    args.videos = os.path.realpath(os.path.expanduser(args.videos))

    return args
|
2019-04-30 08:22:27 +02:00
|
|
|
|
|
|
|
|
|
|
2021-12-28 21:39:10 +01:00
|
|
|
|
def get_database(args: configargparse.Namespace) -> RVDatabase:
    """
    Build the database to work on.

    The stored items are always read. Unless refreshing is disabled, the feed
    is then fetched, and the stored elements that are no longer in it are
    cleaned; the database built from the feed is the one returned.
    """
    cached_db = RVDatabase(args)
    cached_db.read_cache()

    if args.refresh:
        feed_db = RVDatabase(args)
        feed_db.read_feed()
        feed_db.clean_cache(cached_db)
        return feed_db

    return cached_db
|
2021-12-28 21:39:10 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> None:
    """
    Run the action requested on the command line.

    The videos directory becomes the working directory. The action is applied
    to each element selected by the arguments (only the first one for
    ``watch``), then watched elements are marked as read and unknown files
    are removed.
    """
    args = get_args()
    configure_logging(args)

    # Everything below uses paths relative to the videos directory
    metadata_dir = os.path.join(args.videos, RVDatabase.METADATA_FOLDER)
    for folder in (args.videos, metadata_dir):
        os.makedirs(folder, exist_ok=True)
    os.chdir(args.videos)

    database = get_database(args)

    log.debug("Running action")
    action = args.action
    total = 0
    for element in database.filter(args):
        if element.is_video:
            total += element.duration

        if action == "download":
            element.download()
        elif action == "list":
            print(element)
        elif action in ("watch", "binge"):
            element.watch()
            if action == "watch":
                # Watching stops after one video, binging goes on
                break
        else:
            raise NotImplementedError(f"Unimplemented action: {args.action}")
    log.info(f"Total duration: {format_duration(total)}")

    database.try_mark_watched_read()
    database.clean()
|
2021-12-17 23:16:32 +01:00
|
|
|
|
|
2020-12-27 14:20:44 +01:00
|
|
|
|
|
|
|
|
|
# Only run when executed as a script, not when imported
if __name__ == "__main__":
    main()
|