rssVideos mostly

2020-12-27 14:20:44 +01:00 · 2020-12-27 14:20:44 +01:00 · 709239dfca
commit 709239dfca
parent ceb1e40964
17 changed files with 479 additions and 111 deletions
--- a/config/scripts/rssVideos
+++ b/config/scripts/rssVideos
@@ -10,10 +10,10 @@ with the unread items (non-video links are ignored).
 # TODO Distribute this correclty, in the meanwhile please do
 # pip install --user youtube-dl ConfigArgParse progressbar2

-# TODO Allow to specify youtube_dl options (e.g. subtitles)
-# TODO Restrict quality (it's not that I don't like 8GB 4K videos but...)
+# TODO Better logging (youtube-dl allow to pass loggers)

-from typing import Dict, Set
+import sys
+from typing import Dict, Set, Tuple
 import urllib.request
 import urllib.parse
 import os
@@ -22,27 +22,69 @@ import youtube_dl
 import configargparse


-if __name__ == "__main__":
+def get_args() -> configargparse.Namespace:
+    defaultConfigPath = os.path.join(
+        os.path.expanduser(os.getenv("XDG_CONFIG_PATH", "~/.config/")), "rssVideos"
+    )

-    defaultConfigPath = os.path.join(os.path.expanduser(
-        os.getenv('XDG_CONFIG_PATH', '~/.config/')), 'rssVideos')
-
-
-    parser = configargparse.ArgParser(description="Download videos linked in " +
-                                      "a RSS feed (e.g. an unread feed from " +
-                                      "an RSS aggregator",
-                                      default_config_files=[defaultConfigPath])
-    parser.add('-c', '--config', required=False, is_config_file=True,
-               help='Configuration file')
-    parser.add('--feed', help='URL of the RSS feed (must be public for now)',
-               env_var='RSS_VIDEOS_FEED', required=True)
-    parser.add('--videos', help='Directory to store videos',
-               env_var='RSS_VIDEOS_VIDEO_DIR', required=True)
-    parser.add('-n', '--dryrun', help='Do not download the videos',
-               action='store_const', const=True, default=False)
-    # TODO This feature might require additional documentation and an on/off switc
-    parser.add('--track', help='Directory where download videos are maked (so they are not downloaded twice)',
-               env_var='RSS_VIDEOS_TRACK', required=False, default='.rssVideos')
+    parser = configargparse.ArgParser(
+        description="Download videos linked in "
+        + "a RSS feed (e.g. an unread feed from "
+        + "an RSS aggregator",
+        default_config_files=[defaultConfigPath],
+    )
+    parser.add(
+        "-c", "--config", required=False, is_config_file=True, help="Configuration file"
+    )
+    parser.add(
+        "--feed",
+        help="URL of the RSS feed (must be public for now)",
+        env_var="RSS_VIDEOS_FEED",
+        required=True,
+    )
+    parser.add(
+        "--videos",
+        help="Directory to store videos",
+        env_var="RSS_VIDEOS_VIDEO_DIR",
+        required=True,
+    )
+    parser.add(
+        "-n",
+        "--dryrun",
+        help="Do not download the videos",
+        action="store_const",
+        const=True,
+        default=False,
+    )
+    # TODO This feature might require additional documentation and an on/off switch
+    parser.add(
+        "--track",
+        help="Directory where download videos are marked "
+        + "to not download them after deletion.",
+        env_var="RSS_VIDEOS_TRACK",
+        required=False,
+        default=".rssVideos",
+    )
+    parser.add(
+        "--max-duration",
+        help="Skip video longer than this amount of seconds",
+        env_var="RSS_VIDEOS_MAX_DURATION",
+        type=int,
+        default=0,
+    )
+    parser.add(
+        "--format",
+        help="Use this format to download videos."
+        + " See FORMAT SELECTION in youtube-dl(1)",
+        env_var="RSS_VIDEOS_FORMAT",
+        default="bestvideo+bestaudio/best",
+    )
+    parser.add(
+        "--subtitles",
+        help="Download all subtitles",
+        env_var="RSS_VIDEOS_SUBTITLES",
+        action="store_true",
+    )

    args = parser.parse_args()
    args.videos = os.path.realpath(os.path.expanduser(args.videos))
@@ -50,54 +92,70 @@ if __name__ == "__main__":
    if not os.path.isabs(args.track):
        args.track = os.path.realpath(os.path.join(args.videos, args.track))

-    os.makedirs(args.videos, exist_ok=True)
-    os.makedirs(args.track, exist_ok=True)
+    return args

-    # Read the feed XML, get the links
-    print("→ Retrieveing RSS feed")

-    links: Set[str] = set()
+def get_links(args: configargparse.Namespace) -> Set[str]:
+    """
+    Read the feed XML, get the links
+    """
+    links = set()
    with urllib.request.urlopen(args.feed) as request:
        with minidom.parse(request) as xmldoc:
-            for item in xmldoc.getElementsByTagName('item'):
+            for item in xmldoc.getElementsByTagName("item"):
                try:
-                    linkNode = item.getElementsByTagName('link')[0]
+                    linkNode = item.getElementsByTagName("link")[0]
                    link: str = linkNode.childNodes[0].data
                    links.add(link)
                except BaseException as e:
                    print("Error while getting link from item:", e)
                    continue
+    return links

-    # Filter out non-video links and store video download info
-    # and associated filename
-    print(f"→ Getting infos on {len(links)} unread articles")

-    videosInfos: Dict[str, str] = {}
+def get_video_infos(
+    args: configargparse.Namespace, ydl_opts: Dict, links: Set[str]
+) -> Dict[str, Dict]:
+    """
+    Filter out non-video links and store video download info
+    and associated filename
+    """
+    videosInfos = dict()

-    ydl_opts = {
-        "simulate": True,
-        "quiet": True
-    }
-    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+    dry_ydl_opts = ydl_opts.copy()
+    dry_ydl_opts.update({"simulate": True, "quiet": True})
+    with youtube_dl.YoutubeDL(dry_ydl_opts) as ydl:
        for link in links:
            print(f"Researching {link}...")
            try:
                infos = ydl.extract_info(link)
+                if args.max_duration > 0 and infos["duration"] > args.max_duration:
+                    print(
+                        f"{infos['title']}: Skipping as longer than max duration: "
+                        f"{infos['duration']} > {args.max_duration}"
+                    )
+                    continue
                filepath = ydl.prepare_filename(infos)
                filename, extension = os.path.splitext(filepath)
                videosInfos[filename] = infos
+                print(f"{infos['title']}: Added")
+
            except BaseException as e:
                print(e)
                continue

-    # Read the directory content, delete everything that's not a
-    # video on the download list or already downloaded
-    print(f"→ Deciding on what to do for {len(videosInfos)} videos")
+    return videosInfos

-    # Getting information on the video directory

-    videosDownloaded: Set[str] = set()
-    videosPartiallyDownloaded: Set[str] = set()
+def get_downloaded_videos(
+    args: configargparse.Namespace, videosInfos: Dict[str, Dict]
+) -> Tuple[Set[str], Set[str]]:
+    videosDownloaded = set()
+    videosPartiallyDownloaded = set()
+    """
+    Read the directory content, delete everything that's not a
+    video on the download list or already downloaded
+    """

    for filepath in os.listdir(args.videos):
        fullpath = os.path.join(args.videos, filepath)
@@ -106,12 +164,19 @@ if __name__ == "__main__":
        filename, extension = os.path.splitext(filepath)

        for onlineFilename in videosInfos.keys():
-            # Full name already there: completly downloaded → remove from the download list
+            # Full name already there: completly downloaded
+            # → remove from the download list
            if filename == onlineFilename:
                videosDownloaded.add(onlineFilename)
                break
-            # Partial name already there: not completly downloaded → keep on the download list
            elif filename.startswith(onlineFilename):
+                # Subtitle file
+                # → ignore
+                if filename.endswith(".vtt"):
+                    break
+
+                # Partial name already there: not completly downloaded
+                # → keep on the download list
                videosPartiallyDownloaded.add(onlineFilename)
                break
        # Unrelated filename: delete
@@ -119,10 +184,17 @@ if __name__ == "__main__":
            print(f"Deleting: {filename}")
            os.unlink(fullpath)

-    # Getting informations on the tracking directory
+    return videosDownloaded, videosPartiallyDownloaded

-    # Videos that were once downloaded using this tool
-    videosTracked: Set[str] = set()
+
+def get_tracked_videos(args: configargparse.Namespace, known: Set[str]) -> Set[str]:
+    """
+    Return videos previously downloaded (=tracked) amongst the unread videos.
+    This is stored in the tracking directory as empty extension-less files.
+    Other tracking markers (e.g. for now read videos) are deleted.
+    """
+
+    videosTracked = set()

    for filepath in os.listdir(args.track):
        fullpath = os.path.join(args.track, filepath)
@@ -130,18 +202,39 @@ if __name__ == "__main__":
            continue
        # Here filename is a filepath as no extension

-        if filepath in videosInfos:
+        if filepath in known:
            videosTracked.add(filepath)
        else:
            os.unlink(fullpath)

+    return videosTracked
+
+
+def main() -> None:
+
+    args = get_args()
+
+    os.makedirs(args.videos, exist_ok=True)
+    os.makedirs(args.track, exist_ok=True)
+    ydl_opts = {"format": args.format, "allsubtitles": args.subtitles}
+
+    print("→ Retrieveing RSS feed")
+    links = get_links(args)
+
+    print(f"→ Getting infos on {len(links)} unread articles")
+    videosInfos = get_video_infos(args, ydl_opts, links)
+
+    print(f"→ Deciding on what to do for {len(videosInfos)} videos")
+    videosDownloaded, videosPartiallyDownloaded = get_downloaded_videos(
+        args, videosInfos
+    )
+    videosTracked = get_tracked_videos(args, set(videosInfos.keys()))
+
    # Deciding for the rest based on the informations

-
-    def markTracked(filename):
+    def markTracked(filename: str) -> None:
        markerPath = os.path.join(args.track, onlineFilename)
-        open(markerPath, 'a').close()
-
+        open(markerPath, "a").close()

    videosToDownload: Set[str] = set()
    videosReads: Set[str] = set()
@@ -169,11 +262,10 @@ if __name__ == "__main__":

    os.chdir(args.videos)

+    exit_code = 0
    if not args.dryrun:
        # TODO Progressbar one day maybe?
        # We have all the info we need to make a reliable one
-        ydl_opts = {
-        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            for onlineFilename in videosToDownload:
                infos = videosInfos[onlineFilename]
@@ -183,6 +275,13 @@ if __name__ == "__main__":
                    ydl.process_ie_result(infos, True, {})

                    markTracked(onlineFilename)
-                except:
+                except BaseException as e:
+                    print(e)
+                    exit_code = 1
                    continue

+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()