dotfiles/config/scripts/rssVideos

#!/usr/bin/env python3

"""
Script that downloads videos that are linked as articles
in an RSS feed.
The common use case would be a feed from an RSS aggregator
with the unread items (non-video links are ignored).
"""

# TODO Distribute this correctly, in the meantime please do
# pip install --user youtube-dl ConfigArgParse progressbar2

# TODO Allow to specify youtube_dl options (e.g. subtitles)
# TODO Restrict quality (it's not that I don't like 8GB 4K videos but...)

from typing import Dict, Set
import urllib.request
import urllib.parse
import os
from xml.dom import minidom
import youtube_dl
import configargparse


def defaultConfigPaths() -> list:
    """
    List the configuration files read by default, lowest priority first.

    The historical location is kept untouched: the directory named by the
    (non-standard) XDG_CONFIG_PATH variable, or ~/.config/ when it is unset.
    The directory named by XDG_CONFIG_HOME (the variable defined by the
    XDG Base Directory specification) is appended when it is set and
    non-empty, so a file placed there is read too.
    """
    legacyDir = os.path.expanduser(os.getenv('XDG_CONFIG_PATH', '~/.config/'))
    paths = [os.path.join(legacyDir, 'rssVideos')]

    xdgConfigHome = os.getenv('XDG_CONFIG_HOME')
    if xdgConfigHome:
        xdgPath = os.path.join(os.path.expanduser(xdgConfigHome), 'rssVideos')
        if xdgPath not in paths:
            paths.append(xdgPath)
    return paths


def parseArguments():
    """
    Parse command line, environment and configuration file options.

    Returns the parsed namespace, in which `videos` and `track` have been
    turned into absolute paths (a relative `track` is resolved against
    the videos directory).
    """
    parser = configargparse.ArgParser(description="Download videos linked in " +
                                      "a RSS feed (e.g. an unread feed from " +
                                      "an RSS aggregator",
                                      default_config_files=defaultConfigPaths())
    parser.add('-c', '--config', required=False, is_config_file=True,
               help='Configuration file')
    parser.add('--feed', help='URL of the RSS feed (must be public for now)',
               env_var='RSS_VIDEOS_FEED', required=True)
    parser.add('--videos', help='Directory to store videos',
               env_var='RSS_VIDEOS_VIDEO_DIR', required=True)
    parser.add('-n', '--dryrun', help='Do not download the videos',
               action='store_const', const=True, default=False)
    # TODO This feature might require additional documentation and an on/off switch
    parser.add('--track', help='Directory where download videos are maked (so they are not downloaded twice)',
               env_var='RSS_VIDEOS_TRACK', required=False, default='.rssVideos')

    args = parser.parse_args()
    args.videos = os.path.realpath(os.path.expanduser(args.videos))
    args.track = os.path.expanduser(args.track)
    if not os.path.isabs(args.track):
        args.track = os.path.realpath(os.path.join(args.videos, args.track))
    return args


def fetchFeedLinks(feedUrl: str) -> Set[str]:
    """
    Download the RSS feed at `feedUrl` and return the set of the text of
    the first <link> of each <item>.

    Items without a usable <link> are reported and skipped. Errors while
    fetching or parsing the feed itself are not caught.
    """
    links: Set[str] = set()
    with urllib.request.urlopen(feedUrl) as response:
        with minidom.parse(response) as xmldoc:
            for item in xmldoc.getElementsByTagName('item'):
                try:
                    linkNode = item.getElementsByTagName('link')[0]
                    link: str = linkNode.childNodes[0].data
                except (IndexError, AttributeError) as e:
                    # No <link>, an empty one, or one not starting with text
                    print("Error while getting link from item:", e)
                    continue
                links.add(link)
    return links


def collectVideosInfos(links) -> Dict[str, dict]:
    """
    Ask youtube_dl about every link and keep the ones it can handle.

    Returns a dict mapping the extension-less filename youtube_dl would
    use for the video to the info object returned by extract_info().
    Links for which youtube_dl raises are printed and ignored.
    """
    videosInfos: Dict[str, dict] = {}

    ydl_opts = {
        "simulate": True,
        "quiet": True
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        for link in links:
            print(f"Researching {link}...")
            try:
                infos = ydl.extract_info(link)
                filepath = ydl.prepare_filename(infos)
                filename = os.path.splitext(filepath)[0]
                videosInfos[filename] = infos
            # Exception and not BaseException: Ctrl-C must still
            # interrupt the script instead of skipping to the next link
            except Exception as e:
                print(e)
                continue
    return videosInfos


def scanVideoDirectory(videosDir: str, videosInfos: Dict[str, dict]) -> tuple:
    """
    Compare the regular files of `videosDir` with the wanted videos.

    A file whose extension-less name is a key of `videosInfos` counts as
    completely downloaded, a file whose name merely starts with a key
    (e.g. "<key>.mp4.part") counts as partially downloaded, and any other
    file is DELETED. Sub-directories are left alone.

    Returns the tuple (videosDownloaded, videosPartiallyDownloaded),
    two sets of keys of `videosInfos`.
    """
    videosDownloaded: Set[str] = set()
    videosPartiallyDownloaded: Set[str] = set()

    for filepath in os.listdir(videosDir):
        fullpath = os.path.join(videosDir, filepath)
        if not os.path.isfile(fullpath):
            continue
        filename = os.path.splitext(filepath)[0]

        # Full name already there: completely downloaded.
        # Checked before any prefix so that "foo bar.mp4" is never
        # mistaken for a partial download of a video named "foo".
        if filename in videosInfos:
            videosDownloaded.add(filename)
            continue

        # Partial name already there: not completely downloaded.
        # The longest matching name is the most specific candidate.
        candidates = [onlineFilename for onlineFilename in videosInfos
                      if filename.startswith(onlineFilename)]
        if candidates:
            videosPartiallyDownloaded.add(max(candidates, key=len))
            continue

        # Unrelated filename: delete
        print(f"Deleting: {filename}")
        os.unlink(fullpath)

    return videosDownloaded, videosPartiallyDownloaded


def scanTrackDirectory(trackDir: str, videosInfos: Dict[str, dict]) -> Set[str]:
    """
    Return the names of the markers of `trackDir` that are keys of
    `videosInfos`; every other regular file of `trackDir` is deleted.

    Markers are named after the extension-less video filename.
    """
    videosTracked: Set[str] = set()

    for filepath in os.listdir(trackDir):
        fullpath = os.path.join(trackDir, filepath)
        if not os.path.isfile(fullpath):
            continue

        if filepath in videosInfos:
            videosTracked.add(filepath)
        else:
            os.unlink(fullpath)

    return videosTracked


def markTracked(trackDir: str, filename: str) -> None:
    """
    Create (or leave untouched) the empty marker `filename` in `trackDir`.
    """
    markerPath = os.path.join(trackDir, filename)
    # Append mode creates the file without truncating an existing one
    with open(markerPath, 'a'):
        pass


def main():
    """
    Synchronise the videos directory with the videos linked in the feed.
    """
    args = parseArguments()

    os.makedirs(args.videos, exist_ok=True)
    os.makedirs(args.track, exist_ok=True)

    # Read the feed XML, get the links
    print("→ Retrieveing RSS feed")
    links = fetchFeedLinks(args.feed)

    # Filter out non-video links and store video download info
    # and associated filename
    print(f"→ Getting infos on {len(links)} unread articles")
    videosInfos = collectVideosInfos(links)

    # Read the directory content, delete everything that's not a
    # video on the download list or already downloaded
    # NOTE: --dryrun only skips the downloads, the deletions done by the
    # two scans below and the creation of markers still happen
    print(f"→ Deciding on what to do for {len(videosInfos)} videos")
    videosDownloaded, videosPartiallyDownloaded = scanVideoDirectory(
        args.videos, videosInfos)
    # Videos that were once downloaded using this tool
    videosTracked = scanTrackDirectory(args.track, videosInfos)

    # Deciding for the rest based on the informations
    videosToDownload: Set[str] = set()
    for onlineFilename in videosInfos:
        # The file is there: nothing to do but making sure it is tracked.
        # Tested before the marker, otherwise every video would be
        # reported as read from the second run on, watched or not.
        if onlineFilename in videosDownloaded:
            markTracked(args.track, onlineFilename)
            print(f"Already downloaded: {onlineFilename}")
        # If the video was once downloaded but manually deleted,
        # the marker should be left
        elif onlineFilename in videosTracked:
            print(f"Should be marked as read: {onlineFilename}")
            # TODO Automatically do that one day maybe?
            # Need to login to the FreshRSS API and keep track of
            # the item id along the process
        else:
            if onlineFilename in videosPartiallyDownloaded:
                print(f"Will be continued: {onlineFilename}")
            else:
                print(f"Will be downloaded: {onlineFilename}")
            videosToDownload.add(onlineFilename)

    # Download the missing videos
    print(f"→ Downloading {len(videosToDownload)} videos")

    os.chdir(args.videos)

    if not args.dryrun:
        # TODO Progressbar one day maybe?
        # We have all the info we need to make a reliable one
        ydl_opts = {
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            for onlineFilename in videosToDownload:
                infos = videosInfos[onlineFilename]

                # Really download
                ydl.process_ie_result(infos, True, {})

                markTracked(args.track, onlineFilename)


if __name__ == "__main__":
    main()
rssVideos 2019-04-30 08:22:27 +02:00			`#!/usr/bin/env python3`

			`"""`
			`Script that download videos that are linked as an article`
			`in a RSS feed.`
			`The common use case would be a feed from an RSS aggregator`
			`with the unread items (non-video links are ignored).`
			`"""`

			`# TODO Distribute this correclty, in the meanwhile please do`
			`# pip install --user youtube-dl ConfigArgParse progressbar2`

			`# TODO Allow to specify youtube_dl options (e.g. subtitles)`
			`# TODO Restrict quality (it's not that I don't like 8GB 4K videos but...)`

			`from typing import Dict, Set`
			`import urllib.request`
			`import urllib.parse`
			`import os`
			`from xml.dom import minidom`
			`import youtube_dl`
			`import configargparse`


			`if __name__ == "__main__":`

			`defaultConfigPath = os.path.join(os.path.expanduser(`
			`os.getenv('XDG_CONFIG_PATH', '~/.config/')), 'rssVideos')`


			`parser = configargparse.ArgParser(description="Download videos linked in " +`
			`"a RSS feed (e.g. an unread feed from " +`
			`"an RSS aggregator",`
			`default_config_files=[defaultConfigPath])`
			`parser.add('-c', '--config', required=False, is_config_file=True,`
			`help='Configuration file')`
			`parser.add('--feed', help='URL of the RSS feed (must be public for now)',`
			`env_var='RSS_VIDEOS_FEED', required=True)`
			`parser.add('--videos', help='Directory to store videos',`
			`env_var='RSS_VIDEOS_VIDEO_DIR', required=True)`
rssVideos dryRun 2019-05-08 17:25:23 +02:00			`parser.add('-n', '--dryrun', help='Do not download the videos',`
			`action='store_const', const=True, default=False)`
rssVideos 2019-04-30 08:22:27 +02:00			`# TODO This feature might require additional documentation and an on/off switc`
			`parser.add('--track', help='Directory where download videos are maked (so they are not downloaded twice)',`
			`env_var='RSS_VIDEOS_TRACK', required=False, default='.rssVideos')`

			`args = parser.parse_args()`
			`args.videos = os.path.realpath(os.path.expanduser(args.videos))`
			`args.track = os.path.expanduser(args.track)`
			`if not os.path.isabs(args.track):`
			`args.track = os.path.realpath(os.path.join(args.videos, args.track))`

			`os.makedirs(args.videos, exist_ok=True)`
			`os.makedirs(args.track, exist_ok=True)`

			`# Read the feed XML, get the links`
			`print("→ Retrieveing RSS feed")`

			`links: Set[str] = set()`
			`with urllib.request.urlopen(args.feed) as request:`
			`with minidom.parse(request) as xmldoc:`
			`for item in xmldoc.getElementsByTagName('item'):`
			`try:`
			`linkNode = item.getElementsByTagName('link')[0]`
			`link: str = linkNode.childNodes[0].data`
			`links.add(link)`
			`except BaseException as e:`
			`print("Error while getting link from item:", e)`
			`continue`

			`# Filter out non-video links and store video download info`
			`# and associated filename`
			`print(f"→ Getting infos on {len(links)} unread articles")`

			`videosInfos: Dict[str, str] = {}`

			`ydl_opts = {`
			`"simulate": True,`
			`"quiet": True`
			`}`
			`with youtube_dl.YoutubeDL(ydl_opts) as ydl:`
			`for link in links:`
			`print(f"Researching {link}...")`
			`try:`
			`infos = ydl.extract_info(link)`
			`filepath = ydl.prepare_filename(infos)`
			`filename, extension = os.path.splitext(filepath)`
			`videosInfos[filename] = infos`
			`except BaseException as e:`
			`print(e)`
			`continue`

			`# Read the directory content, delete everything that's not a`
			`# video on the download list or already downloaded`
			`print(f"→ Deciding on what to do for {len(videosInfos)} videos")`

			`# Getting information on the video directory`

			`videosDownloaded: Set[str] = set()`
			`videosPartiallyDownloaded: Set[str] = set()`

			`for filepath in os.listdir(args.videos):`
			`fullpath = os.path.join(args.videos, filepath)`
			`if not os.path.isfile(fullpath):`
			`continue`
			`filename, extension = os.path.splitext(filepath)`

			`for onlineFilename in videosInfos.keys():`
			`# Full name already there: completly downloaded → remove from the download list`
			`if filename == onlineFilename:`
			`videosDownloaded.add(onlineFilename)`
			`break`
			`# Partial name already there: not completly downloaded → keep on the download list`
			`elif filename.startswith(onlineFilename):`
			`videosPartiallyDownloaded.add(onlineFilename)`
			`break`
			`# Unrelated filename: delete`
			`else:`
			`print(f"Deleting: {filename}")`
			`os.unlink(fullpath)`

			`# Getting informations on the tracking directory`

			`# Videos that were once downloaded using this tool`
			`videosTracked: Set[str] = set()`

			`for filepath in os.listdir(args.track):`
			`fullpath = os.path.join(args.track, filepath)`
			`if not os.path.isfile(fullpath):`
			`continue`
			`# Here filename is a filepath as no extension`

			`if filepath in videosInfos:`
			`videosTracked.add(filepath)`
			`else:`
			`os.unlink(fullpath)`

			`# Deciding for the rest based on the informations`


			`def markTracked(filename):`
			`markerPath = os.path.join(args.track, onlineFilename)`
			`open(markerPath, 'a').close()`


			`videosToDownload: Set[str] = set()`
			`videosReads: Set[str] = set()`
			`for onlineFilename in videosInfos.keys():`
			`# If the video was once downloaded but manually deleted,`
			`# the marker should be left`
			`if onlineFilename in videosTracked:`
			`print(f"Should be marked as read: {onlineFilename}")`
			`# TODO Automatically do that one day maybe?`
			`# Need to login to the FreshRSS API and keep track of`
			`# the item id along the process`
			`videosReads.add(onlineFilename)`
			`elif onlineFilename in videosDownloaded:`
			`markTracked(onlineFilename)`
			`print(f"Already downloaded: {onlineFilename}")`
			`else:`
			`if onlineFilename in videosPartiallyDownloaded:`
			`print(f"Will be continued: {onlineFilename}")`
			`else:`
			`print(f"Will be downloaded: {onlineFilename}")`
			`videosToDownload.add(onlineFilename)`

			`# Download the missing videos`
			`print(f"→ Downloading {len(videosToDownload)} videos")`

			`os.chdir(args.videos)`

rssVideos dryRun 2019-05-08 17:25:23 +02:00			`if not args.dryrun:`
			`# TODO Progressbar one day maybe?`
			`# We have all the info we need to make a reliable one`
			`ydl_opts = {`
			`}`
			`with youtube_dl.YoutubeDL(ydl_opts) as ydl:`
			`for onlineFilename in videosToDownload:`
			`infos = videosInfos[onlineFilename]`
rssVideos 2019-04-30 08:22:27 +02:00
rssVideos dryRun 2019-05-08 17:25:23 +02:00			`# Really download`
			`ydl.process_ie_result(infos, True, {})`
rssVideos 2019-04-30 08:22:27 +02:00
rssVideos dryRun 2019-05-08 17:25:23 +02:00			`markTracked(onlineFilename)`
rssVideos 2019-04-30 08:22:27 +02:00