287 lines
		
	
	
	
		
			9 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			287 lines
		
	
	
	
		
			9 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
| #!/usr/bin/env python3
 | |
| 
 | |
| """
 | |
| Script that downloads videos that are linked as articles
 | |
| in an RSS feed.
 | |
| The common use case would be a feed from an RSS aggregator
 | |
| with the unread items (non-video links are ignored).
 | |
| """
 | |
| 
 | |
| # TODO Distribute this correctly, in the meantime please do
 | |
| # pip install --user youtube-dl ConfigArgParse progressbar2
 | |
| 
 | |
| # TODO Better logging (youtube-dl allows passing loggers)
 | |
| 
 | |
| import sys
 | |
| from typing import Dict, Set, Tuple
 | |
| import urllib.request
 | |
| import urllib.parse
 | |
| import os
 | |
| from xml.dom import minidom
 | |
| import youtube_dl
 | |
| import configargparse
 | |
| 
 | |
| 
 | |
def get_args() -> configargparse.Namespace:
    """
    Build the argument parser and return the parsed settings.

    Every option can be given on the command line or in a configuration
    file; options declaring an ``env_var`` can also come from the
    environment. The default configuration file is ``rssVideos`` inside the
    user configuration directory.

    Returns:
        The parsed namespace. ``videos`` is expanded (``~``) and resolved to
        a real path. ``track`` is expanded too; when it is not absolute it is
        resolved relative to ``videos``.
    """
    # XDG_CONFIG_HOME is the variable defined by the XDG Base Directory
    # specification. XDG_CONFIG_PATH is what this script historically read,
    # so it is still honoured first to not break existing setups.
    configHome = (
        os.getenv("XDG_CONFIG_PATH")
        or os.getenv("XDG_CONFIG_HOME")
        or "~/.config/"
    )
    defaultConfigPath = os.path.join(os.path.expanduser(configHome), "rssVideos")

    parser = configargparse.ArgParser(
        description="Download videos linked in "
        + "a RSS feed (e.g. an unread feed from "
        + "an RSS aggregator)",
        default_config_files=[defaultConfigPath],
    )
    parser.add(
        "-c", "--config", required=False, is_config_file=True, help="Configuration file"
    )
    parser.add(
        "--feed",
        help="URL of the RSS feed (must be public for now)",
        env_var="RSS_VIDEOS_FEED",
        required=True,
    )
    parser.add(
        "--videos",
        help="Directory to store videos",
        env_var="RSS_VIDEOS_VIDEO_DIR",
        required=True,
    )
    parser.add(
        "-n",
        "--dryrun",
        help="Do not download the videos",
        action="store_const",
        const=True,
        default=False,
    )
    # TODO This feature might require additional documentation and an on/off switch
    parser.add(
        "--track",
        help="Directory where download videos are marked "
        + "to not download them after deletion.",
        env_var="RSS_VIDEOS_TRACK",
        required=False,
        default=".rssVideos",
    )
    parser.add(
        "--max-duration",
        help="Skip video longer than this amount of seconds",
        env_var="RSS_VIDEOS_MAX_DURATION",
        type=int,
        default=0,
    )
    parser.add(
        "--format",
        help="Use this format to download videos."
        + " See FORMAT SELECTION in youtube-dl(1)",
        env_var="RSS_VIDEOS_FORMAT",
        default="bestvideo+bestaudio/best",
    )
    parser.add(
        "--subtitles",
        help="Download all subtitles",
        env_var="RSS_VIDEOS_SUBTITLES",
        action="store_true",
    )

    args = parser.parse_args()

    # Normalise the directories once so that the rest of the script can
    # compare and join paths without caring about "~" or relative values.
    args.videos = os.path.realpath(os.path.expanduser(args.videos))
    args.track = os.path.expanduser(args.track)
    if not os.path.isabs(args.track):
        # A relative tracking directory lives inside the videos directory
        args.track = os.path.realpath(os.path.join(args.videos, args.track))

    return args
 | |
| 
 | |
| 
 | |
def get_links(args: configargparse.Namespace) -> Set[str]:
    """
    Read the feed XML, get the links

    Opens ``args.feed`` with ``urllib``, parses the response as XML and
    collects the text of the first ``<link>`` element of every ``<item>``.
    Items without a usable link are reported on stdout and skipped.

    Returns:
        The set of link strings, with surrounding whitespace removed.
    """
    links: Set[str] = set()
    with urllib.request.urlopen(args.feed) as response:
        with minidom.parse(response) as xmldoc:
            for item in xmldoc.getElementsByTagName("item"):
                try:
                    linkNode = item.getElementsByTagName("link")[0]
                    link: str = linkNode.childNodes[0].data.strip()
                except (IndexError, AttributeError) as e:
                    # IndexError: no <link> element, or an empty one.
                    # AttributeError: the first child is not a text node.
                    # Anything else (e.g. KeyboardInterrupt) must propagate.
                    print("Error while getting link from item:", e)
                    continue
                if not link:
                    print("Error while getting link from item:", "empty link")
                    continue
                links.add(link)
    return links
 | |
| 
 | |
| 
 | |
def get_video_infos(
    args: configargparse.Namespace, ydl_opts: Dict, links: Set[str]
) -> Dict[str, Dict]:
    """
    Filter out non-video links and store video download info
    and associated filename

    Each link is passed to youtube-dl with the ``simulate`` and ``quiet``
    options added to a copy of ``ydl_opts``. Links for which the extraction
    raises are reported on stdout and skipped. When ``args.max_duration`` is
    positive, videos longer than it (or whose duration is not reported) are
    skipped as well.

    Returns:
        A dict mapping the extension-less filename computed by
        ``prepare_filename`` to the info dict returned by ``extract_info``.
    """
    videosInfos: Dict[str, Dict] = dict()

    # Work on a copy so the caller's options are not modified
    dry_ydl_opts = ydl_opts.copy()
    dry_ydl_opts.update({"simulate": True, "quiet": True})
    with youtube_dl.YoutubeDL(dry_ydl_opts) as ydl:
        for link in links:
            print(f"Researching {link}...")
            try:
                infos = ydl.extract_info(link)
                if args.max_duration > 0:
                    duration = infos.get("duration")
                    if duration is None:
                        # Cannot compare against the limit: skip, as a
                        # failed comparison always did, but say why.
                        print(f"{infos['title']}: Skipping as duration is unknown")
                        continue
                    if duration > args.max_duration:
                        print(
                            f"{infos['title']}: Skipping as longer than max duration: "
                            f"{duration} > {args.max_duration}"
                        )
                        continue
                filepath = ydl.prepare_filename(infos)
                # Videos are identified by their filename without extension
                filename, _ = os.path.splitext(filepath)
                videosInfos[filename] = infos
                print(f"{infos['title']}: Added")

            except Exception as e:
                # Best effort: a link that cannot be handled is skipped.
                # Exception (not BaseException) so that Ctrl-C still aborts.
                print(e)
                continue

    return videosInfos
 | |
| 
 | |
| 
 | |
def get_downloaded_videos(
    args: configargparse.Namespace, videosInfos: Dict[str, Dict]
) -> Tuple[Set[str], Set[str]]:
    """
    Read the directory content, delete everything that's not a
    video on the download list or already downloaded

    Only regular files directly inside ``args.videos`` are considered.
    Each one is compared, without its last extension, to the keys of
    ``videosInfos``:

    - same name: the video is completely downloaded;
    - name starting with a key: subtitle files (``.vtt``) are ignored,
      anything else counts as a partial download of that video;
    - no match: the file is DELETED from the directory.

    Returns:
        A tuple ``(downloaded, partially_downloaded)`` of sets of
        ``videosInfos`` keys.
    """
    videosDownloaded: Set[str] = set()
    videosPartiallyDownloaded: Set[str] = set()

    for filepath in os.listdir(args.videos):
        fullpath = os.path.join(args.videos, filepath)
        if not os.path.isfile(fullpath):
            continue
        filename, extension = os.path.splitext(filepath)

        # Full name already there: completely downloaded.
        # Checked against every key first, so that a video whose name is a
        # prefix of another one cannot shadow the exact match.
        if filename in videosInfos:
            videosDownloaded.add(filename)
            continue

        onlineFilename = next(
            (name for name in videosInfos if filename.startswith(name)), None
        )

        if onlineFilename is None:
            # Unrelated filename: delete
            print(f"Deleting: {filename}")
            os.unlink(fullpath)
        elif extension == ".vtt" or filename.endswith(".vtt"):
            # Subtitle file → ignore.
            # splitext already removed the last extension, so the extension
            # itself has to be tested ("name.en.vtt" → "name.en" + ".vtt").
            continue
        else:
            # Partial name already there: not completely downloaded
            # → keep on the download list
            videosPartiallyDownloaded.add(onlineFilename)

    return videosDownloaded, videosPartiallyDownloaded
 | |
| 
 | |
| 
 | |
def get_tracked_videos(args: configargparse.Namespace, known: Set[str]) -> Set[str]:
    """
    Collect the tracking markers that match one of the ``known`` videos.

    Markers are the regular files found directly in ``args.track``; their
    name (they carry no extension) is the video name. A marker whose name is
    in ``known`` is returned, any other marker file is removed from disk.
    Entries that are not regular files are left alone.
    """
    tracked = set()

    for marker in os.listdir(args.track):
        markerPath = os.path.join(args.track, marker)
        if os.path.isfile(markerPath):
            if marker not in known:
                # Marker of a video that is not in the list anymore
                os.unlink(markerPath)
            else:
                tracked.add(marker)

    return tracked
 | |
| 
 | |
| 
 | |
def main() -> None:
    """
    Entry point: fetch the feed, decide what to do with each video and
    download the missing ones.

    Steps, in order:

    1. parse the settings and create the videos and tracking directories;
    2. read the links from the feed and get the video infos for them;
    3. compare with the content of the videos directory (unrelated files
       are deleted there) and of the tracking directory;
    4. download what is neither tracked nor already downloaded, unless
       ``--dryrun`` was given, and mark each downloaded video as tracked.

    Exits the process with status 1 if at least one download failed,
    0 otherwise.
    """

    args = get_args()

    os.makedirs(args.videos, exist_ok=True)
    os.makedirs(args.track, exist_ok=True)
    ydl_opts = {"format": args.format, "allsubtitles": args.subtitles}

    print("→ Retrieveing RSS feed")
    links = get_links(args)

    print(f"→ Getting infos on {len(links)} unread articles")
    videosInfos = get_video_infos(args, ydl_opts, links)

    print(f"→ Deciding on what to do for {len(videosInfos)} videos")
    videosDownloaded, videosPartiallyDownloaded = get_downloaded_videos(
        args, videosInfos
    )
    videosTracked = get_tracked_videos(args, set(videosInfos.keys()))

    # Deciding for the rest based on the informations

    def markTracked(filename: str) -> None:
        """Create (or keep) the empty marker file for ``filename``."""
        # Use the parameter, not the loop variable of the enclosing scope
        markerPath = os.path.join(args.track, filename)
        with open(markerPath, "a"):
            pass

    videosToDownload: Set[str] = set()
    videosReads: Set[str] = set()
    for onlineFilename in videosInfos:
        # If the video was once downloaded but manually deleted,
        # the marker should be left
        if onlineFilename in videosTracked:
            print(f"Should be marked as read: {onlineFilename}")
            # TODO Automatically do that one day maybe?
            # Need to login to the FreshRSS API and keep track of
            # the item id along the process
            videosReads.add(onlineFilename)
        elif onlineFilename in videosDownloaded:
            markTracked(onlineFilename)
            print(f"Already downloaded: {onlineFilename}")
        else:
            if onlineFilename in videosPartiallyDownloaded:
                print(f"Will be continued: {onlineFilename}")
            else:
                print(f"Will be downloaded: {onlineFilename}")
            videosToDownload.add(onlineFilename)

    # Download the missing videos
    print(f"→ Downloading {len(videosToDownload)} videos")

    # The filenames computed earlier are used relative to this directory
    os.chdir(args.videos)

    exit_code = 0
    if not args.dryrun:
        # TODO Progressbar one day maybe?
        # We have all the info we need to make a reliable one
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            for onlineFilename in videosToDownload:
                infos = videosInfos[onlineFilename]

                # Really download
                try:
                    ydl.process_ie_result(infos, True, {})

                    markTracked(onlineFilename)
                except Exception as e:
                    # A failed download must not prevent the other ones,
                    # but Ctrl-C (KeyboardInterrupt) has to stop the script.
                    print(e)
                    exit_code = 1
                    continue

    sys.exit(exit_code)


if __name__ == "__main__":
    main()
 |