# dotfiles/config/scripts/compressPictureMovies

#!/usr/bin/env python3

import os
import shutil
import subprocess
import sys
import logging
import coloredlogs
import progressbar
import time
import hashlib
import tempfile
import json
import statistics
import datetime

# Install the coloredlogs handler at DEBUG level and use the root logger for
# every message emitted by this script
coloredlogs.install(level='DEBUG', fmt='%(levelname)s %(message)s')
log = logging.getLogger()

# Constants
# Folder that is walked recursively to find the videos to compress
PICTURES_FOLDER = os.path.join(os.path.expanduser("~"), "Images")
# Folder the uncompressed originals are moved to, keeping the path they had
# relative to PICTURES_FOLDER
ORIGINAL_FOLDER = os.path.join(os.path.expanduser("~"), ".ImagesOriginaux")
# Extensions (lower case, without the dot) of the files treated as videos
MOVIE_EXTENSIONS = ["mov", "avi", "mp4", "3gp", "webm", "mkv"]
# Extension given to the compressed files
OUTPUT_EXTENSION = "webm"
# Encoding arguments placed after the input file on the ffmpeg command line
OUTPUT_FFMPEG_PARAMETERS = ["-c:v", "libvpx-vp9", "-crf", "30", "-b:v", "0"]
# OUTPUT_FFMPEG_PARAMETERS = ["-c:v", "libaom-av1", "-crf", "30", "-strict", "experimental", "-c:a", "libopus"]
# Largest standard deviation (in seconds) tolerated between the durations
# that are compared with each other
DURATION_MAX_DEV = 1


def videoMetadata(filename):
    """
    Read the metadata tags of a media file.

    Runs `ffmpeg -i <filename> -f ffmetadata -` and parses the `key=value`
    lines it prints on its standard output.

    filename: path of an existing file (checked with an assertion).

    Returns a dict mapping the lower-cased keys to their values, both as
    str. Values are kept as printed by ffmpeg (escape sequences are not
    decoded). Lines that are neither empty, comments nor `key=value` pairs
    are reported with a warning and ignored.

    Raises subprocess.CalledProcessError if ffmpeg exits with a non-zero
    status.
    """
    assert os.path.isfile(filename)
    cmd = ["ffmpeg", "-i", filename, "-f", "ffmetadata", "-"]
    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    p.check_returncode()
    data = dict()
    for metadataLine in p.stdout.split(b'\n'):
        # Skip empty lines
        if not metadataLine:
            continue
        # Skip comments (the ffmetadata format allows both `;` and `#`)
        if metadataLine.startswith((b';', b'#')):
            continue
        # Parse key-value. Only the first `=` separates the key from the
        # value: a value is allowed to contain further `=` characters (for
        # instance a file path), and such a tag must not be dropped.
        key, separator, val = metadataLine.partition(b'=')
        if not separator:
            log.warning("Unparsed metadata line: `{}`".format(metadataLine))
            continue
        data[key.decode().lower()] = val.decode()
    return data

def videoInfos(filename):
    """
    Describe the streams of a media file.

    Runs `ffprobe <filename> -print_format json -show_streams` and returns
    the JSON document it prints, decoded into Python objects.

    filename: path of an existing file (checked with an assertion).

    Raises subprocess.CalledProcessError if ffprobe exits with a non-zero
    status.
    """
    assert os.path.isfile(filename)
    probe = subprocess.run(
        ["ffprobe", filename, "-print_format", "json", "-show_streams"],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        check=True,
    )
    return json.loads(probe.stdout)

from pprint import pprint
def streamDuration(stream):
    """
    Return the duration of one stream as a float.

    stream: a mapping describing a stream, such as the items of the
    "streams" list returned by videoInfos().

    The first available source of information is used, in this order:
    1. the "duration" entry, converted to float;
    2. "nb_frames" divided by "sample_rate";
    3. the "DURATION" tag, formatted as `hours:minutes:seconds`.

    Raises KeyError when none of those is present.
    """
    if "duration" in stream:
        return float(stream["duration"])

    if "sample_rate" in stream and "nb_frames" in stream:
        frameCount = int(stream["nb_frames"])
        rate = int(stream["sample_rate"])
        return frameCount / rate

    if "tags" in stream and "DURATION" in stream["tags"]:
        fields = stream["tags"]["DURATION"].split(":")
        assert len(fields) == 3
        hours, minutes, seconds = (float(field) for field in fields)
        return (hours * 60 + minutes) * 60 + seconds

    raise KeyError("Can't find duration information in stream")

def videoDuration(filename):
    """
    Return the duration of a media file, in seconds.

    The duration of every stream listed by videoInfos() is computed with
    streamDuration(); the mean of those durations is returned.

    Raises:
    - statistics.StatisticsError if the file has no stream at all;
    - AssertionError if the stream durations deviate from each other by
      more than DURATION_MAX_DEV;
    - whatever videoInfos() and streamDuration() raise.
    """
    # TODO Doesn't work with VP8 / webm
    infos = videoInfos(filename)
    durations = [streamDuration(stream) for stream in infos["streams"]]
    if not durations:
        raise statistics.StatisticsError(
            "No stream found in {}".format(filename))
    # statistics.stdev() needs at least two values. A file with a single
    # stream (e.g. a video without audio) has nothing to deviate from, so
    # its deviation is taken as zero instead of failing.
    if len(durations) > 1:
        dev = statistics.stdev(durations)
    else:
        dev = 0.0
    assert dev <= DURATION_MAX_DEV, "Too much deviation ({} s)".format(dev)
    return sum(durations)/len(durations)


# Work list and running totals, filled by the analysis loop below
# Each todo is (inputFull, originalFull, outputFull, size, duration)
todos = set()
totalSize = 0
totalDuration = 0

# Walk folders
log.info("Listing files in {}".format(PICTURES_FOLDER))
allVideos = list()
for root, dirs, files in os.walk(PICTURES_FOLDER):
    # If folder is in ORIGINAL_FOLDER, skip it
    if root.startswith(ORIGINAL_FOLDER):
        continue
    # Iterate over files
    for inputName in files:
        # If the file is not a video, skip it
        inputExt = os.path.splitext(inputName)[1][1:].lower()
        if inputExt not in MOVIE_EXTENSIONS:
            continue

        allVideos.append((root, inputName))

log.info("Analyzing videos")
for root, inputName in progressbar.progressbar(allVideos):
    inputNameBase, inputExt = os.path.splitext(inputName)
    inputExt = inputExt[1:].lower()

    # Generates all needed filepaths
    ## Found file
    inputFull = os.path.join(root, inputName)
    inputRel = os.path.relpath(inputFull, PICTURES_FOLDER)
    ## Original file (same relative path, under ORIGINAL_FOLDER)
    originalFull = os.path.join(ORIGINAL_FOLDER, inputRel)
    # Refuse to go on if an original is already stored at that place, as
    # it would be overwritten when the input gets moved there
    assert not os.path.isfile(originalFull), originalFull + " exists"

    ## Compressed file
    outputFull = os.path.join(root, inputNameBase + "." + OUTPUT_EXTENSION)

    # If the extension is the same of the output one
    if inputExt == OUTPUT_EXTENSION:
        # Read the metadata of the video
        meta = videoMetadata(inputFull)

        # If it has the field with the original file, it was produced by
        # a previous run of this script
        if 'original' in meta:
            # Skip file
            continue
    else:
        # The compressed file would take the place of an existing one
        assert not os.path.isfile(outputFull), outputFull + " exists"


    size = os.stat(inputFull).st_size
    try:
        duration = videoDuration(inputFull)
    except Exception as e:
        log.warning("Can't determine duration of {}, skipping".format(inputFull))
        log.debug(e, exc_info=True)
        continue

    todo = (inputFull, originalFull, outputFull, size, duration)

    totalDuration += duration
    totalSize += size
    todos.add(todo)

log.info("Converting {} videos ({})".format(len(todos), datetime.timedelta(seconds=totalDuration)))

# From https://stackoverflow.com/a/3431838
def sha256(fname):
    """
    Return the SHA-256 digest of the file `fname` as a hexadecimal string.

    The file is read in binary mode, 131072 bytes at a time, so that its
    content never has to be held in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(fname, "rb") as stream:
        while True:
            block = stream.read(131072)
            # An empty read means the end of the file was reached
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()

# Progress bar things
# Second DataSize widget bound to 'max_value', so that the bar reads
# "<processed> of <total>"
totalDataSize = progressbar.widgets.DataSize()
totalDataSize.variable = 'max_value'
barWidgets = [progressbar.widgets.DataSize(), ' of ', totalDataSize, ' ', progressbar.widgets.Bar(), ' ', progressbar.widgets.FileTransferSpeed(), ' ', progressbar.widgets.AdaptiveETA()]
bar = progressbar.DataTransferBar(max_value=totalSize, widgets=barWidgets)
bar.start()
processedSize = 0


for inputFull, originalFull, outputFull, size, duration in todos:
    # mkstemp returns an already opened file descriptor along with the
    # path. Only the path is needed (ffmpeg writes the file itself), so
    # the descriptor is closed right away instead of leaking one per video.
    tmpfd, tmpfile = tempfile.mkstemp(prefix="compressPictureMovies", suffix="."+OUTPUT_EXTENSION)
    os.close(tmpfd)
    try:
        # Calculate the sum of the original file
        checksum = sha256(inputFull)

        # Initiate a conversion in a temporary file
        originalRel = os.path.relpath(originalFull, ORIGINAL_FOLDER)
        originalContent = "{} {}".format(originalRel, checksum)
        # NOTE(review): no shell is involved, so the double quotes are
        # handed to ffmpeg literally and presumably end up in the stored
        # value - confirm this is intended before parsing that tag
        metadataCmd = ["-metadata", 'original="{}"'.format(originalContent)]
        cmd = ["ffmpeg", "-hide_banner", "-y", "-i", inputFull] + OUTPUT_FFMPEG_PARAMETERS + metadataCmd + [tmpfile]
        p = subprocess.run(cmd)
        p.check_returncode()

        # Verify the duration of the new file
        newDuration = videoDuration(tmpfile)
        dev = statistics.stdev((duration, newDuration))
        assert dev < DURATION_MAX_DEV, "Too much deviation in duration"

        # Move the original to the corresponding original folder
        originalDir = os.path.dirname(originalFull)
        os.makedirs(originalDir, exist_ok=True)
        shutil.move(inputFull, originalFull)

        # Move the converted file in place of the original
        shutil.move(tmpfile, outputFull)
    except Exception as e:
        # A failure on one video must not prevent the others from being
        # processed: report it and go on
        log.error("Couldn't process file {}".format(inputFull))
        log.error(e, exc_info=True)
        # Best-effort removal of the temporary file
        try:
            os.unlink(tmpfile)
        except OSError:
            pass
    # Progress bar things
    processedSize += size
    bar.update(processedSize)
bar.finish()


# TODO Iterate over the already compressed videos to assert the originals are
# in their correct place, else move them