2018-10-06 10:27:36 +02:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
import os
|
|
|
|
import subprocess
|
2020-03-12 17:56:10 +01:00
|
|
|
import sys
|
|
|
|
import logging
|
|
|
|
import magic
|
|
|
|
import typing
|
|
|
|
import coloredlogs
|
2018-10-06 10:27:36 +02:00
|
|
|
|
2020-03-12 17:56:10 +01:00
|
|
|
# TODO Able to ignore extensions everywhere
|
|
|
|
|
2020-08-08 11:19:48 +02:00
|
|
|
|
|
|
|
class ArchiveType:
    """Describe one archive format and how to unpack it with an external tool.

    Subclasses customise the class attributes below; the base class supplies
    the matching logic (`fits`) and the extraction logic (`extract`) that are
    driven by those attributes.
    """

    # Suffix the lower-cased file name must end with for this type to match.
    suffix: str = ""
    # Not read anywhere in the visible code.
    fullname: str = ""
    # Appended to the archive name by `dest_name` to build the destination.
    dest_suffix: str = ""
    # Expected MIME type; None disables the MIME check in `fits`.
    mime: typing.Optional[str] = None
    # Expected leading bytes of the file; None disables the header check.
    header: typing.Optional[bytes] = None
    # Command prefix; the archive path is appended as the next argument.
    extract_cmd: typing.Optional[typing.List[str]] = None
    # True: the command is expected to produce `dest` as a regular file.
    # False: `dest` is created as a directory and the command runs inside it.
    single_file = False
    # True: the destination path is appended after the archive path.
    append_dest = False
    # Not read anywhere in the visible code.
    extract_fun: typing.Optional[typing.Callable[[str, str], None]] = None

    def __init__(self) -> None:
        self.log = logging.getLogger(self.__class__.__name__)

    def dest_name(self, archive: str) -> str:
        """Return the destination name for `archive` (name + `dest_suffix`)."""
        return archive + self.dest_suffix

    def fits(self, name_lower: str, mime: str, header: bytes) -> bool:
        """Tell whether a file matches this archive type.

        Args:
            name_lower: File name, already lower-cased by the caller.
            mime: MIME type detected for the file.
            header: First bytes of the file.

        Returns:
            True when the suffix matches and every configured check
            (`mime`, `header`) passes; checks set to None are skipped.
        """
        if not name_lower.endswith(self.suffix):
            return False
        if self.mime is not None and mime != self.mime:
            return False
        if self.header is not None and not header.startswith(self.header):
            return False
        return True

    def _get_cmd(self, archive: str, dest: str) -> typing.List[str]:
        """Build the argument list used to extract `archive`.

        Raises:
            NotImplementedError: If the type defines no `extract_cmd`.
        """
        # Explicit check instead of `assert`, which disappears under `-O`.
        if not self.extract_cmd:
            raise NotImplementedError(
                f"{self.__class__.__name__} does not define extract_cmd"
            )
        cmd = self.extract_cmd + [archive]
        if self.append_dest:
            cmd.append(dest)
        return cmd

    def extract(self, archive: str, dest: str) -> None:
        """Extract `archive` to `dest` by running the external command.

        For multi-file types `dest` is created as a directory and the command
        is run with that directory as its working directory. For single-file
        types the command is run as is and must leave a regular file at
        `dest`.

        Raises:
            FileExistsError: If `dest` already exists (multi-file types).
            subprocess.CalledProcessError: If the command exits with a
                non-zero status.
            FileNotFoundError: If a single-file command did not create `dest`.
        """
        import shutil  # local: only needed for the failure clean-up below

        cmd = self._get_cmd(archive, dest)
        if not self.single_file:
            os.mkdir(dest)
        try:
            self.log.info("Extracting '%s' into '%s'", archive, dest)
            self.log.debug("%s", cmd)
            if self.single_file:
                subprocess.run(cmd, check=True)
                if not os.path.isfile(dest):
                    raise FileNotFoundError(
                        f"Extracting '{archive}' did not produce '{dest}'"
                    )
            else:
                subprocess.run(cmd, cwd=dest, check=True)
        except BaseException:
            # Remove the directory created above so that a partial extraction
            # does not make the next attempt fail in os.mkdir().
            if not self.single_file:
                shutil.rmtree(dest, ignore_errors=True)
            raise
|
|
|
|
|
2020-08-08 11:19:48 +02:00
|
|
|
|
2020-03-12 17:56:10 +01:00
|
|
|
class ArchiveZip(ArchiveType):
    """ZIP archives (``.zip``), unpacked with ``unzip``."""

    extract_cmd = ["unzip"]
    mime = "application/zip"
    suffix = ".zip"
|
|
|
|
|
2020-03-12 17:56:10 +01:00
|
|
|
|
|
|
|
class Archive7z(ArchiveType):
    """7-Zip archives (``.7z``), unpacked with ``7z x``."""

    extract_cmd = ["7z", "x"]
    mime = "application/x-7z-compressed"
    suffix = ".7z"
|
|
|
|
|
2020-03-12 17:56:10 +01:00
|
|
|
|
|
|
|
class ArchiveRar(ArchiveType):
    """RAR archives (``.rar``), unpacked with ``unrar x``."""

    extract_cmd = ["unrar", "x"]
    mime = "application/x-rar"
    suffix = ".rar"
|
|
|
|
|
2020-03-12 17:56:10 +01:00
|
|
|
|
|
|
|
class ArchiveTar(ArchiveType):
    """Uncompressed tarballs (``.tar``), unpacked with ``tar``."""

    extract_cmd = [
        "tar",
        "--extract",
        "--file",
    ]
    mime = "application/x-tar"
    suffix = ".tar"
|
|
|
|
|
2020-03-12 17:56:10 +01:00
|
|
|
|
|
|
|
class ArchiveTarGz(ArchiveType):
    """Gzip-compressed tarballs (``.tar.gz``), unpacked with ``tar --gzip``."""

    extract_cmd = [
        "tar",
        "--extract",
        "--gzip",
        "--file",
    ]
    mime = "application/gzip"
    suffix = ".tar.gz"
|
|
|
|
|
2020-03-12 17:56:10 +01:00
|
|
|
|
|
|
|
class ArchiveTarXz(ArchiveType):
    """XZ-compressed tarballs (``.tar.xz``), unpacked with ``tar --xz``."""

    extract_cmd = [
        "tar",
        "--extract",
        "--xz",
        "--file",
    ]
    mime = "application/x-xz"
    suffix = ".tar.xz"
|
|
|
|
|
2020-03-12 17:56:10 +01:00
|
|
|
|
|
|
|
class ArchiveGzip(ArchiveType):
    """Single gzip-compressed file (``.gz``), decompressed with ``gunzip``.

    A plain ``gunzip <archive>`` writes its output next to the archive and
    removes the archive, so it never produces the destination path it was
    asked for. This type therefore streams the decompressed data to the
    requested destination and leaves the archive in place.
    """

    suffix = ".gz"
    mime = "application/gzip"
    single_file = True
    # --stdout: emit the decompressed data on standard output and keep the
    # archive untouched; extract() redirects that output into `dest`.
    extract_cmd = ["gunzip", "--stdout"]

    def extract(self, archive: str, dest: str) -> None:
        """Decompress `archive` into the regular file `dest`.

        Raises:
            OSError: If `dest` cannot be opened for writing.
            subprocess.CalledProcessError: If ``gunzip`` exits with a
                non-zero status.
        """
        cmd = self._get_cmd(archive, dest)
        self.log.info("Extracting '%s' into '%s'", archive, dest)
        self.log.debug("%s", cmd)
        try:
            with open(dest, "wb") as output:
                subprocess.run(cmd, stdout=output, check=True)
        except BaseException:
            # Do not leave a truncated output file behind.
            if os.path.isfile(dest):
                os.unlink(dest)
            raise
|
|
|
|
|
2020-03-12 17:56:10 +01:00
|
|
|
|
2020-08-08 11:19:48 +02:00
|
|
|
class TreeExtractor:
    """Walk a directory tree and replace each recognised archive by its content."""

    # Tried in order; the first type whose `fits` accepts a file wins, so more
    # specific suffixes (".tar.gz") must come before generic ones (".gz").
    ARCHIVE_TYPES: typing.List[ArchiveType] = [
        ArchiveZip(),
        Archive7z(),
        ArchiveRar(),
        ArchiveTar(),
        ArchiveTarGz(),
        ArchiveTarXz(),
        ArchiveGzip(),
    ]

    def __init__(self) -> None:
        self.log = logging.getLogger("TreeExtractor")
        # Every known suffix, used as a cheap filter before reading files.
        self.suffixes: typing.Set[str] = {
            archive_type.suffix for archive_type in self.ARCHIVE_TYPES
        }

    def _find_archive_type(
        self, name_lower: str, mime: str, header: bytes
    ) -> typing.Optional[ArchiveType]:
        """Return the first archive type matching the file, or None."""
        for archive_type in self.ARCHIVE_TYPES:
            if archive_type.fits(name_lower, mime, header):
                return archive_type
        return None

    def extract_tree(self, directory: str = ".") -> None:
        """Extract every recognised archive found under `directory`.

        Each archive is extracted to a temporary ``.tmp`` path; on success
        the archive is deleted and the temporary path is renamed to the
        final destination. Extracted directories are processed recursively.
        Extraction errors are logged and the walk continues with the next
        file; interrupts (Ctrl-C, SystemExit) are propagated.

        Args:
            directory: Root of the tree to process.
        """
        known_suffixes = tuple(self.suffixes)
        for root, _dirs, files in os.walk(directory):
            real_root = os.path.realpath(root)
            for name in files:
                self.log.debug("Handling '%s' '%s'", real_root, name)

                # Initial filtering with suffix
                name_lower = name.lower()
                if not name_lower.endswith(known_suffixes):
                    self.log.debug("Suffix not matched: %s", name)
                    continue

                filepath = os.path.join(real_root, name)
                try:
                    with open(filepath, "rb") as filedesc:
                        header = filedesc.read(1024)
                except OSError as e:
                    # Unreadable entry (dangling symlink, permissions, ...):
                    # skip it rather than aborting the whole walk.
                    self.log.warning("Cannot read '%s': %s", filepath, e)
                    continue
                mime = magic.detect_from_content(header).mime_type

                archive_type = self._find_archive_type(name_lower, mime, header)
                if not archive_type:
                    self.log.debug("Not matched: %s", filepath)
                    continue

                dest_name = archive_type.dest_name(name)
                dest = os.path.join(real_root, dest_name)
                dest_tmp = dest + ".tmp"
                try:
                    archive_type.extract(filepath, dest_tmp)
                except Exception as e:
                    # Exception, not BaseException: KeyboardInterrupt and
                    # SystemExit must still be able to stop the program.
                    # TODO Parameters stop on error
                    self.log.error(e, exc_info=True)
                else:
                    os.unlink(filepath)
                    os.rename(dest_tmp, dest)

                # os.walk listed this directory before `dest` existed, so the
                # freshly extracted content has to be visited explicitly.
                if os.path.isdir(dest):
                    self.extract_tree(dest)

    def main(self) -> None:
        """Process the directory given as first CLI argument (default ".")."""
        directory = sys.argv[1] if len(sys.argv) > 1 else "."
        self.extract_tree(directory)
|
2018-10-06 10:27:36 +02:00
|
|
|
|
2020-08-08 11:19:48 +02:00
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: set up debug-level coloured logging, then run the
    # extractor on the directory taken from the command line.
    coloredlogs.install(fmt="%(levelname)s %(message)s", level="DEBUG")
    extractor = TreeExtractor()
    extractor.main()
|