From 300fe8e15eac8c2f666352bcbbc6f381206df772 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Thu, 14 Nov 2019 15:37:32 +0100 Subject: [PATCH] Added real argument parser Just so we can have color output when running the script :) --- filter_subdomains.py | 34 +++++++++++++++++++++------------- filter_subdomains.sh | 2 +- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/filter_subdomains.py b/filter_subdomains.py index c7a0bfc..61e4a60 100755 --- a/filter_subdomains.py +++ b/filter_subdomains.py @@ -5,6 +5,7 @@ From a list of subdomains, output only the ones resolving to a first-party tracker. """ +import argparse import logging import os import queue @@ -194,11 +195,29 @@ def main() -> None: Also shows a nice progressbar. """ + # Initialization coloredlogs.install( level='DEBUG', fmt='%(asctime)s %(name)s %(levelname)s %(message)s' ) + # Parsing arguments + parser = argparse.ArgumentParser( + description="Filter first-party trackers from a list of subdomains") + parser.add_argument( + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, + help="Input file with one subdomain per line") + parser.add_argument( + '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, + help="Output file with one tracking subdomain per line") + # parser.add_argument( + # '-n', '--nameserver', type=argparse.FileType('r'), + # default='nameservers', help="File with one nameserver per line") + # parser.add_argument( + # '-j', '--workers', type=int, default=512, + # help="Number of threads to use") + args = parser.parse_args() + # Progress bar widgets = [ progressbar.Percentage(), @@ -210,19 +229,8 @@ def main() -> None: ] progress = progressbar.ProgressBar(widgets=widgets) - # Parsing arguments - assert len(sys.argv) <= 2 - filename = None - - if len(sys.argv) == 2 and sys.argv[1] != '-': - filename = sys.argv[1] - progress.max_value = sum(1 for line in open(filename)) - textio = open(filename) - 
else: - textio = sys.stdin - # Cleaning input - iterator = iter(textio) + iterator = iter(args.input) iterator = map(str.strip, iterator) iterator = filter(None, iterator) @@ -236,7 +244,7 @@ def main() -> None: for subdomain, matching in Orchestrator(iterator, servers).run(): progress.update(progress.value + 1) if matching: - print(subdomain) + print(subdomain, file=args.output) progress.finish() diff --git a/filter_subdomains.sh b/filter_subdomains.sh index 036c467..c1c3255 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -3,7 +3,7 @@ # Filter out the subdomains not pointing to a first-party tracker cat subdomains/*.list | sort -u > temp/all_subdomains.list -./filter_subdomains.py temp/all_subdomains.list > temp/all_toblock.list +./filter_subdomains.py --input temp/all_subdomains.list --output temp/all_toblock.list sort -u temp/all_toblock.list > dist/firstparty-trackers.txt # Format the blocklist so it can be used as a hostlist