#!/usr/bin/env python3
# pylint: disable=C0103

"""
From a list of subdomains to block,
filter out the ones explicitly matching a regex.
It should be already handled by the ad blocker.
"""

import argparse
import sys

import progressbar
import adblockparser

# Options handed to the rule matcher for every tested URL.
OPTIONS = {"third-party": True}


def explicitely_match(subdomain: str) -> bool:
    """
    Tell whether the loaded ad-block rules already block a subdomain.

    The subdomain is turned into its root HTTPS URL and checked against
    the module-level ``rules`` object using ``OPTIONS``.

    NOTE: ``rules`` is only created by the script entry point below, so
    this function raises ``NameError`` if the module is imported and the
    function is called without ``rules`` having been defined first.

    :param subdomain: Host name to test, without scheme or path.
    :return: The result of ``rules.should_block`` for that URL.
    """
    url = f"https://{subdomain}/"
    return rules.should_block(url, OPTIONS)


if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Filter first-party trackers from a list of subdomains")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Input file with one subdomain per line")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Outptut file with one tracking subdomain per line")
    parser.add_argument(
        '-r', '--rules', type=argparse.FileType('r'), default='rules',
        help="Rules file")
    args = parser.parse_args()

    # Reading rules (the opened rules file is passed as an iterable of lines)
    rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules)

    # Progress bar
    widgets = [
        progressbar.Percentage(),
        ' ', progressbar.SimpleProgress(),
        ' ', progressbar.Bar(),
        ' ', progressbar.Timer(),
        ' ', progressbar.AdaptiveTransferSpeed(unit='req'),
        ' ', progressbar.AdaptiveETA(),
    ]
    progress = progressbar.ProgressBar(widgets=widgets)
    if args.input.seekable():
        # Count only the lines the loop below will actually process
        # (non-blank after stripping), streaming through the file instead
        # of loading every line in memory, then rewind for the real pass.
        progress.max_value = sum(1 for line in args.input if line.strip())
        args.input.seek(0)

    # Cleaning input: strip whitespace and drop empty lines
    iterator = iter(args.input)
    iterator = map(str.strip, iterator)
    iterator = filter(None, iterator)

    # Filtering: keep the subdomains the rules do not already block
    progress.start()
    for subdomain in iterator:
        progress.update(progress.value + 1)
        if not explicitely_match(subdomain):
            print(subdomain, file=args.output)
    progress.finish()