2019-11-14 12:31:39 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# pylint: disable=C0103
|
|
|
|
|
|
|
|
"""
|
|
|
|
From a list of subdomains to block,
|
|
|
|
filter out the ones explicitly matching a regex.
|
|
|
|
They should already be handled by the ad blocker.
|
|
|
|
"""
|
|
|
|
|
2019-11-15 07:57:31 +00:00
|
|
|
import argparse
|
2019-11-14 12:31:39 +00:00
|
|
|
import sys
|
2019-11-15 07:57:31 +00:00
|
|
|
import progressbar
|
2019-11-14 12:31:39 +00:00
|
|
|
|
2019-11-15 07:57:31 +00:00
|
|
|
import adblockparser
|
|
|
|
|
|
|
|
# Options handed to rules.should_block() for every URL that is tested.
# NOTE(review): presumably tells adblockparser to treat each request as
# a third-party one - confirm against the adblockparser documentation.
OPTIONS = {"third-party": True}
|
2019-11-14 12:31:39 +00:00
|
|
|
|
|
|
|
|
|
|
|
def explicitely_match(subdomain: str) -> bool:
    """
    Tell whether the loaded rules already block the given subdomain.

    The subdomain is turned into the URL of its HTTPS root page
    (https://<subdomain>/) and submitted, together with OPTIONS,
    to the module-level `rules` object; its verdict is returned as is.
    """
    return rules.should_block(f"https://{subdomain}/", OPTIONS)
|
2019-11-14 12:31:39 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Filter first-party trackers from a list of subdomains")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Input file with one subdomain per line")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file with one tracking subdomain per line")
    parser.add_argument(
        '-r', '--rules', type=argparse.FileType('r'), default='rules',
        help="Rules file")
    args = parser.parse_args()

    # Reading rules
    # `rules` is bound at module level because explicitely_match() reads it
    rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules)

    # Progress bar
    widgets = [
        progressbar.Percentage(),
        ' ', progressbar.SimpleProgress(),
        ' ', progressbar.Bar(),
        ' ', progressbar.Timer(),
        ' ', progressbar.AdaptiveTransferSpeed(unit='req'),
        ' ', progressbar.AdaptiveETA(),
    ]
    progress = progressbar.ProgressBar(widgets=widgets)
    if args.input.seekable():
        # The total is only known when the input can be read twice.
        # Count lazily (no full copy of the file in memory) and ignore
        # blank lines, since the cleaning step below drops them too:
        # this way the total matches the number of updates performed.
        progress.max_value = sum(1 for line in args.input if line.strip())
        args.input.seek(0)

    # Cleaning input: strip surrounding whitespace, skip empty lines
    iterator = iter(args.input)
    iterator = map(str.strip, iterator)
    iterator = filter(None, iterator)

    # Filtering: only subdomains NOT already matched by the rules are kept
    progress.start()
    for subdomain in iterator:
        progress.update(progress.value + 1)
        if not explicitely_match(subdomain):
            print(subdomain, file=args.output)
    progress.finish()
|