Generates a host list of first-party trackers for ad-blocking.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

68 lines
1.9 KiB

#!/usr/bin/env python3
# pylint: disable=C0103
"""
From a list of subdomains to block,
filter out the ones explicitly matching a regex.
Those should already be handled by the ad blocker.
"""
import argparse
import sys
import progressbar
import adblockparser
# Options passed to the rule matcher on every lookup.
OPTIONS = {"third-party": True}


def explicitely_match(subdomain: str) -> bool:
    """
    Tell whether the loaded rules block the root URL of a subdomain.

    The subdomain is turned into ``https://<subdomain>/`` and checked
    with ``rules.should_block`` using the module-level ``OPTIONS``.
    ``rules`` is a module-level name that must be assigned before this
    function is called (the script entry point does so).
    """
    return rules.should_block(f"https://{subdomain}/", OPTIONS)
if __name__ == '__main__':
    # Script entry point: read subdomains (one per line) from the input,
    # drop those the rules file already blocks, and write the remaining
    # ones to the output.

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Filter first-party trackers from a list of subdomains")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Input file with one subdomain per line")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Outptut file with one tracking subdomain per line")
    parser.add_argument(
        '-r', '--rules', type=argparse.FileType('r'), default='rules',
        help="Rules file")
    args = parser.parse_args()

    # Reading rules
    # Module-level name: explicitely_match() looks it up as a global.
    rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules)

    # Progress bar
    widgets = [
        ' ', progressbar.SimpleProgress(),
        ' ', progressbar.Bar(),
        ' ', progressbar.Timer(),
        ' ', progressbar.AdaptiveTransferSpeed(unit='req'),
        ' ', progressbar.AdaptiveETA(),
    ]
    progress = progressbar.ProgressBar(widgets=widgets)
    if args.input.seekable():
        # Count only the lines the loop below will actually process
        # (blank lines are skipped), so the bar can reach its maximum.
        progress.max_value = sum(1 for line in args.input if line.strip())
        # Counting consumed the stream: rewind, otherwise the filtering
        # loop would see an exhausted file and process nothing.
        args.input.seek(0)

    # Cleaning input: strip surrounding whitespace, drop empty lines
    iterator = iter(args.input)
    iterator = map(str.strip, iterator)
    iterator = filter(None, iterator)

    # Filtering: keep only subdomains the rules do not already block
    for subdomain in iterator:
        progress.update(progress.value + 1)
        if not explicitely_match(subdomain):
            print(subdomain, file=args.output)