eulaurarien/filter_subdomains.py

107 lines
3.1 KiB
Python
Raw Normal View History

2019-11-10 18:14:25 +01:00
#!/usr/bin/env python3
# pylint: disable=C0103
2019-11-10 18:14:25 +01:00
"""
From a list of subdomains, output only
the ones resolving to a first-party tracker.
"""
import argparse
2019-11-10 18:14:25 +01:00
import sys
import progressbar
import csv
2019-11-14 10:45:06 +01:00
import typing
2019-11-10 18:14:25 +01:00
# A node of the rule tree is either a dict keyed by the next domain label
# (walking from the TLD towards the leftmost label) or a bool marking the
# point where a registered rule ends.
# The precise recursive form would be
# typing.Union[bool, typing.Dict[str, 'DomainRule']].
DomainRule = typing.Union[bool, typing.Dict]

# Root of the rule tree; filled by register_rule(), read by
# subdomain_matching().
RULES_DICT: DomainRule = {}
2019-11-14 10:45:06 +01:00
def subdomain_matching(subdomain: str) -> bool:
    """
    Tell whether a domain name is covered by a registered rule.

    The name is split on dots and its labels are followed through
    RULES_DICT from the rightmost label to the leftmost one. Reaching a
    bool node returns that bool, so a rule also covers every name below
    it. A label missing from the tree, or a name that ends on an inner
    (dict) node, gives False.
    """
    node = RULES_DICT
    for label in reversed(subdomain.split('.')):
        # A bool node ends the walk: the remaining labels are irrelevant.
        if isinstance(node, bool):
            return node
        if label not in node:
            return False
        node = node[label]
    # All labels consumed: only a bool node counts as a match.
    return node if isinstance(node, bool) else False
2019-11-14 10:45:06 +01:00
def get_matching(chain: typing.List[str], no_explicit: bool = False
                 ) -> typing.Iterable[str]:
    """
    Yield the first name of a DNS chain when the chain hits a rule.

    chain: one input row. The first element is the name being tested,
        the elements between the first and the last are treated as its
        CNAME destinations, and the last element is not checked.
    no_explicit: when true, yield nothing if the first name itself
        matches a rule, so only names caught through a CNAME remain.

    Yields the first name at most once, when it or one of its CNAME
    destinations matches a rule. An empty chain yields nothing.
    """
    # csv.reader produces an empty list for a blank input line; there is
    # no name to test in that case.
    if not chain:
        return

    initial = chain[0]
    cname_destinations = chain[1:-1]

    initial_matching = subdomain_matching(initial)
    if no_explicit and initial_matching:
        return

    if initial_matching or any(map(subdomain_matching, cname_destinations)):
        yield initial
2019-11-14 10:45:06 +01:00
def register_rule(subdomain: str) -> None:
    """
    Add a domain to the rule tree held in RULES_DICT.

    The name is split on dots and stored label by label, rightmost
    label first: intermediate labels become nested dicts and the
    leftmost label is set to True. If a shorter rule already covers the
    name (a True node is met on the way down) nothing is changed; if
    the new rule is shorter than existing ones, it replaces their
    subtree with True.

    An empty string is ignored, so a blank line in a rules file does
    not register a rule under the empty label.
    """
    if not subdomain:
        return

    node = RULES_DICT
    *ancestors, leaf = reversed(subdomain.split('.'))
    for label in ancestors:
        if isinstance(node, bool):
            # Already covered by a broader rule.
            return
        node = node.setdefault(label, {})
    if isinstance(node, bool):
        return
    node[leaf] = True
if __name__ == '__main__':
    # Command-line entry point: load the rules, then read DNS chains as
    # CSV rows and print the first name of every chain that matches.

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Filter first-party trackers from a list of subdomains")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Input file with DNS chains")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file with one tracking subdomain per line")
    parser.add_argument(
        '-n', '--no-explicit', action='store_true',
        help="Don't output domains already blocked with rules without CNAME")
    parser.add_argument(
        '-r', '--rules', type=argparse.FileType('r'), default='rules',
        help="Rules file")
    args = parser.parse_args()

    # Progress bar
    widgets = [
        progressbar.Percentage(),
        ' ', progressbar.SimpleProgress(),
        ' ', progressbar.Bar(),
        ' ', progressbar.Timer(),
        ' ', progressbar.AdaptiveTransferSpeed(unit='req'),
        ' ', progressbar.AdaptiveETA(),
    ]
    progress = progressbar.ProgressBar(widgets=widgets)

    # Reading rules
    for rule in args.rules:
        register_rule(rule.strip())

    # Reading domains to filter
    if args.input.seekable():
        # Count the lines for the progress bar without keeping them all
        # in memory, then rewind for the real pass.
        progress.max_value = sum(1 for _ in args.input)
        args.input.seek(0)

    reader = csv.reader(args.input)

    progress.start()
    for chain in reader:
        for match in get_matching(chain, no_explicit=args.no_explicit):
            print(match, file=args.output)
        # One step per chain read, whether or not it matched.
        progress.update(progress.value + 1)
    progress.finish()