2019-11-10 18:14:25 +01:00
|
|
|
#!/usr/bin/env python3
|
2019-12-02 19:03:08 +01:00
|
|
|
# pylint: disable=C0103
|
2019-11-10 18:14:25 +01:00
|
|
|
|
|
|
|
"""
|
|
|
|
From a list of subdomains, output only
|
|
|
|
the ones resolving to a first-party tracker.
|
|
|
|
"""
|
|
|
|
|
2019-11-14 15:37:32 +01:00
|
|
|
import argparse
|
2019-11-10 18:14:25 +01:00
|
|
|
import sys
|
2019-12-02 19:03:08 +01:00
|
|
|
import progressbar
|
|
|
|
import csv
|
2019-11-14 10:45:06 +01:00
|
|
|
import typing
|
2019-12-07 13:51:23 +01:00
|
|
|
import ipaddress
|
2019-11-10 18:14:25 +01:00
|
|
|
|
2019-12-03 08:48:12 +01:00
|
|
|
# Rule trees are nested dictionaries whose leaves are booleans.
# typing cannot express the recursion directly, so the aliases below are
# looser than the intended shapes:
#   DomainRule ~ typing.Union[bool, typing.Dict[str, 'DomainRule']]
#   IpRule     ~ typing.Union[bool, typing.Dict[int, 'IpRule']]
DomainRule = typing.Union[bool, typing.Dict]
IpRule = typing.Union[bool, typing.Dict]

# Domain rules, keyed by domain label starting from the top-level label.
RULES_DICT: DomainRule = {}
# IP rules, keyed by address bit starting from the most significant bit.
RULES_IP_DICT: IpRule = {}
|
|
|
|
|
|
|
|
|
|
|
|
def get_bits(address: typing.Union[ipaddress.IPv4Address,
                                   ipaddress.IPv6Address]
             ) -> typing.Iterator[int]:
    """
    Yield every bit of an IP address, most significant bit first.

    The address is read through its ``packed`` (big-endian bytes)
    representation, so an IPv4 address yields 32 bits and an IPv6
    address yields 128 bits.

    :param address: Address object to decompose.
    :return: Iterator over the bits of the address, each being 0 or 1.
    """
    for byte in address.packed:
        # Walk each byte from its highest bit (7) down to its lowest (0).
        for shift in range(7, -1, -1):
            yield (byte >> shift) & 0b1
|
2019-12-07 13:51:23 +01:00
|
|
|
|
2019-11-14 10:45:06 +01:00
|
|
|
|
2019-12-02 19:03:08 +01:00
|
|
|
def subdomain_matching(subdomain: str) -> bool:
    """
    Tell whether a domain name is covered by a registered domain rule.

    The name is split on dots and its labels are followed through
    RULES_DICT from the rightmost label to the leftmost one. Reaching a
    boolean leaf means a rule covers the name (and everything below it).

    :param subdomain: Domain name to look up.
    :return: The value of the leaf that was reached, or False when the
        walk stops on an inner node or on an unknown label.
    """
    node = RULES_DICT
    for label in reversed(subdomain.split('.')):
        if isinstance(node, bool):
            # A leaf was reached: remaining labels are sub-subdomains.
            break
        if label not in node:
            break
        node = node[label]
    return node if isinstance(node, bool) else False
|
2019-11-14 10:45:06 +01:00
|
|
|
|
|
|
|
|
2019-12-07 13:51:23 +01:00
|
|
|
def ip_matching(ip_str: str) -> bool:
    """
    Tell whether an IP address falls within a registered IP rule.

    The bits of the address are followed through RULES_IP_DICT, most
    significant bit first, until a boolean leaf is reached or no deeper
    node exists for the next bit.

    :param ip_str: Textual IP address, parsed with ipaddress.ip_address.
    :return: The value of the leaf that was reached, or False when the
        walk stops on an inner node.
    :raises ValueError: If ip_str is not a valid IPv4 or IPv6 address
        (raised by ipaddress.ip_address).
    """
    # NOTE(review): IPv4 and IPv6 addresses are looked up in the same
    # tree, so an IPv6 address can match a rule registered for an IPv4
    # network sharing the same leading bits (and conversely) - confirm
    # whether that is acceptable.
    node = RULES_IP_DICT
    for bit in get_bits(ipaddress.ip_address(ip_str)):
        if isinstance(node, bool) or bit not in node:
            break
        node = node[bit]
    if isinstance(node, bool):
        return node
    return False
|
|
|
|
|
|
|
|
|
2019-12-02 19:03:08 +01:00
|
|
|
def get_matching(chain: typing.List[str], no_explicit: bool = False
                 ) -> typing.Iterable[str]:
    """
    Yield the head of a DNS chain if any element of the chain is blocked.

    The chain is read as: first element is the initial name, last
    element is the final (A record) destination, and everything in
    between is a CNAME destination. Names are checked with
    subdomain_matching, the final destination with ip_matching.

    :param chain: Resolution chain. Chains with fewer than two elements
        yield nothing.
    :param no_explicit: When true, yield nothing if the initial name is
        itself matched by a domain rule.
    :return: Iterable yielding the initial name at most once.
    """
    if len(chain) < 2:
        return

    initial, *cname_destinations, a_destination = chain

    matched_directly = subdomain_matching(initial)
    if matched_directly and no_explicit:
        return

    matched_via_cname = any(
        subdomain_matching(hop) for hop in cname_destinations)

    # The IP lookup is only evaluated when no name in the chain matched.
    if matched_via_cname or matched_directly or ip_matching(a_destination):
        yield initial
|
2019-11-15 08:57:31 +01:00
|
|
|
|
2019-11-14 10:45:06 +01:00
|
|
|
|
2019-12-03 08:48:12 +01:00
|
|
|
def register_rule(subdomain: str) -> None:
    """
    Register a domain (and all of its subdomains) as blocked.

    The domain is stored in RULES_DICT as a tree of labels, from the
    rightmost label to the leftmost one, the last label being marked
    with True.

    An empty rule is ignored: registering it would mark the empty label
    as blocked, which matches every name written with a trailing dot.

    :param subdomain: Domain name to block.
    """
    if not subdomain:
        return

    # Make a tree with domain parts
    *ancestors, leaf = subdomain.split('.')[::-1]
    node = RULES_DICT
    for label in ancestors:
        node = node.setdefault(label, dict())
        if isinstance(node, bool):
            # A broader rule already covers this domain.
            return
    # Replaces any narrower rules previously stored under this domain.
    node[leaf] = True
|
|
|
|
|
2019-12-07 13:51:23 +01:00
|
|
|
|
|
|
|
def register_rule_ip(network: str) -> None:
    """
    Register an IP network as blocked.

    The network prefix is stored in RULES_IP_DICT as a tree of bits, most
    significant bit first, the last bit of the prefix being marked with
    True.

    :param network: Network in a notation accepted by
        ipaddress.ip_network (e.g. ``10.0.0.0/8`` or a single address).
    :raises ValueError: If the network is invalid or has host bits set
        (raised by ipaddress.ip_network).
    """
    net = ipaddress.ip_network(network)

    if net.prefixlen == 0:
        # An empty prefix has no "last bit" to mark: block both possible
        # values of the first bit so that every address matches.
        RULES_IP_DICT[0] = True
        RULES_IP_DICT[1] = True
        return

    node = RULES_IP_DICT
    last_bit = net.prefixlen - 1
    for index, bit in enumerate(get_bits(net.network_address)):
        if isinstance(node, bool):
            # A broader rule already covers this network.
            return
        if index == last_bit:
            # Replaces any narrower rules previously stored below.
            node[bit] = True
            return
        node = node.setdefault(bit, dict())
|
2019-12-07 13:51:23 +01:00
|
|
|
|
|
|
|
|
2019-12-02 19:03:08 +01:00
|
|
|
if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Filter first-party trackers from a list of subdomains")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Input file with DNS chains")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file with one tracking subdomain per line")
    parser.add_argument(
        '-n', '--no-explicit', action='store_true',
        help="Don't output domains already blocked with rules without CNAME")
    parser.add_argument(
        '-r', '--rules', type=argparse.FileType('r'),
        help="List of domains to block (with their subdomains)")
    parser.add_argument(
        '-p', '--rules-ip', type=argparse.FileType('r'),
        help="List of IPs ranges to block")
    args = parser.parse_args()

    # Progress bar
    widgets = [
        progressbar.Percentage(),
        ' ', progressbar.SimpleProgress(),
        ' ', progressbar.Bar(),
        ' ', progressbar.Timer(),
        ' ', progressbar.AdaptiveTransferSpeed(unit='req'),
        ' ', progressbar.AdaptiveETA(),
    ]
    progress = progressbar.ProgressBar(widgets=widgets)

    # Reading rules
    # Blank lines are skipped: an empty domain rule would poison the rule
    # tree and an empty network makes ipaddress.ip_network raise.
    if args.rules:
        for rule in args.rules:
            rule = rule.strip()
            if rule:
                register_rule(rule)
    if args.rules_ip:
        for rule in args.rules_ip:
            rule = rule.strip()
            if rule:
                register_rule_ip(rule)

    # Approximating line count
    # Only possible when the input can be rewound afterwards; otherwise
    # the progress bar is left without a maximum value.
    if args.input.seekable():
        progress.max_value = sum(1 for _ in args.input)
        args.input.seek(0)

    # Reading domains to filter
    # Each CSV row is one DNS chain.
    reader = csv.reader(args.input)
    progress.start()
    for chain in reader:
        for match in get_matching(chain, no_explicit=args.no_explicit):
            print(match, file=args.output)
        progress.update(progress.value + 1)
    progress.finish()
|