From a5e768fe0007991ce38ac78611e4cf87be799f3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Sat, 7 Dec 2019 13:51:23 +0100 Subject: [PATCH] Filtering by IP range Closes #5 --- filter_subdomains.py | 32 +++++++++++++++++++++++++++++--- filter_subdomains.sh | 10 ++++++---- rules_ip/.gitignore | 2 ++ rules_ip/first-party.txt | 7 +++++++ 4 files changed, 44 insertions(+), 7 deletions(-) create mode 100644 rules_ip/.gitignore create mode 100644 rules_ip/first-party.txt diff --git a/filter_subdomains.py b/filter_subdomains.py index 1c13c7c..b7341e1 100755 --- a/filter_subdomains.py +++ b/filter_subdomains.py @@ -11,11 +11,14 @@ import sys import progressbar import csv import typing +import ipaddress # DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']] DomainRule = typing.Union[bool, typing.Dict] RULES_DICT: DomainRule = dict() +RULES_IP: typing.Set[ipaddress.IPv4Network] = set() + def subdomain_matching(subdomain: str) -> bool: parts = subdomain.split('.') @@ -30,16 +33,26 @@ def subdomain_matching(subdomain: str) -> bool: return False +def ip_matching(ip_str: str) -> bool: + ip = ipaddress.ip_address(ip_str) + for net in RULES_IP: + if ip in net: + return True + return False + + def get_matching(chain: typing.List[str], no_explicit: bool = False ) -> typing.Iterable[str]: + if len(chain) <= 1: + return initial = chain[0] cname_destinations = chain[1:-1] - # a_destination = chain[-1] + a_destination = chain[-1] initial_matching = subdomain_matching(initial) if no_explicit and initial_matching: return cname_matching = any(map(subdomain_matching, cname_destinations)) - if cname_matching or initial_matching: + if cname_matching or initial_matching or ip_matching(a_destination): yield initial @@ -58,6 +71,14 @@ def register_rule(subdomain: str) -> None: dic.setdefault(part, dict()) dic = dic[part] + +def register_rule_ip(network: str) -> None: + net = ipaddress.ip_network(network) + RULES_IP.add(net) + # If 
RULES_IP starts becoming bigger, + # we might implement a binary tree for performance + + if __name__ == '__main__': # Parsing arguments @@ -74,7 +95,10 @@ if __name__ == '__main__': help="Don't output domains already blocked with rules without CNAME") parser.add_argument( '-r', '--rules', type=argparse.FileType('r'), default='rules', - help="Rules file") + help="List of domains domains to block (with their subdomains)") + parser.add_argument( + '-p', '--rules-ip', type=argparse.FileType('r'), default='rules-ip', + help="List of IPs ranges to block") args = parser.parse_args() # Progress bar @@ -91,6 +115,8 @@ if __name__ == '__main__': # Reading rules for rule in args.rules: register_rule(rule.strip()) + for rule in args.rules_ip: + register_rule_ip(rule.strip()) # Approximating line count if args.input.seekable(): diff --git a/filter_subdomains.sh b/filter_subdomains.sh index 2229591..5f0de0e 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -13,21 +13,23 @@ cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/a cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_multi.list cat rules/first-party.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_first.list +cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_multi.txt +cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_first.txt echo "Filtering first-party tracking domains..." 
> /dev/stderr -./filter_subdomains.py --rules temp/all_rules_first.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list +./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt echo "Filtering first-party curated tracking domains..." > /dev/stderr -./filter_subdomains.py --rules temp/all_rules_first.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list +./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt echo "Filtering multi-party tracking domains..." > /dev/stderr -./filter_subdomains.py --rules temp/all_rules_multi.list --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list +./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list sort -u temp/multiparty-trackers.list > dist/multiparty-trackers.txt echo "Filtering multi-party curated tracking domains..." 
> /dev/stderr -./filter_subdomains.py --rules temp/all_rules_multi.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list +./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list sort -u temp/multiparty-only-trackers.list > dist/multiparty-only-trackers.txt # Format the blocklist so it can be used as a hostlist diff --git a/rules_ip/.gitignore b/rules_ip/.gitignore new file mode 100644 index 0000000..d2df6a8 --- /dev/null +++ b/rules_ip/.gitignore @@ -0,0 +1,2 @@ +*.custom.txt +*.cache.txt diff --git a/rules_ip/first-party.txt b/rules_ip/first-party.txt new file mode 100644 index 0000000..eebf7b6 --- /dev/null +++ b/rules_ip/first-party.txt @@ -0,0 +1,7 @@ +# Eulerian +109.232.192.0/21 +# Criteo +178.250.0.0/21 +91.212.98.0/24 +91.199.242.0/24 +185.235.84.0/24