From 69b82d29fd3dbf924d9cf894aeb9a25ae3b620fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Tue, 3 Dec 2019 08:48:12 +0100 Subject: [PATCH] Improved rules handling Rules can now come in 3 different formats: - AdBlock rules - Host lists - Domain lists All will be converted into domain lists and aggregated (only AdBlock rules matching a whole domain will be kept). A subdomain will now be matched if it is a subdomain of any domain in the rules. It is way faster (seconds rather than hours!) but less flexible (although that shouldn't be a problem). --- adblock_to_domain_list.py | 49 +++++++++++++++++++++++++++++++++++++++ eulaurarien.sh | 1 + fetch_resources.sh | 18 ++++++++++---- filter_subdomains.py | 45 ++++++++++++++++++++++++++--------- filter_subdomains.sh | 28 ++++++++++++++-------- resolve_subdomains.sh | 7 ++++++ rules/.gitignore | 4 ++-- rules/first-party.list | 1 + rules/first-party.txt | 1 - rules_adblock/.gitignore | 2 ++ rules_hosts/.gitignore | 2 ++ 11 files changed, 130 insertions(+), 28 deletions(-) create mode 100755 adblock_to_domain_list.py create mode 100644 resolve_subdomains.sh create mode 100644 rules/first-party.list delete mode 100644 rules/first-party.txt create mode 100644 rules_adblock/.gitignore create mode 100644 rules_hosts/.gitignore diff --git a/adblock_to_domain_list.py b/adblock_to_domain_list.py new file mode 100755 index 0000000..7c8bf82 --- /dev/null +++ b/adblock_to_domain_list.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +# pylint: disable=C0103 + +""" +Extract the domains to block as a whole +from a AdBlock rules list. 
+""" + +import argparse +import sys +import typing + +import abp.filters + + +def get_domains(rule: abp.filters.parser.Filter) -> typing.Iterable[str]: + for key, val in rule.options: + if key not in ('third-party',): + return + selector_type = rule.selector['type'] + selector_value = rule.selector['value'] + if selector_type == 'url-pattern' \ + and selector_value.startswith('||') \ + and selector_value.endswith('^'): + yield selector_value[2:-1] + + +if __name__ == '__main__': + + # Parsing arguments + parser = argparse.ArgumentParser( + description="TODO") + parser.add_argument( + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, + help="Input file with AdBlock rules") + parser.add_argument( + '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, + help="Outptut file with one rule tracking subdomain per line") + args = parser.parse_args() + + # Reading rules + rules = abp.filters.parse_filterlist(args.input) + + # Filtering + for rule in rules: + if not isinstance(rule, abp.filters.parser.Filter): + continue + for domain in get_domains(rule): + print(domain, file=args.output) diff --git a/eulaurarien.sh b/eulaurarien.sh index 0cb45d7..a0cf887 100755 --- a/eulaurarien.sh +++ b/eulaurarien.sh @@ -4,5 +4,6 @@ ./fetch_resources.sh ./collect_subdomains.sh +./resolve_subdomains.sh ./filter_subdomains.sh diff --git a/fetch_resources.sh b/fetch_resources.sh index ea839fa..705b0c4 100755 --- a/fetch_resources.sh +++ b/fetch_resources.sh @@ -1,17 +1,27 @@ #!/usr/bin/env bash +function dl() { + echo "Downloading $1 to $2..." + curl --silent "$1" > "$2" + if [ $? -ne 0 ] + then + echo "Failed!" 
+ fi +} + # Get rules -curl https://easylist.to/easylist/easyprivacy.txt > rules/easyprivacy.cache.txt +dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt +dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt # Get a list of nameservers - rm -f nameservers touch nameservers [ -f nameservers.head ] && cat nameservers.head >> nameservers -curl https://public-dns.info/nameservers.txt | sort -R >> nameservers +dl https://public-dns.info/nameservers.txt nameservers.temp +sort -R nameservers.temp >> nameservers +rm nameservers.temp # Get top 1M subdomains - wget http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip unzip top-1m.csv.zip sed 's|^[0-9]\+,||' top-1m.csv > subdomains/cisco-umbrella_popularity.cache.list diff --git a/filter_subdomains.py b/filter_subdomains.py index 8c2e9c4..d523c69 100755 --- a/filter_subdomains.py +++ b/filter_subdomains.py @@ -12,14 +12,22 @@ import progressbar import csv import typing -import adblockparser - -OPTIONS = {"third-party": True} +# DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']] +DomainRule = typing.Union[bool, typing.Dict] +RULES_DICT: DomainRule = dict() def subdomain_matching(subdomain: str) -> bool: - url = f"https://{subdomain}/" - return rules.should_block(url, OPTIONS) + parts = subdomain.split('.') + parts.reverse() + dic = RULES_DICT + for part in parts: + if isinstance(dic, bool) or part not in dic: + break + dic = dic[part] + if isinstance(dic, bool): + return dic + return False def get_matching(chain: typing.List[str], no_explicit: bool = False @@ -35,6 +43,21 @@ def get_matching(chain: typing.List[str], no_explicit: bool = False yield initial +def register_rule(subdomain: str) -> None: + # Make a tree with domain parts + parts = subdomain.split('.') + parts.reverse() + dic = RULES_DICT + last_part = len(parts) - 1 + for p, part in enumerate(parts): + if isinstance(dic, bool): + return + if p == 
last_part: + dic[part] = True + else: + dic.setdefault(part, dict()) + dic = dic[part] + if __name__ == '__main__': # Parsing arguments @@ -54,9 +77,6 @@ if __name__ == '__main__': help="Rules file") args = parser.parse_args() - # Reading rules - rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules) - # Progress bar widgets = [ progressbar.Percentage(), @@ -67,14 +87,17 @@ if __name__ == '__main__': ' ', progressbar.AdaptiveETA(), ] progress = progressbar.ProgressBar(widgets=widgets) + + # Reading rules + for rule in args.rules: + register_rule(rule.strip()) + + # Reading domains to filter if args.input.seekable(): progress.max_value = len(args.input.readlines()) args.input.seek(0) - # Cleaning input reader = csv.reader(args.input) - - # Filtering progress.start() for chain in reader: for match in get_matching(chain, no_explicit=args.no_explicit): diff --git a/filter_subdomains.sh b/filter_subdomains.sh index 1d06a3c..7986c2d 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -1,14 +1,22 @@ #!/usr/bin/env bash -# Resolve the CNAME chain of all the known subdomains for later analysis -cat subdomains/*.list | sort -u > temp/all_subdomains.list -./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv -sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv +if [ ! -f temp/all_resolved.csv ] +then + echo "Run ./resolve_subdomains.sh first!" + exit 1 +fi + +# Gather all the rules for filtering +cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt +./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list +cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list +cat rules/*.list | sort -u > temp/all_rules.list # Filter out the subdomains not pointing to a first-party tracker -cat rules/*.txt | grep -v '^!' 
| grep -v '^\[Adblock' | sort -u > temp/all_rules.txt -./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_resolved_sorted.csv --output dist/firstparty-trackers.txt -./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_resolved_sorted.csv --no-explicit --output dist/firstparty-only-trackers.txt +./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list +sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt +./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list +sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt # Format the blocklist so it can be used as a hostlist function generate_hosts { @@ -30,7 +38,7 @@ function generate_hosts { echo "# Generation version: eulaurarien $(git describe --tags)" echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" - echo "# Number of trackers identification rules : $(wc -l temp/all_rules.txt | cut -d' ' -f1)" + echo "# Number of trackers identification rules : $(wc -l temp/all_rules.list | cut -d' ' -f1)" echo "# Number of tracker subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" echo "# Number of first-party subdomains: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" echo @@ -41,5 +49,5 @@ function generate_hosts { ) > "dist/$basename-hosts.txt" } -generate_hosts "firstparty-trackers" "Also contains trackers used in third-party" -generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in conjuction with EasyPrivacy." +generate_hosts "firstparty-trackers" "Also contains trackers used as third-party." +generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. 
Use in combination with third-party lists." diff --git a/resolve_subdomains.sh b/resolve_subdomains.sh new file mode 100644 index 0000000..f4f7a4c --- /dev/null +++ b/resolve_subdomains.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# Resolve the CNAME chain of all the known subdomains for later analysis +cat subdomains/*.list | sort -u > temp/all_subdomains.list +./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv +sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv + diff --git a/rules/.gitignore b/rules/.gitignore index d2df6a8..dbd03bc 100644 --- a/rules/.gitignore +++ b/rules/.gitignore @@ -1,2 +1,2 @@ -*.custom.txt -*.cache.txt +*.custom.list +*.cache.list diff --git a/rules/first-party.list b/rules/first-party.list new file mode 100644 index 0000000..96e615f --- /dev/null +++ b/rules/first-party.list @@ -0,0 +1 @@ +at-o.net diff --git a/rules/first-party.txt b/rules/first-party.txt deleted file mode 100644 index 54379b9..0000000 --- a/rules/first-party.txt +++ /dev/null @@ -1 +0,0 @@ -||at-o.net^ diff --git a/rules_adblock/.gitignore b/rules_adblock/.gitignore new file mode 100644 index 0000000..d2df6a8 --- /dev/null +++ b/rules_adblock/.gitignore @@ -0,0 +1,2 @@ +*.custom.txt +*.cache.txt diff --git a/rules_hosts/.gitignore b/rules_hosts/.gitignore new file mode 100644 index 0000000..d2df6a8 --- /dev/null +++ b/rules_hosts/.gitignore @@ -0,0 +1,2 @@ +*.custom.txt +*.cache.txt