From 7d01d016a5ee36b005b23f5bc5374092ac5ebe5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 15 Nov 2019 08:57:31 +0100 Subject: [PATCH] Can now use AdBlock lists for tracking matching It's not very performant by itself, especially since pyre2 isn't maintained nor really compilable/installable anymore. The performance seems to have decreased from 200 req/s to 0.2 req/s when using 512 threads, and to 80 req/s using 64 threads. This might or might not be related, as the CPU doesn't seem to be the bottleneck. I will probably add support for host-based rules, matching the subdomains of such hosts (as for now there doesn't seem to be any other pattern for first-party trackers than subdomains, and this would be a very broad performance / compatibility with existing lists improvement), and convert the AdBlock lists to this format, only keeping domains-only rules. --- README.md | 1 + fetch_resources.sh | 3 +++ filter_out_explicit.py | 59 +++++++++++++++++++++++++++++------------- filter_subdomains.py | 33 ++++++++++++++++------- filter_subdomains.sh | 57 ++++++++++++++++++++++++---------------- rules/.gitignore | 2 ++ rules/first-party.txt | 1 + temp/.gitignore | 1 + 8 files changed, 108 insertions(+), 49 deletions(-) create mode 100644 rules/.gitignore create mode 100644 rules/first-party.txt diff --git a/README.md b/README.md index 0451b77..55fdc45 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ Just to build the list, you can find an already-built list in the releases. 
- [Python 3.4+](https://www.python.org/) - [progressbar2](https://pypi.org/project/progressbar2/) - dnspython +- [A Python wrapper for re2](https://pypi.org/project/google-re2/) (optional, just speeds things up) (if you don't want to collect the subdomains, you can skip the following) diff --git a/fetch_resources.sh b/fetch_resources.sh index 0f7bdb5..ea839fa 100755 --- a/fetch_resources.sh +++ b/fetch_resources.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +# Get rules +curl https://easylist.to/easylist/easyprivacy.txt > rules/easyprivacy.cache.txt + # Get a list of nameservers rm -f nameservers diff --git a/filter_out_explicit.py b/filter_out_explicit.py index 32277eb..90eed0b 100755 --- a/filter_out_explicit.py +++ b/filter_out_explicit.py @@ -7,39 +7,62 @@ filter out the ones explicitely matching a regex. It should be already handled by the ad blocker. """ -import logging -import multiprocessing -import re +import argparse import sys -import typing +import progressbar -import regexes +import adblockparser + +OPTIONS = {"third-party": True} def explicitely_match(subdomain: str) -> bool: - for regex in regexes.REGEXES: - if re.match(regex, subdomain + '.'): - return True - return False + url = f"https://{subdomain}/" + return rules.should_block(url, OPTIONS) if __name__ == '__main__': # Parsing arguments - assert len(sys.argv) <= 2 - filename = None + parser = argparse.ArgumentParser( + description="Filter first-party trackers from a list of subdomains") + parser.add_argument( + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, + help="Input file with one subdomain per line") + parser.add_argument( + '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, + help="Outptut file with one tracking subdomain per line") + parser.add_argument( + '-r', '--rules', type=argparse.FileType('r'), default='rules', + help="Rules file") + args = parser.parse_args() - if len(sys.argv) == 2 and sys.argv[1] != '-': - filename = sys.argv[1] - textio = 
open(filename) - else: - textio = sys.stdin + # Reading rules + rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules) + + # Progress bar + widgets = [ + progressbar.Percentage(), + ' ', progressbar.SimpleProgress(), + ' ', progressbar.Bar(), + ' ', progressbar.Timer(), + ' ', progressbar.AdaptiveTransferSpeed(unit='req'), + ' ', progressbar.AdaptiveETA(), + ] + progress = progressbar.ProgressBar(widgets=widgets) + if args.input.seekable(): + progress.max_value = len(args.input.readlines()) + args.input.seek(0) # Cleaning input - iterator = iter(textio) + iterator = iter(args.input) iterator = map(str.strip, iterator) iterator = filter(None, iterator) + # Filtering + progress.start() for subdomain in iterator: + progress.update(progress.value + 1) if not explicitely_match(subdomain): - print(subdomain) + print(subdomain, file=args.output) + progress.finish() diff --git a/filter_subdomains.py b/filter_subdomains.py index 61e4a60..03344bf 100755 --- a/filter_subdomains.py +++ b/filter_subdomains.py @@ -14,6 +14,7 @@ import sys import threading import typing +import adblockparser import coloredlogs import dns.exception import dns.resolver @@ -21,9 +22,9 @@ import progressbar import regexes -DNS_TIMEOUT = 60.0 -NUMBER_THREADS = 512 -NUMBER_TRIES = 10 +DNS_TIMEOUT = 10.0 +NUMBER_THREADS = 64 +NUMBER_TRIES = 5 class Worker(threading.Thread): @@ -31,6 +32,7 @@ class Worker(threading.Thread): Worker process for a DNS resolver. Will resolve DNS to match first-party subdomains. 
""" + OPTIONS = {"third-party": True} def change_nameserver(self) -> None: """ @@ -85,10 +87,12 @@ class Worker(threading.Thread): self.log.warning("Empty label for %s", subdomain) return None canonical = query.canonical_name.to_text() - for regex in regexes.REGEXES: - if re.match(regex, canonical): - return True - return False + # for regex in regexes.REGEXES: + # if re.match(regex, canonical): + # return True + # return False + url = f"https://{canonical[:-1]}/" + return self.orchestrator.rules.should_block(url, Worker.OPTIONS) def run(self) -> None: self.log.info("Started") @@ -128,7 +132,9 @@ class Orchestrator(): self.log.info("Refilled nameserver queue") def __init__(self, subdomains: typing.Iterable[str], - nameservers: typing.List[str] = None): + rules: typing.Iterable[str], + nameservers: typing.List[str] = None, + ): self.log = logging.getLogger('orchestrator') self.subdomains = subdomains @@ -140,6 +146,9 @@ class Orchestrator(): self.results_queue: queue.Queue = queue.Queue() self.nameservers_queue: queue.Queue = queue.Queue() + # Rules + self.rules = adblockparser.AdblockRules(rules) + self.refill_nameservers_queue() def fill_subdomain_queue(self) -> None: @@ -210,6 +219,9 @@ def main() -> None: parser.add_argument( '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, help="Outptut file with one tracking subdomain per line") + parser.add_argument( + '-r', '--rules', type=argparse.FileType('r'), default='rules', + help="Rules file") # parser.add_argument( # '-n', '--nameserver', type=argparse.FileType('r'), # default='nameservers', help="File with one nameserver per line") @@ -228,6 +240,9 @@ def main() -> None: ' ', progressbar.AdaptiveETA(), ] progress = progressbar.ProgressBar(widgets=widgets) + if args.input.seekable(): + progress.max_value = len(args.input.readlines()) + args.input.seek(0) # Cleaning input iterator = iter(args.input) @@ -241,7 +256,7 @@ def main() -> None: servers = list(filter(None, map(str.strip, servers))) 
progress.start() - for subdomain, matching in Orchestrator(iterator, servers).run(): + for subdomain, matching in Orchestrator(iterator, args.rules, servers).run(): progress.update(progress.value + 1) if matching: print(subdomain, file=args.output) diff --git a/filter_subdomains.sh b/filter_subdomains.sh index dc5b090..220dae2 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -1,30 +1,43 @@ #!/usr/bin/env bash # Filter out the subdomains not pointing to a first-party tracker - cat subdomains/*.list | sort -u > temp/all_subdomains.list -./filter_subdomains.py --input temp/all_subdomains.list --output temp/all_toblock.list +cat rules/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules.txt +./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_subdomains.list --output temp/all_toblock.list sort -u temp/all_toblock.list > dist/firstparty-trackers.txt +./filter_out_explicit.py --rules temp/all_rules.txt --input dist/firstparty-trackers.txt --output dist/firstparty-only-trackers.txt # Format the blocklist so it can be used as a hostlist -( - echo "# First-party trackers host list" - echo "#" - echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker" - echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien" - echo "# Latest version of this list: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt" - echo "#" - echo "# Generation date: $(date -Isec)" - echo "# Generation version: eulaurarien $(git describe --tags)" - echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" - echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" - echo "# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')" - echo "# Number of blocked subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" - echo "# Number of first-party subdomains: $(./filter_out_explicit.py 
dist/firstparty-trackers.txt | wc -l)" - echo - cat dist/firstparty-trackers.txt | while read host; - do - echo "0.0.0.0 $host" - done -) > dist/firstparty-trackers-hosts.txt +function generate_hosts { + basename="$1" + description="$2" + + ( + echo "# First-party trackers host list" + echo "# $description" + echo "#" + echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker" + echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien" + echo "#" + echo "# Latest version:" + echo "# - With third-party trackers: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt" + echo "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt" + echo "#" + echo "# Generation date: $(date -Isec)" + echo "# Generation version: eulaurarien $(git describe --tags)" + echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" + echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" + echo "# Number of trackers identification rules : $(wc -l temp/all_rules.txt | cut -d' ' -f1)" + echo "# Number of tracker subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" + echo "# Number of first-party subdomains: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" + echo + cat "dist/$basename.txt" | while read host; + do + echo "0.0.0.0 $host" + done + ) > "dist/$basename-hosts.txt" +} + +generate_hosts "firstparty-trackers" "Also contains trackers used in third-party" +generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in conjuction with EasyPrivacy." 
diff --git a/rules/.gitignore b/rules/.gitignore new file mode 100644 index 0000000..d2df6a8 --- /dev/null +++ b/rules/.gitignore @@ -0,0 +1,2 @@ +*.custom.txt +*.cache.txt diff --git a/rules/first-party.txt b/rules/first-party.txt new file mode 100644 index 0000000..54379b9 --- /dev/null +++ b/rules/first-party.txt @@ -0,0 +1 @@ +||at-o.net^ diff --git a/temp/.gitignore b/temp/.gitignore index b31be08..bc44c53 100644 --- a/temp/.gitignore +++ b/temp/.gitignore @@ -1 +1,2 @@ *.list +*.txt