diff --git a/README.md b/README.md index 0451b77..55fdc45 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ Just to build the list, you can find an already-built list in the releases. - [Python 3.4+](https://www.python.org/) - [progressbar2](https://pypi.org/project/progressbar2/) - dnspython +- [A Python wrapper for re2](https://pypi.org/project/google-re2/) (optional, just speeds things up) (if you don't want to collect the subdomains, you can skip the following) diff --git a/fetch_resources.sh b/fetch_resources.sh index 0f7bdb5..ea839fa 100755 --- a/fetch_resources.sh +++ b/fetch_resources.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +# Get rules +curl https://easylist.to/easylist/easyprivacy.txt > rules/easyprivacy.cache.txt + # Get a list of nameservers rm -f nameservers diff --git a/filter_out_explicit.py b/filter_out_explicit.py index 32277eb..90eed0b 100755 --- a/filter_out_explicit.py +++ b/filter_out_explicit.py @@ -7,39 +7,62 @@ filter out the ones explicitely matching a regex. It should be already handled by the ad blocker. """ -import logging -import multiprocessing -import re +import argparse import sys -import typing +import progressbar -import regexes +import adblockparser + +OPTIONS = {"third-party": True} def explicitely_match(subdomain: str) -> bool: - for regex in regexes.REGEXES: - if re.match(regex, subdomain + '.'): - return True - return False + url = f"https://{subdomain}/" + return rules.should_block(url, OPTIONS) if __name__ == '__main__': # Parsing arguments - assert len(sys.argv) <= 2 - filename = None + parser = argparse.ArgumentParser( + description="Filter first-party trackers from a list of subdomains") + parser.add_argument( + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, + help="Input file with one subdomain per line") + parser.add_argument( + '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, + help="Outptut file with one tracking subdomain per line") + parser.add_argument( + '-r', '--rules', type=argparse.FileType('r'), default='rules', + help="Rules file") + args = parser.parse_args() - if len(sys.argv) == 2 and sys.argv[1] != '-': - filename = sys.argv[1] - textio = open(filename) - else: - textio = sys.stdin + # Reading rules + rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules) + + # Progress bar + widgets = [ + progressbar.Percentage(), + ' ', progressbar.SimpleProgress(), + ' ', progressbar.Bar(), + ' ', progressbar.Timer(), + ' ', progressbar.AdaptiveTransferSpeed(unit='req'), + ' ', progressbar.AdaptiveETA(), + ] + progress = progressbar.ProgressBar(widgets=widgets) + if args.input.seekable(): + progress.max_value = len(args.input.readlines()) + args.input.seek(0) # Cleaning input - iterator = iter(textio) + iterator = iter(args.input) iterator = map(str.strip, iterator) iterator = filter(None, iterator) + # Filtering + progress.start() for subdomain in iterator: + progress.update(progress.value + 1) if not explicitely_match(subdomain): - print(subdomain) + print(subdomain, file=args.output) + progress.finish() diff --git a/filter_subdomains.py b/filter_subdomains.py index 61e4a60..03344bf 100755 --- a/filter_subdomains.py +++ b/filter_subdomains.py @@ -14,6 +14,7 @@ import sys import threading import typing +import adblockparser import coloredlogs import dns.exception import dns.resolver @@ -21,9 +22,9 @@ import progressbar import regexes -DNS_TIMEOUT = 60.0 -NUMBER_THREADS = 512 -NUMBER_TRIES = 10 +DNS_TIMEOUT = 10.0 +NUMBER_THREADS = 64 +NUMBER_TRIES = 5 class Worker(threading.Thread): @@ -31,6 +32,7 @@ class Worker(threading.Thread): Worker process for a DNS resolver. Will resolve DNS to match first-party subdomains. """ + OPTIONS = {"third-party": True} def change_nameserver(self) -> None: """ @@ -85,10 +87,12 @@ class Worker(threading.Thread): self.log.warning("Empty label for %s", subdomain) return None canonical = query.canonical_name.to_text() - for regex in regexes.REGEXES: - if re.match(regex, canonical): - return True - return False + # for regex in regexes.REGEXES: + # if re.match(regex, canonical): + # return True + # return False + url = f"https://{canonical[:-1]}/" + return self.orchestrator.rules.should_block(url, Worker.OPTIONS) def run(self) -> None: self.log.info("Started") @@ -128,7 +132,9 @@ class Orchestrator(): self.log.info("Refilled nameserver queue") def __init__(self, subdomains: typing.Iterable[str], - nameservers: typing.List[str] = None): + rules: typing.Iterable[str], + nameservers: typing.List[str] = None, + ): self.log = logging.getLogger('orchestrator') self.subdomains = subdomains @@ -140,6 +146,9 @@ class Orchestrator(): self.results_queue: queue.Queue = queue.Queue() self.nameservers_queue: queue.Queue = queue.Queue() + # Rules + self.rules = adblockparser.AdblockRules(rules) + self.refill_nameservers_queue() def fill_subdomain_queue(self) -> None: @@ -210,6 +219,9 @@ def main() -> None: parser.add_argument( '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, help="Outptut file with one tracking subdomain per line") + parser.add_argument( + '-r', '--rules', type=argparse.FileType('r'), default='rules', + help="Rules file") # parser.add_argument( # '-n', '--nameserver', type=argparse.FileType('r'), # default='nameservers', help="File with one nameserver per line") @@ -228,6 +240,9 @@ def main() -> None: ' ', progressbar.AdaptiveETA(), ] progress = progressbar.ProgressBar(widgets=widgets) + if args.input.seekable(): + progress.max_value = len(args.input.readlines()) + args.input.seek(0) # Cleaning input iterator = iter(args.input) @@ -241,7 +256,7 @@ def main() -> None: servers = list(filter(None, map(str.strip, servers))) progress.start() - for subdomain, matching in Orchestrator(iterator, servers).run(): + for subdomain, matching in Orchestrator(iterator, args.rules, servers).run(): progress.update(progress.value + 1) if matching: print(subdomain, file=args.output) diff --git a/filter_subdomains.sh b/filter_subdomains.sh index dc5b090..220dae2 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -1,30 +1,43 @@ #!/usr/bin/env bash # Filter out the subdomains not pointing to a first-party tracker - cat subdomains/*.list | sort -u > temp/all_subdomains.list -./filter_subdomains.py --input temp/all_subdomains.list --output temp/all_toblock.list +cat rules/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules.txt +./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_subdomains.list --output temp/all_toblock.list sort -u temp/all_toblock.list > dist/firstparty-trackers.txt +./filter_out_explicit.py --rules temp/all_rules.txt --input dist/firstparty-trackers.txt --output dist/firstparty-only-trackers.txt # Format the blocklist so it can be used as a hostlist -( - echo "# First-party trackers host list" - echo "#" - echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker" - echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien" - echo "# Latest version of this list: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt" - echo "#" - echo "# Generation date: $(date -Isec)" - echo "# Generation version: eulaurarien $(git describe --tags)" - echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" - echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" - echo "# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')" - echo "# Number of blocked subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" - echo "# Number of first-party subdomains: $(./filter_out_explicit.py dist/firstparty-trackers.txt | wc -l)" - echo - cat dist/firstparty-trackers.txt | while read host; - do - echo "0.0.0.0 $host" - done -) > dist/firstparty-trackers-hosts.txt +function generate_hosts { + basename="$1" + description="$2" + + ( + echo "# First-party trackers host list" + echo "# $description" + echo "#" + echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker" + echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien" + echo "#" + echo "# Latest version:" + echo "# - With third-party trackers: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt" + echo "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt" + echo "#" + echo "# Generation date: $(date -Isec)" + echo "# Generation version: eulaurarien $(git describe --tags)" + echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" + echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" + echo "# Number of trackers identification rules : $(wc -l temp/all_rules.txt | cut -d' ' -f1)" + echo "# Number of tracker subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" + echo "# Number of first-party subdomains: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" + echo + cat "dist/$basename.txt" | while read host; + do + echo "0.0.0.0 $host" + done + ) > "dist/$basename-hosts.txt" +} + +generate_hosts "firstparty-trackers" "Also contains trackers used in third-party" +generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in conjuction with EasyPrivacy." diff --git a/rules/.gitignore b/rules/.gitignore new file mode 100644 index 0000000..d2df6a8 --- /dev/null +++ b/rules/.gitignore @@ -0,0 +1,2 @@ +*.custom.txt +*.cache.txt diff --git a/rules/first-party.txt b/rules/first-party.txt new file mode 100644 index 0000000..54379b9 --- /dev/null +++ b/rules/first-party.txt @@ -0,0 +1 @@ +||at-o.net^ diff --git a/temp/.gitignore b/temp/.gitignore index b31be08..bc44c53 100644 --- a/temp/.gitignore +++ b/temp/.gitignore @@ -1 +1,2 @@ *.list +*.txt