Can now use AdBlock lists for tracking matching

It's not very performant by itself, especially since pyre2 isn't
maintained nor really compilable/installable anymore.

The performance seems to have decreased from 200 req/s to 0.2 req/s when
using 512 threads, and to 80 req/s using 64 threads.
This might or might not be related, as the CPU doesn't seem to be the
bottleneck.

I will probably add support for host-based rules, matching the
subdomains of such hosts (as for now there doesn't seem to be any other
pattern for first-party trackers than subdomains, and this would be a
very broad performance / compatibility with existing lists improvement),
and convert the AdBlock lists to this format, only keeping domains-only
rules.
This commit is contained in:
Geoffrey Frogeye 2019-11-15 08:57:31 +01:00
parent 87bb24c511
commit 7d01d016a5
8 changed files with 108 additions and 49 deletions

View file

@ -44,6 +44,7 @@ Just to build the list, you can find an already-built list in the releases.
- [Python 3.4+](https://www.python.org/) - [Python 3.4+](https://www.python.org/)
- [progressbar2](https://pypi.org/project/progressbar2/) - [progressbar2](https://pypi.org/project/progressbar2/)
- dnspython - dnspython
- [A Python wrapper for re2](https://pypi.org/project/google-re2/) (optional, just speeds things up)
(if you don't want to collect the subdomains, you can skip the following) (if you don't want to collect the subdomains, you can skip the following)

View file

@ -1,5 +1,8 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# Get rules
curl https://easylist.to/easylist/easyprivacy.txt > rules/easyprivacy.cache.txt
# Get a list of nameservers # Get a list of nameservers
rm -f nameservers rm -f nameservers

View file

@ -7,39 +7,62 @@ filter out the ones explicitely matching a regex.
It should be already handled by the ad blocker. It should be already handled by the ad blocker.
""" """
import logging import argparse
import multiprocessing
import re
import sys import sys
import typing import progressbar
import regexes import adblockparser
OPTIONS = {"third-party": True}
def explicitely_match(subdomain: str) -> bool: def explicitely_match(subdomain: str) -> bool:
for regex in regexes.REGEXES: url = f"https://{subdomain}/"
if re.match(regex, subdomain + '.'): return rules.should_block(url, OPTIONS)
return True
return False
if __name__ == '__main__': if __name__ == '__main__':
# Parsing arguments # Parsing arguments
assert len(sys.argv) <= 2 parser = argparse.ArgumentParser(
filename = None description="Filter first-party trackers from a list of subdomains")
parser.add_argument(
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
help="Input file with one subdomain per line")
parser.add_argument(
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
help="Outptut file with one tracking subdomain per line")
parser.add_argument(
'-r', '--rules', type=argparse.FileType('r'), default='rules',
help="Rules file")
args = parser.parse_args()
if len(sys.argv) == 2 and sys.argv[1] != '-': # Reading rules
filename = sys.argv[1] rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules)
textio = open(filename)
else: # Progress bar
textio = sys.stdin widgets = [
progressbar.Percentage(),
' ', progressbar.SimpleProgress(),
' ', progressbar.Bar(),
' ', progressbar.Timer(),
' ', progressbar.AdaptiveTransferSpeed(unit='req'),
' ', progressbar.AdaptiveETA(),
]
progress = progressbar.ProgressBar(widgets=widgets)
if args.input.seekable():
progress.max_value = len(args.input.readlines())
args.input.seek(0)
# Cleaning input # Cleaning input
iterator = iter(textio) iterator = iter(args.input)
iterator = map(str.strip, iterator) iterator = map(str.strip, iterator)
iterator = filter(None, iterator) iterator = filter(None, iterator)
# Filtering
progress.start()
for subdomain in iterator: for subdomain in iterator:
progress.update(progress.value + 1)
if not explicitely_match(subdomain): if not explicitely_match(subdomain):
print(subdomain) print(subdomain, file=args.output)
progress.finish()

View file

@ -14,6 +14,7 @@ import sys
import threading import threading
import typing import typing
import adblockparser
import coloredlogs import coloredlogs
import dns.exception import dns.exception
import dns.resolver import dns.resolver
@ -21,9 +22,9 @@ import progressbar
import regexes import regexes
DNS_TIMEOUT = 60.0 DNS_TIMEOUT = 10.0
NUMBER_THREADS = 512 NUMBER_THREADS = 64
NUMBER_TRIES = 10 NUMBER_TRIES = 5
class Worker(threading.Thread): class Worker(threading.Thread):
@ -31,6 +32,7 @@ class Worker(threading.Thread):
Worker process for a DNS resolver. Worker process for a DNS resolver.
Will resolve DNS to match first-party subdomains. Will resolve DNS to match first-party subdomains.
""" """
OPTIONS = {"third-party": True}
def change_nameserver(self) -> None: def change_nameserver(self) -> None:
""" """
@ -85,10 +87,12 @@ class Worker(threading.Thread):
self.log.warning("Empty label for %s", subdomain) self.log.warning("Empty label for %s", subdomain)
return None return None
canonical = query.canonical_name.to_text() canonical = query.canonical_name.to_text()
for regex in regexes.REGEXES: # for regex in regexes.REGEXES:
if re.match(regex, canonical): # if re.match(regex, canonical):
return True # return True
return False # return False
url = f"https://{canonical[:-1]}/"
return self.orchestrator.rules.should_block(url, Worker.OPTIONS)
def run(self) -> None: def run(self) -> None:
self.log.info("Started") self.log.info("Started")
@ -128,7 +132,9 @@ class Orchestrator():
self.log.info("Refilled nameserver queue") self.log.info("Refilled nameserver queue")
def __init__(self, subdomains: typing.Iterable[str], def __init__(self, subdomains: typing.Iterable[str],
nameservers: typing.List[str] = None): rules: typing.Iterable[str],
nameservers: typing.List[str] = None,
):
self.log = logging.getLogger('orchestrator') self.log = logging.getLogger('orchestrator')
self.subdomains = subdomains self.subdomains = subdomains
@ -140,6 +146,9 @@ class Orchestrator():
self.results_queue: queue.Queue = queue.Queue() self.results_queue: queue.Queue = queue.Queue()
self.nameservers_queue: queue.Queue = queue.Queue() self.nameservers_queue: queue.Queue = queue.Queue()
# Rules
self.rules = adblockparser.AdblockRules(rules)
self.refill_nameservers_queue() self.refill_nameservers_queue()
def fill_subdomain_queue(self) -> None: def fill_subdomain_queue(self) -> None:
@ -210,6 +219,9 @@ def main() -> None:
parser.add_argument( parser.add_argument(
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout, '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
help="Outptut file with one tracking subdomain per line") help="Outptut file with one tracking subdomain per line")
parser.add_argument(
'-r', '--rules', type=argparse.FileType('r'), default='rules',
help="Rules file")
# parser.add_argument( # parser.add_argument(
# '-n', '--nameserver', type=argparse.FileType('r'), # '-n', '--nameserver', type=argparse.FileType('r'),
# default='nameservers', help="File with one nameserver per line") # default='nameservers', help="File with one nameserver per line")
@ -228,6 +240,9 @@ def main() -> None:
' ', progressbar.AdaptiveETA(), ' ', progressbar.AdaptiveETA(),
] ]
progress = progressbar.ProgressBar(widgets=widgets) progress = progressbar.ProgressBar(widgets=widgets)
if args.input.seekable():
progress.max_value = len(args.input.readlines())
args.input.seek(0)
# Cleaning input # Cleaning input
iterator = iter(args.input) iterator = iter(args.input)
@ -241,7 +256,7 @@ def main() -> None:
servers = list(filter(None, map(str.strip, servers))) servers = list(filter(None, map(str.strip, servers)))
progress.start() progress.start()
for subdomain, matching in Orchestrator(iterator, servers).run(): for subdomain, matching in Orchestrator(iterator, args.rules, servers).run():
progress.update(progress.value + 1) progress.update(progress.value + 1)
if matching: if matching:
print(subdomain, file=args.output) print(subdomain, file=args.output)

View file

@ -1,30 +1,43 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# Filter out the subdomains not pointing to a first-party tracker # Filter out the subdomains not pointing to a first-party tracker
cat subdomains/*.list | sort -u > temp/all_subdomains.list cat subdomains/*.list | sort -u > temp/all_subdomains.list
./filter_subdomains.py --input temp/all_subdomains.list --output temp/all_toblock.list cat rules/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules.txt
./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_subdomains.list --output temp/all_toblock.list
sort -u temp/all_toblock.list > dist/firstparty-trackers.txt sort -u temp/all_toblock.list > dist/firstparty-trackers.txt
./filter_out_explicit.py --rules temp/all_rules.txt --input dist/firstparty-trackers.txt --output dist/firstparty-only-trackers.txt
# Format the blocklist so it can be used as a hostlist # Format the blocklist so it can be used as a hostlist
( function generate_hosts {
basename="$1"
description="$2"
(
echo "# First-party trackers host list" echo "# First-party trackers host list"
echo "# $description"
echo "#" echo "#"
echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker" echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien" echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
echo "# Latest version of this list: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt" echo "#"
echo "# Latest version:"
echo "# - With third-party trackers: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
echo "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
echo "#" echo "#"
echo "# Generation date: $(date -Isec)" echo "# Generation date: $(date -Isec)"
echo "# Generation version: eulaurarien $(git describe --tags)" echo "# Generation version: eulaurarien $(git describe --tags)"
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')" echo "# Number of trackers identification rules : $(wc -l temp/all_rules.txt | cut -d' ' -f1)"
echo "# Number of blocked subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" echo "# Number of tracker subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# Number of first-party subdomains: $(./filter_out_explicit.py dist/firstparty-trackers.txt | wc -l)" echo "# Number of first-party subdomains: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
echo echo
cat dist/firstparty-trackers.txt | while read host; cat "dist/$basename.txt" | while read host;
do do
echo "0.0.0.0 $host" echo "0.0.0.0 $host"
done done
) > dist/firstparty-trackers-hosts.txt ) > "dist/$basename-hosts.txt"
}
generate_hosts "firstparty-trackers" "Also contains trackers used in third-party"
generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in conjuction with EasyPrivacy."

2
rules/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
*.custom.txt
*.cache.txt

1
rules/first-party.txt Normal file
View file

@ -0,0 +1 @@
||at-o.net^

1
temp/.gitignore vendored
View file

@ -1 +1,2 @@
*.list *.list
*.txt