Can now use AdBlock lists for tracking matching
It's not very performant by itself, especially since pyre2 isn't maintained nor really compilable/installable anymore. The performance seems to have decreased from 200 req/s to 0.2 req/s when using 512 threads, and to 80 req/s when using 64 threads. This might or might not be related, as the CPU doesn't seem to be the bottleneck. I will probably add support for host-based rules, matching the subdomains of such hosts (as for now there doesn't seem to be any other pattern for first-party trackers than subdomains, and this would be a very broad improvement in performance and in compatibility with existing lists), and convert the AdBlock lists to this format, only keeping domain-only rules.
This commit is contained in:
parent
87bb24c511
commit
7d01d016a5
|
@ -44,6 +44,7 @@ Just to build the list, you can find an already-built list in the releases.
|
|||
- [Python 3.4+](https://www.python.org/)
|
||||
- [progressbar2](https://pypi.org/project/progressbar2/)
|
||||
- dnspython
|
||||
- [A Python wrapper for re2](https://pypi.org/project/google-re2/) (optional, just speeds things up)
|
||||
|
||||
(if you don't want to collect the subdomains, you can skip the following)
|
||||
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Get rules
|
||||
curl https://easylist.to/easylist/easyprivacy.txt > rules/easyprivacy.cache.txt
|
||||
|
||||
# Get a list of nameservers
|
||||
|
||||
rm -f nameservers
|
||||
|
|
|
@ -7,39 +7,62 @@ filter out the ones explicitely matching a regex.
|
|||
It should be already handled by the ad blocker.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import multiprocessing
|
||||
import re
|
||||
import argparse
|
||||
import sys
|
||||
import typing
|
||||
import progressbar
|
||||
|
||||
import regexes
|
||||
import adblockparser
|
||||
|
||||
OPTIONS = {"third-party": True}
|
||||
|
||||
|
||||
def explicitely_match(subdomain: str) -> bool:
|
||||
for regex in regexes.REGEXES:
|
||||
if re.match(regex, subdomain + '.'):
|
||||
return True
|
||||
return False
|
||||
url = f"https://{subdomain}/"
|
||||
return rules.should_block(url, OPTIONS)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
# Parsing arguments
|
||||
assert len(sys.argv) <= 2
|
||||
filename = None
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Filter first-party trackers from a list of subdomains")
|
||||
parser.add_argument(
|
||||
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
||||
help="Input file with one subdomain per line")
|
||||
parser.add_argument(
|
||||
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
|
||||
help="Outptut file with one tracking subdomain per line")
|
||||
parser.add_argument(
|
||||
'-r', '--rules', type=argparse.FileType('r'), default='rules',
|
||||
help="Rules file")
|
||||
args = parser.parse_args()
|
||||
|
||||
if len(sys.argv) == 2 and sys.argv[1] != '-':
|
||||
filename = sys.argv[1]
|
||||
textio = open(filename)
|
||||
else:
|
||||
textio = sys.stdin
|
||||
# Reading rules
|
||||
rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules)
|
||||
|
||||
# Progress bar
|
||||
widgets = [
|
||||
progressbar.Percentage(),
|
||||
' ', progressbar.SimpleProgress(),
|
||||
' ', progressbar.Bar(),
|
||||
' ', progressbar.Timer(),
|
||||
' ', progressbar.AdaptiveTransferSpeed(unit='req'),
|
||||
' ', progressbar.AdaptiveETA(),
|
||||
]
|
||||
progress = progressbar.ProgressBar(widgets=widgets)
|
||||
if args.input.seekable():
|
||||
progress.max_value = len(args.input.readlines())
|
||||
args.input.seek(0)
|
||||
|
||||
# Cleaning input
|
||||
iterator = iter(textio)
|
||||
iterator = iter(args.input)
|
||||
iterator = map(str.strip, iterator)
|
||||
iterator = filter(None, iterator)
|
||||
|
||||
# Filtering
|
||||
progress.start()
|
||||
for subdomain in iterator:
|
||||
progress.update(progress.value + 1)
|
||||
if not explicitely_match(subdomain):
|
||||
print(subdomain)
|
||||
print(subdomain, file=args.output)
|
||||
progress.finish()
|
||||
|
|
|
@ -14,6 +14,7 @@ import sys
|
|||
import threading
|
||||
import typing
|
||||
|
||||
import adblockparser
|
||||
import coloredlogs
|
||||
import dns.exception
|
||||
import dns.resolver
|
||||
|
@ -21,9 +22,9 @@ import progressbar
|
|||
|
||||
import regexes
|
||||
|
||||
DNS_TIMEOUT = 60.0
|
||||
NUMBER_THREADS = 512
|
||||
NUMBER_TRIES = 10
|
||||
DNS_TIMEOUT = 10.0
|
||||
NUMBER_THREADS = 64
|
||||
NUMBER_TRIES = 5
|
||||
|
||||
|
||||
class Worker(threading.Thread):
|
||||
|
@ -31,6 +32,7 @@ class Worker(threading.Thread):
|
|||
Worker process for a DNS resolver.
|
||||
Will resolve DNS to match first-party subdomains.
|
||||
"""
|
||||
OPTIONS = {"third-party": True}
|
||||
|
||||
def change_nameserver(self) -> None:
|
||||
"""
|
||||
|
@ -85,10 +87,12 @@ class Worker(threading.Thread):
|
|||
self.log.warning("Empty label for %s", subdomain)
|
||||
return None
|
||||
canonical = query.canonical_name.to_text()
|
||||
for regex in regexes.REGEXES:
|
||||
if re.match(regex, canonical):
|
||||
return True
|
||||
return False
|
||||
# for regex in regexes.REGEXES:
|
||||
# if re.match(regex, canonical):
|
||||
# return True
|
||||
# return False
|
||||
url = f"https://{canonical[:-1]}/"
|
||||
return self.orchestrator.rules.should_block(url, Worker.OPTIONS)
|
||||
|
||||
def run(self) -> None:
|
||||
self.log.info("Started")
|
||||
|
@ -128,7 +132,9 @@ class Orchestrator():
|
|||
self.log.info("Refilled nameserver queue")
|
||||
|
||||
def __init__(self, subdomains: typing.Iterable[str],
|
||||
nameservers: typing.List[str] = None):
|
||||
rules: typing.Iterable[str],
|
||||
nameservers: typing.List[str] = None,
|
||||
):
|
||||
self.log = logging.getLogger('orchestrator')
|
||||
self.subdomains = subdomains
|
||||
|
||||
|
@ -140,6 +146,9 @@ class Orchestrator():
|
|||
self.results_queue: queue.Queue = queue.Queue()
|
||||
self.nameservers_queue: queue.Queue = queue.Queue()
|
||||
|
||||
# Rules
|
||||
self.rules = adblockparser.AdblockRules(rules)
|
||||
|
||||
self.refill_nameservers_queue()
|
||||
|
||||
def fill_subdomain_queue(self) -> None:
|
||||
|
@ -210,6 +219,9 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
|
||||
help="Outptut file with one tracking subdomain per line")
|
||||
parser.add_argument(
|
||||
'-r', '--rules', type=argparse.FileType('r'), default='rules',
|
||||
help="Rules file")
|
||||
# parser.add_argument(
|
||||
# '-n', '--nameserver', type=argparse.FileType('r'),
|
||||
# default='nameservers', help="File with one nameserver per line")
|
||||
|
@ -228,6 +240,9 @@ def main() -> None:
|
|||
' ', progressbar.AdaptiveETA(),
|
||||
]
|
||||
progress = progressbar.ProgressBar(widgets=widgets)
|
||||
if args.input.seekable():
|
||||
progress.max_value = len(args.input.readlines())
|
||||
args.input.seek(0)
|
||||
|
||||
# Cleaning input
|
||||
iterator = iter(args.input)
|
||||
|
@ -241,7 +256,7 @@ def main() -> None:
|
|||
servers = list(filter(None, map(str.strip, servers)))
|
||||
|
||||
progress.start()
|
||||
for subdomain, matching in Orchestrator(iterator, servers).run():
|
||||
for subdomain, matching in Orchestrator(iterator, args.rules, servers).run():
|
||||
progress.update(progress.value + 1)
|
||||
if matching:
|
||||
print(subdomain, file=args.output)
|
||||
|
|
|
@ -1,30 +1,43 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Filter out the subdomains not pointing to a first-party tracker
|
||||
|
||||
cat subdomains/*.list | sort -u > temp/all_subdomains.list
|
||||
./filter_subdomains.py --input temp/all_subdomains.list --output temp/all_toblock.list
|
||||
cat rules/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules.txt
|
||||
./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_subdomains.list --output temp/all_toblock.list
|
||||
sort -u temp/all_toblock.list > dist/firstparty-trackers.txt
|
||||
./filter_out_explicit.py --rules temp/all_rules.txt --input dist/firstparty-trackers.txt --output dist/firstparty-only-trackers.txt
|
||||
|
||||
# Format the blocklist so it can be used as a hostlist
|
||||
|
||||
(
|
||||
function generate_hosts {
|
||||
basename="$1"
|
||||
description="$2"
|
||||
|
||||
(
|
||||
echo "# First-party trackers host list"
|
||||
echo "# $description"
|
||||
echo "#"
|
||||
echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
|
||||
echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
|
||||
echo "# Latest version of this list: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
|
||||
echo "#"
|
||||
echo "# Latest version:"
|
||||
echo "# - With third-party trackers: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
|
||||
echo "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
|
||||
echo "#"
|
||||
echo "# Generation date: $(date -Isec)"
|
||||
echo "# Generation version: eulaurarien $(git describe --tags)"
|
||||
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
|
||||
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
|
||||
echo "# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')"
|
||||
echo "# Number of blocked subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
|
||||
echo "# Number of first-party subdomains: $(./filter_out_explicit.py dist/firstparty-trackers.txt | wc -l)"
|
||||
echo "# Number of trackers identification rules : $(wc -l temp/all_rules.txt | cut -d' ' -f1)"
|
||||
echo "# Number of tracker subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
|
||||
echo "# Number of first-party subdomains: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
|
||||
echo
|
||||
cat dist/firstparty-trackers.txt | while read host;
|
||||
cat "dist/$basename.txt" | while read host;
|
||||
do
|
||||
echo "0.0.0.0 $host"
|
||||
done
|
||||
) > dist/firstparty-trackers-hosts.txt
|
||||
) > "dist/$basename-hosts.txt"
|
||||
}
|
||||
|
||||
generate_hosts "firstparty-trackers" "Also contains trackers used in third-party"
|
||||
generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in conjuction with EasyPrivacy."
|
||||
|
|
2
rules/.gitignore
vendored
Normal file
2
rules/.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
*.custom.txt
|
||||
*.cache.txt
|
1
rules/first-party.txt
Normal file
1
rules/first-party.txt
Normal file
|
@ -0,0 +1 @@
|
|||
||at-o.net^
|
1
temp/.gitignore
vendored
1
temp/.gitignore
vendored
|
@ -1 +1,2 @@
|
|||
*.list
|
||||
*.txt
|
||||
|
|
Loading…
Reference in a new issue