Improved DNS resolution performance

Also includes various fixes.
Also adds some temporary debugging code; make sure to remove it later.
newworkflow_parseropti
Geoffrey Frogeye 2019-12-03 15:35:21 +01:00
parent c609b90390
commit 0159c6037c
Signed by: geoffrey
GPG Key ID: D8A7ECA00A8CD3DD
5 changed files with 34 additions and 10 deletions

View File

@ -10,10 +10,12 @@ function dl() {
} }
# Get rules # Get rules
echo "Retrieving rules..." > /dev/stderr
dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt
dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt
# Get a list of nameservers # Get a list of nameservers
echo "Retrieving nameservers..." > /dev/stderr
rm -f nameservers rm -f nameservers
touch nameservers touch nameservers
[ -f nameservers.head ] && cat nameservers.head >> nameservers [ -f nameservers.head ] && cat nameservers.head >> nameservers
@ -22,6 +24,7 @@ sort -R nameservers.temp >> nameservers
rm nameservers.temp rm nameservers.temp
# Get top 1M subdomains # Get top 1M subdomains
echo "Retrieving subdomains..." > /dev/stderr
dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip
unzip top-1m.csv.zip unzip top-1m.csv.zip
sed 's|^[0-9]\+,||' top-1m.csv > temp/cisco-umbrella_popularity.fresh.list sed 's|^[0-9]\+,||' top-1m.csv > temp/cisco-umbrella_popularity.fresh.list

View File

@ -7,14 +7,18 @@ then
fi fi
# Gather all the rules for filtering # Gather all the rules for filtering
echo "Compiling rules..." > /dev/stderr
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt
./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list ./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
cat rules/*.list | sort -u > temp/all_rules.list cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules.list
# Filter out the subdomains not pointing to a first-party tracker # Filter out the subdomains not pointing to a first-party tracker
echo "Filtering tracking domains..." > /dev/stderr
./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list ./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
echo "Filtering first-party only tracking domains..." > /dev/stderr
./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list ./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt
@ -35,7 +39,7 @@ function generate_hosts {
echo "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt" echo "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
echo "#" echo "#"
echo "# Generation date: $(date -Isec)" echo "# Generation date: $(date -Isec)"
echo "# Generation version: eulaurarien $(git describe --tags)" echo "# Generation software: eulaurarien $(git describe --tags)"
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of trackers identification rules : $(wc -l temp/all_rules.list | cut -d' ' -f1)" echo "# Number of trackers identification rules : $(wc -l temp/all_rules.list | cut -d' ' -f1)"

View File

@ -14,26 +14,26 @@ import threading
import typing import typing
import csv import csv
import adblockparser
import coloredlogs import coloredlogs
import dns.exception import dns.exception
import dns.resolver import dns.resolver
import progressbar import progressbar
DNS_TIMEOUT = 10.0 DNS_TIMEOUT = 5.0
NUMBER_THREADS = 64 NUMBER_THREADS = 512
NUMBER_TRIES = 5 NUMBER_TRIES = 5
# TODO All the domains don't get treated, # TODO All the domains don't get treated,
# so it leaves with 4-5 subdomains not resolved # so it leaves with 4-5 subdomains not resolved
glob = None
class Worker(threading.Thread): class Worker(threading.Thread):
""" """
Worker process for a DNS resolver. Worker process for a DNS resolver.
Will resolve DNS to match first-party subdomains. Will resolve DNS to match first-party subdomains.
""" """
OPTIONS = {"third-party": True}
def change_nameserver(self) -> None: def change_nameserver(self) -> None:
""" """
@ -45,7 +45,7 @@ class Worker(threading.Thread):
server = self.orchestrator.nameservers_queue.get(block=False) server = self.orchestrator.nameservers_queue.get(block=False)
except queue.Empty: except queue.Empty:
self.orchestrator.refill_nameservers_queue() self.orchestrator.refill_nameservers_queue()
self.log.debug("Using nameserver: %s", server) self.log.info("Using nameserver: %s", server)
self.resolver.nameservers = [server] self.resolver.nameservers = [server]
def __init__(self, def __init__(self,
@ -70,6 +70,7 @@ class Worker(threading.Thread):
Returns None if the nameserver was unable to satisfy the request. Returns None if the nameserver was unable to satisfy the request.
Returns [] if the requests points to nothing. Returns [] if the requests points to nothing.
""" """
self.log.debug("Querying %s", subdomain)
try: try:
query = self.resolver.query(subdomain, 'A', lifetime=DNS_TIMEOUT) query = self.resolver.query(subdomain, 'A', lifetime=DNS_TIMEOUT)
except dns.resolver.NXDOMAIN: except dns.resolver.NXDOMAIN:
@ -112,7 +113,10 @@ class Worker(threading.Thread):
for _ in range(NUMBER_TRIES): for _ in range(NUMBER_TRIES):
resolved = self.resolve_subdomain(subdomain) resolved = self.resolve_subdomain(subdomain)
if resolved is not None: # Retry with another nameserver if error
if resolved is None:
self.change_nameserver()
else:
break break
# If it wasn't found after multiple tries # If it wasn't found after multiple tries
@ -121,6 +125,7 @@ class Worker(threading.Thread):
resolved = [] resolved = []
resolved.insert(0, subdomain) resolved.insert(0, subdomain)
assert isinstance(resolved, list)
self.orchestrator.results_queue.put(resolved) self.orchestrator.results_queue.put(resolved)
self.orchestrator.results_queue.put(None) self.orchestrator.results_queue.put(None)
@ -267,7 +272,9 @@ def main() -> None:
writer = csv.writer(args.output) writer = csv.writer(args.output)
progress.start() progress.start()
for resolved in Orchestrator(iterator, servers).run(): global glob
glob = Orchestrator(iterator, servers)
for resolved in glob.run():
progress.update(progress.value + 1) progress.update(progress.value + 1)
writer.writerow(resolved) writer.writerow(resolved)
progress.finish() progress.finish()

View File

@ -1,7 +1,10 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# Resolve the CNAME chain of all the known subdomains for later analysis # Resolve the CNAME chain of all the known subdomains for later analysis
echo "Compiling subdomain lists..." > /dev/stderr
cat subdomains/*.list | sort -u > temp/all_subdomains.list cat subdomains/*.list | sort -u > temp/all_subdomains.list
./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv # Sort by last character to utilize the DNS server caching mechanism
rev temp/all_subdomains.list | sort | rev > temp/all_subdomains_reversort.list
./resolve_subdomains.py --input temp/all_subdomains_reversort.list --output temp/all_resolved.csv
sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv

View File

@ -1 +1,8 @@
# Xiti (AT Internet)
ati-host.net
at-o.net at-o.net
# NP6
bp01.net
# Criteo
dnsdelegation.io
storetail.io