Improved DNS resolution performance
Also includes various fixes, plus some debugging code that should be removed later.
This commit is contained in:
parent
c609b90390
commit
0159c6037c
|
@ -10,10 +10,12 @@ function dl() {
|
||||||
}
|
}
|
||||||
|
|
||||||
# Get rules
|
# Get rules
|
||||||
|
echo "Retrieving rules..." > /dev/stderr
|
||||||
dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt
|
dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt
|
||||||
dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt
|
dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt
|
||||||
|
|
||||||
# Get a list of nameservers
|
# Get a list of nameservers
|
||||||
|
echo "Retrieving nameservers..." > /dev/stderr
|
||||||
rm -f nameservers
|
rm -f nameservers
|
||||||
touch nameservers
|
touch nameservers
|
||||||
[ -f nameservers.head ] && cat nameservers.head >> nameservers
|
[ -f nameservers.head ] && cat nameservers.head >> nameservers
|
||||||
|
@ -22,6 +24,7 @@ sort -R nameservers.temp >> nameservers
|
||||||
rm nameservers.temp
|
rm nameservers.temp
|
||||||
|
|
||||||
# Get top 1M subdomains
|
# Get top 1M subdomains
|
||||||
|
echo "Retrieving subdomains..." > /dev/stderr
|
||||||
dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip
|
dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip
|
||||||
unzip top-1m.csv.zip
|
unzip top-1m.csv.zip
|
||||||
sed 's|^[0-9]\+,||' top-1m.csv > temp/cisco-umbrella_popularity.fresh.list
|
sed 's|^[0-9]\+,||' top-1m.csv > temp/cisco-umbrella_popularity.fresh.list
|
||||||
|
|
|
@ -7,14 +7,18 @@ then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Gather all the rules for filtering
|
# Gather all the rules for filtering
|
||||||
|
echo "Compiling rules..." > /dev/stderr
|
||||||
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt
|
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt
|
||||||
./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list
|
./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list
|
||||||
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
|
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
|
||||||
cat rules/*.list | sort -u > temp/all_rules.list
|
cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules.list
|
||||||
|
|
||||||
# Filter out the subdomains not pointing to a first-party tracker
|
# Filter out the subdomains not pointing to a first-party tracker
|
||||||
|
echo "Filtering tracking domains..." > /dev/stderr
|
||||||
./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
|
./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
|
||||||
sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
|
sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
|
||||||
|
|
||||||
|
echo "Filtering first-party only tracking domains..." > /dev/stderr
|
||||||
./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
|
./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
|
||||||
sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt
|
sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt
|
||||||
|
|
||||||
|
@ -35,7 +39,7 @@ function generate_hosts {
|
||||||
echo "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
|
echo "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
|
||||||
echo "#"
|
echo "#"
|
||||||
echo "# Generation date: $(date -Isec)"
|
echo "# Generation date: $(date -Isec)"
|
||||||
echo "# Generation version: eulaurarien $(git describe --tags)"
|
echo "# Generation software: eulaurarien $(git describe --tags)"
|
||||||
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
|
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
|
||||||
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
|
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
|
||||||
echo "# Number of trackers identification rules : $(wc -l temp/all_rules.list | cut -d' ' -f1)"
|
echo "# Number of trackers identification rules : $(wc -l temp/all_rules.list | cut -d' ' -f1)"
|
||||||
|
|
|
@ -14,26 +14,26 @@ import threading
|
||||||
import typing
|
import typing
|
||||||
import csv
|
import csv
|
||||||
|
|
||||||
import adblockparser
|
|
||||||
import coloredlogs
|
import coloredlogs
|
||||||
import dns.exception
|
import dns.exception
|
||||||
import dns.resolver
|
import dns.resolver
|
||||||
import progressbar
|
import progressbar
|
||||||
|
|
||||||
DNS_TIMEOUT = 10.0
|
DNS_TIMEOUT = 5.0
|
||||||
NUMBER_THREADS = 64
|
NUMBER_THREADS = 512
|
||||||
NUMBER_TRIES = 5
|
NUMBER_TRIES = 5
|
||||||
|
|
||||||
# TODO All the domains don't get treated,
|
# TODO All the domains don't get treated,
|
||||||
# so it leaves with 4-5 subdomains not resolved
|
# so it leaves with 4-5 subdomains not resolved
|
||||||
|
|
||||||
|
glob = None
|
||||||
|
|
||||||
|
|
||||||
class Worker(threading.Thread):
|
class Worker(threading.Thread):
|
||||||
"""
|
"""
|
||||||
Worker process for a DNS resolver.
|
Worker process for a DNS resolver.
|
||||||
Will resolve DNS to match first-party subdomains.
|
Will resolve DNS to match first-party subdomains.
|
||||||
"""
|
"""
|
||||||
OPTIONS = {"third-party": True}
|
|
||||||
|
|
||||||
def change_nameserver(self) -> None:
|
def change_nameserver(self) -> None:
|
||||||
"""
|
"""
|
||||||
|
@ -45,7 +45,7 @@ class Worker(threading.Thread):
|
||||||
server = self.orchestrator.nameservers_queue.get(block=False)
|
server = self.orchestrator.nameservers_queue.get(block=False)
|
||||||
except queue.Empty:
|
except queue.Empty:
|
||||||
self.orchestrator.refill_nameservers_queue()
|
self.orchestrator.refill_nameservers_queue()
|
||||||
self.log.debug("Using nameserver: %s", server)
|
self.log.info("Using nameserver: %s", server)
|
||||||
self.resolver.nameservers = [server]
|
self.resolver.nameservers = [server]
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
|
@ -70,6 +70,7 @@ class Worker(threading.Thread):
|
||||||
Returns None if the nameserver was unable to satisfy the request.
|
Returns None if the nameserver was unable to satisfy the request.
|
||||||
Returns [] if the requests points to nothing.
|
Returns [] if the requests points to nothing.
|
||||||
"""
|
"""
|
||||||
|
self.log.debug("Querying %s", subdomain)
|
||||||
try:
|
try:
|
||||||
query = self.resolver.query(subdomain, 'A', lifetime=DNS_TIMEOUT)
|
query = self.resolver.query(subdomain, 'A', lifetime=DNS_TIMEOUT)
|
||||||
except dns.resolver.NXDOMAIN:
|
except dns.resolver.NXDOMAIN:
|
||||||
|
@ -112,7 +113,10 @@ class Worker(threading.Thread):
|
||||||
|
|
||||||
for _ in range(NUMBER_TRIES):
|
for _ in range(NUMBER_TRIES):
|
||||||
resolved = self.resolve_subdomain(subdomain)
|
resolved = self.resolve_subdomain(subdomain)
|
||||||
if resolved is not None:
|
# Retry with another nameserver if error
|
||||||
|
if resolved is None:
|
||||||
|
self.change_nameserver()
|
||||||
|
else:
|
||||||
break
|
break
|
||||||
|
|
||||||
# If it wasn't found after multiple tries
|
# If it wasn't found after multiple tries
|
||||||
|
@ -121,6 +125,7 @@ class Worker(threading.Thread):
|
||||||
resolved = []
|
resolved = []
|
||||||
|
|
||||||
resolved.insert(0, subdomain)
|
resolved.insert(0, subdomain)
|
||||||
|
assert isinstance(resolved, list)
|
||||||
self.orchestrator.results_queue.put(resolved)
|
self.orchestrator.results_queue.put(resolved)
|
||||||
|
|
||||||
self.orchestrator.results_queue.put(None)
|
self.orchestrator.results_queue.put(None)
|
||||||
|
@ -267,7 +272,9 @@ def main() -> None:
|
||||||
writer = csv.writer(args.output)
|
writer = csv.writer(args.output)
|
||||||
|
|
||||||
progress.start()
|
progress.start()
|
||||||
for resolved in Orchestrator(iterator, servers).run():
|
global glob
|
||||||
|
glob = Orchestrator(iterator, servers)
|
||||||
|
for resolved in glob.run():
|
||||||
progress.update(progress.value + 1)
|
progress.update(progress.value + 1)
|
||||||
writer.writerow(resolved)
|
writer.writerow(resolved)
|
||||||
progress.finish()
|
progress.finish()
|
||||||
|
|
|
@ -1,7 +1,10 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
# Resolve the CNAME chain of all the known subdomains for later analysis
|
# Resolve the CNAME chain of all the known subdomains for later analysis
|
||||||
|
echo "Compiling subdomain lists..." > /dev/stderr
|
||||||
cat subdomains/*.list | sort -u > temp/all_subdomains.list
|
cat subdomains/*.list | sort -u > temp/all_subdomains.list
|
||||||
./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv
|
# Sort by last character to utilize the DNS server caching mechanism
|
||||||
|
rev temp/all_subdomains.list | sort | rev > temp/all_subdomains_reversort.list
|
||||||
|
./resolve_subdomains.py --input temp/all_subdomains_reversort.list --output temp/all_resolved.csv
|
||||||
sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv
|
sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv
|
||||||
|
|
||||||
|
|
|
@ -1 +1,8 @@
|
||||||
|
# Xiti (AT Internet)
|
||||||
|
ati-host.net
|
||||||
at-o.net
|
at-o.net
|
||||||
|
# NP6
|
||||||
|
bp01.net
|
||||||
|
# Criteo
|
||||||
|
dnsdelegation.io
|
||||||
|
storetail.io
|
||||||
|
|
Loading…
Reference in a new issue