diff --git a/fetch_resources.sh b/fetch_resources.sh
index 5b023d5..29f8e32 100755
--- a/fetch_resources.sh
+++ b/fetch_resources.sh
@@ -10,10 +10,12 @@ function dl() {
 }
 
 # Get rules
+echo "Retrieving rules..." > /dev/stderr
 dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt
 dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt
 
 # Get a list of nameservers
+echo "Retrieving nameservers..." > /dev/stderr
 rm -f nameservers
 touch nameservers
 [ -f nameservers.head ] && cat nameservers.head >> nameservers
@@ -22,6 +24,7 @@ sort -R nameservers.temp >> nameservers
 rm nameservers.temp
 
 # Get top 1M subdomains
+echo "Retrieving subdomains..." > /dev/stderr
 dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip
 unzip top-1m.csv.zip
 sed 's|^[0-9]\+,||' top-1m.csv > temp/cisco-umbrella_popularity.fresh.list
diff --git a/filter_subdomains.sh b/filter_subdomains.sh
index 7986c2d..5f9c84e 100755
--- a/filter_subdomains.sh
+++ b/filter_subdomains.sh
@@ -7,14 +7,18 @@ then
 fi
 
 # Gather all the rules for filtering
+echo "Compiling rules..." > /dev/stderr
 cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt
 ./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list
 cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
-cat rules/*.list | sort -u > temp/all_rules.list
+cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules.list
 
 # Filter out the subdomains not pointing to a first-party tracker
+echo "Filtering tracking domains..." > /dev/stderr
 ./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
 sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
+
+echo "Filtering first-party only tracking domains..." > /dev/stderr
 ./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
 sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt
 
@@ -35,7 +39,7 @@ function generate_hosts {
     echo "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
     echo "#"
     echo "# Generation date: $(date -Isec)"
-    echo "# Generation version: eulaurarien $(git describe --tags)"
+    echo "# Generation software: eulaurarien $(git describe --tags)"
     echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
     echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
     echo "# Number of trackers identification rules : $(wc -l temp/all_rules.list | cut -d' ' -f1)"
diff --git a/resolve_subdomains.py b/resolve_subdomains.py
index a047f0d..ec10c47 100755
--- a/resolve_subdomains.py
+++ b/resolve_subdomains.py
@@ -14,26 +14,26 @@ import threading
 import typing
 import csv
 
-import adblockparser
 import coloredlogs
 import dns.exception
 import dns.resolver
 import progressbar
 
-DNS_TIMEOUT = 10.0
-NUMBER_THREADS = 64
+DNS_TIMEOUT = 5.0
+NUMBER_THREADS = 512
 NUMBER_TRIES = 5
 
 # TODO All the domains don't get treated,
 # so it leaves with 4-5 subdomains not resolved
 
+glob = None
+
 
 class Worker(threading.Thread):
     """
     Worker process for a DNS resolver.
     Will resolve DNS to match first-party subdomains.
     """
-    OPTIONS = {"third-party": True}
 
     def change_nameserver(self) -> None:
         """
@@ -45,7 +45,7 @@ class Worker(threading.Thread):
                 server = self.orchestrator.nameservers_queue.get(block=False)
             except queue.Empty:
                 self.orchestrator.refill_nameservers_queue()
-        self.log.debug("Using nameserver: %s", server)
+        self.log.info("Using nameserver: %s", server)
         self.resolver.nameservers = [server]
 
     def __init__(self,
@@ -70,6 +70,7 @@ class Worker(threading.Thread):
         Returns None if the nameserver was unable to satisfy the request.
         Returns [] if the requests points to nothing.
         """
+        self.log.debug("Querying %s", subdomain)
         try:
             query = self.resolver.query(subdomain, 'A', lifetime=DNS_TIMEOUT)
         except dns.resolver.NXDOMAIN:
@@ -112,7 +113,10 @@ class Worker(threading.Thread):
 
             for _ in range(NUMBER_TRIES):
                 resolved = self.resolve_subdomain(subdomain)
-                if resolved is not None:
+                # Retry with another nameserver if error
+                if resolved is None:
+                    self.change_nameserver()
+                else:
                     break
 
             # If it wasn't found after multiple tries
@@ -121,6 +125,7 @@ class Worker(threading.Thread):
                 resolved = []
 
             resolved.insert(0, subdomain)
+            assert isinstance(resolved, list)
             self.orchestrator.results_queue.put(resolved)
 
         self.orchestrator.results_queue.put(None)
@@ -267,7 +272,9 @@ def main() -> None:
     writer = csv.writer(args.output)
 
     progress.start()
-    for resolved in Orchestrator(iterator, servers).run():
+    global glob
+    glob = Orchestrator(iterator, servers)
+    for resolved in glob.run():
         progress.update(progress.value + 1)
         writer.writerow(resolved)
     progress.finish()
diff --git a/resolve_subdomains.sh b/resolve_subdomains.sh
index f4f7a4c..3145afd 100755
--- a/resolve_subdomains.sh
+++ b/resolve_subdomains.sh
@@ -1,7 +1,10 @@
 #!/usr/bin/env bash
 
 # Resolve the CNAME chain of all the known subdomains for later analysis
 
+echo "Compiling subdomain lists..." > /dev/stderr
 cat subdomains/*.list | sort -u > temp/all_subdomains.list
-./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv
+# Sort by last character to utilize the DNS server caching mechanism
+rev temp/all_subdomains.list | sort | rev > temp/all_subdomains_reversort.list
+./resolve_subdomains.py --input temp/all_subdomains_reversort.list --output temp/all_resolved.csv
 sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv
diff --git a/rules/first-party.list b/rules/first-party.list
index 96e615f..2159717 100644
--- a/rules/first-party.list
+++ b/rules/first-party.list
@@ -1 +1,8 @@
+# Xiti (AT Internet)
+ati-host.net
 at-o.net
+# NP6
+bp01.net
+# Criteo
+dnsdelegation.io
+storetail.io