Compare commits

...

3 commits

Author SHA1 Message Date
Geoffrey Frogeye 08a8eaaada Use threads not subprocesses
You dumbo
2019-11-14 12:57:06 +01:00
Geoffrey Frogeye 32377229db Retry failed requests 2019-11-14 11:35:05 +01:00
Geoffrey Frogeye 04fe454d99 Automatically get top 1M subdomains 2019-11-14 11:23:59 +01:00
4 changed files with 35 additions and 21 deletions

View file

@ -2,6 +2,7 @@
# Main script for eulaurarien
./fetch_resources.sh
./collect_subdomains.sh
./filter_subdomains.sh

16
fetch_resources.sh Executable file
View file

@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Fetch the external resources used by the other scripts:
#   - nameservers: the optional local nameservers.head followed by the
#     shuffled list published by public-dns.info
#   - subdomains/cisco-umbrella_popularity.cache.list: the top-1m.csv list
#     with its leading "<number>," column stripped

# Get a list of nameservers
# Start from an empty file; the local head list (if any) goes first
: > nameservers
if [ -f nameservers.head ]
then
    cat nameservers.head >> nameservers
fi
# -f: on an HTTP error, output nothing instead of appending the error page
curl -f https://public-dns.info/nameservers.txt | sort -R >> nameservers

# Get top 1M subdomains
# wget -O / unzip -o: overwrite leftovers of an interrupted run instead of
# saving to top-1m.csv.zip.1 or prompting before replacing top-1m.csv.
# The list is only rewritten when both download and extraction succeeded,
# so a network failure does not truncate a previously fetched list.
if wget -O top-1m.csv.zip http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip \
    && unzip -o top-1m.csv.zip
then
    mkdir -p subdomains
    sed 's|^[0-9]\+,||' top-1m.csv > subdomains/cisco-umbrella_popularity.cache.list
fi
rm -f top-1m.csv top-1m.csv.zip

View file

@ -7,7 +7,8 @@ the ones resolving to a first-party tracker.
"""
import logging
import multiprocessing
import threading
import queue
import os
import re
import sys
@ -21,19 +22,18 @@ import progressbar
import regexes
DNS_TIMEOUT = 5.0
MAX_NAMESERVERS = 512
# TODO Retry failed requests
# TODO Retrying does not work because the sentinel gets through first :/
class DnsResolver(multiprocessing.Process):
class DnsResolver(threading.Thread):
"""
Worker thread for a DNS resolver.
Will resolve DNS to match first-party subdomains.
"""
def __init__(self,
in_queue: multiprocessing.Queue,
out_queue: multiprocessing.Queue,
in_queue: queue.Queue,
out_queue: queue.Queue,
server: str):
super(DnsResolver, self).__init__()
self.log = logging.getLogger(server)
@ -44,7 +44,7 @@ class DnsResolver(multiprocessing.Process):
self.resolver = dns.resolver.Resolver()
self.resolver.nameservers = [server]
def is_subdomain_matching(self, subdomain: str) -> bool:
def is_subdomain_matching(self, subdomain: str) -> typing.Optional[bool]:
"""
Indicates if the subdomain redirects to a first-party tracker.
"""
@ -61,10 +61,10 @@ class DnsResolver(multiprocessing.Process):
return False
except dns.resolver.NoNameservers:
self.log.warning("All nameservers broken for %s", subdomain)
return False
return None
except dns.exception.Timeout:
self.log.warning("Timeout for %s", subdomain)
return False
return None
except dns.name.EmptyLabel:
self.log.warning("Empty label for %s", subdomain)
return False
@ -78,6 +78,13 @@ class DnsResolver(multiprocessing.Process):
self.log.info("Started")
for subdomain in iter(self.in_queue.get, None):
matching = self.is_subdomain_matching(subdomain)
# If issue, retry
if matching is None:
# matching = False
self.in_queue.put(subdomain)
continue
result = (subdomain, matching)
# self.log.debug("%s", result)
self.out_queue.put(result)
@ -88,15 +95,14 @@ class DnsResolver(multiprocessing.Process):
def get_matching_subdomains(subdomains: typing.Iterable[str],
nameservers: typing.List[str] = None,
) -> typing.Iterable[typing.Tuple[str, bool]]:
subdomains_queue: multiprocessing.Queue = multiprocessing.Queue()
results_queue: multiprocessing.Queue = multiprocessing.Queue()
subdomains_queue: queue.Queue = queue.Queue()
results_queue: queue.Queue = queue.Queue()
"""
Orchestrator of the different DnsResolver threads.
"""
# Use internal resolver by default
servers = nameservers or dns.resolver.Resolver().nameservers
servers = servers[:MAX_NAMESERVERS]
# Create workers
for server in servers:
@ -110,14 +116,12 @@ def get_matching_subdomains(subdomains: typing.Iterable[str],
# sentinel = None ~= EOF
for _ in servers:
subdomains_queue.put(None)
subdomains_queue.close()
# Wait for one sentinel per worker
# In the meantime output results
for _ in servers:
for result in iter(results_queue.get, None):
yield result
results_queue.close()
if __name__ == '__main__':

View file

@ -1,12 +1,5 @@
#!/usr/bin/env bash
# Get a list of nameservers
rm -f nameservers
touch nameservers
[ -f nameservers.head ] && cat nameservers.head >> nameservers
curl https://public-dns.info/nameservers.txt | sort -R >> nameservers
# Filter out the subdomains not pointing to a first-party tracker
cat subdomains/*.list | sort -u > temp/all_subdomains.list