From 8d94b80fd04540c544cf99a866c3c14c494db495 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?=
Date: Fri, 13 Dec 2019 13:38:23 +0100
Subject: [PATCH] Integrated DNS resolving into the workflow

Since the bigger datasets are only updated once a month, this might help
with quick updates.
---
 resolve_subdomains.py | 81 ++++++++++++++++---------------------------
 resolve_subdomains.sh | 10 +++---
 2 files changed, 34 insertions(+), 57 deletions(-)

diff --git a/resolve_subdomains.py b/resolve_subdomains.py
index ec10c47..b675b11 100755
--- a/resolve_subdomains.py
+++ b/resolve_subdomains.py
@@ -12,12 +12,11 @@ import queue
 import sys
 import threading
 import typing
-import csv
+import time
 
 import coloredlogs
 import dns.exception
 import dns.resolver
-import progressbar
 
 DNS_TIMEOUT = 5.0
 NUMBER_THREADS = 512
@@ -26,8 +25,6 @@ NUMBER_TRIES = 5
 # TODO All the domains don't get treated,
 # so it leaves with 4-5 subdomains not resolved
 
-glob = None
-
 
 class Worker(threading.Thread):
     """
@@ -59,9 +56,9 @@ class Worker(threading.Thread):
         self.change_nameserver()
 
     def resolve_subdomain(self, subdomain: str) -> typing.Optional[
-            typing.List[
-                str
-            ]
+        typing.List[
+            dns.rrset.RRset
+        ]
     ]:
         """
         Returns the resolution chain of the subdomain to an A record,
@@ -93,18 +90,7 @@
         except dns.name.EmptyLabel:
             self.log.warning("Empty label for %s", subdomain)
             return None
-        resolved = list()
-        last = len(query.response.answer) - 1
-        for a, answer in enumerate(query.response.answer):
-            if answer.rdtype == dns.rdatatype.CNAME:
-                assert a < last
-                resolved.append(answer.items[0].to_text()[:-1])
-            elif answer.rdtype == dns.rdatatype.A:
-                assert a == last
-                resolved.append(answer.items[0].address)
-            else:
-                assert False
-        return resolved
+        return query.response.answer
 
     def run(self) -> None:
         self.log.info("Started")
@@ -124,7 +110,6 @@
                 self.log.error("Gave up on %s", subdomain)
                 resolved = []
-            resolved.insert(0, subdomain)
             assert isinstance(resolved, list)
             self.orchestrator.results_queue.put(resolved)
 
@@ -182,7 +167,23 @@
         for _ in range(NUMBER_THREADS):
             self.subdomains_queue.put(None)
 
-    def run(self) -> typing.Iterable[typing.List[str]]:
+    @staticmethod
+    def format_rrset(rrset: dns.rrset.RRset) -> typing.Iterable[str]:
+        if rrset.rdtype == dns.rdatatype.CNAME:
+            dtype = 'cname'
+        elif rrset.rdtype == dns.rdatatype.A:
+            dtype = 'a'
+        else:
+            raise NotImplementedError
+        name = rrset.name.to_text()[:-1]
+        for item in rrset.items:
+            value = item.to_text()
+            if rrset.rdtype == dns.rdatatype.CNAME:
+                value = value[:-1]
+            yield '{"timestamp":"' + str(int(time.time())) + '","name":"' + \
+                name + '","type":"' + dtype + '","value":"' + value + '"}\n'
+
+    def run(self) -> typing.Iterable[str]:
         """
         Yield the results.
         """
@@ -197,9 +198,10 @@
         # Wait for one sentinel per worker
         # In the meantime output results
         for _ in range(NUMBER_THREADS):
-            result: typing.List[str]
-            for result in iter(self.results_queue.get, None):
-                yield result
+            resolved: typing.List[dns.rrset.RRset]
+            for resolved in iter(self.results_queue.get, None):
+                for rrset in resolved:
+                    yield from self.format_rrset(rrset)
 
         self.log.info("Waiting for reader thread")
         fill_thread.join()
@@ -214,16 +216,14 @@ def main() -> None:
     the last CNAME resolved and the IP adress it resolves to.
     Takes as an input a filename (or nothing, for stdin),
     and as an output a filename (or nothing, for stdout).
-    The input must be a subdomain per line, the output is a comma-sep
-    file with the columns source CNAME and A.
+    The input must be a subdomain per line, the output is one JSON record per line.
     Use the file `nameservers` as the list of nameservers
     to use, or else it will use the system defaults.
-    Also shows a nice progressbar.
     """
 
     # Initialization
     coloredlogs.install(
-        level='DEBUG',
+        # level='DEBUG',
         fmt='%(asctime)s %(name)s %(levelname)s %(message)s'
     )
 
@@ -244,20 +244,6 @@ def main() -> None:
     # help="Number of threads to use")
     args = parser.parse_args()
 
-    # Progress bar
-    widgets = [
-        progressbar.Percentage(),
-        ' ', progressbar.SimpleProgress(),
-        ' ', progressbar.Bar(),
-        ' ', progressbar.Timer(),
-        ' ', progressbar.AdaptiveTransferSpeed(unit='req'),
-        ' ', progressbar.AdaptiveETA(),
-    ]
-    progress = progressbar.ProgressBar(widgets=widgets)
-    if args.input.seekable():
-        progress.max_value = len(args.input.readlines())
-        args.input.seek(0)
-
     # Cleaning input
     iterator = iter(args.input)
     iterator = map(str.strip, iterator)
@@ -269,15 +255,8 @@ def main() -> None:
     servers = open('nameservers').readlines()
     servers = list(filter(None, map(str.strip, servers)))
 
-    writer = csv.writer(args.output)
-
-    progress.start()
-    global glob
-    glob = Orchestrator(iterator, servers)
-    for resolved in glob.run():
-        progress.update(progress.value + 1)
-        writer.writerow(resolved)
-    progress.finish()
+    for resolved in Orchestrator(iterator, servers).run():
+        args.output.write(resolved)
 
 
 if __name__ == '__main__':
diff --git a/resolve_subdomains.sh b/resolve_subdomains.sh
index ed7af79..ee5f83c 100755
--- a/resolve_subdomains.sh
+++ b/resolve_subdomains.sh
@@ -4,11 +4,9 @@ function log() {
     echo -e "\033[33m$@\033[0m"
 }
 
-# Resolve the CNAME chain of all the known subdomains for later analysis
-log "Compiling subdomain lists..."
-pv subdomains/*.list | sort -u > temp/all_subdomains.list
+log "Compiling locally known subdomains…"
 # Sort by last character to utilize the DNS server caching mechanism
-pv temp/all_subdomains.list | rev | sort | rev > temp/all_subdomains_reversort.list
-./resolve_subdomains.py --input temp/all_subdomains_reversort.list --output temp/all_resolved.csv
-sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv
+pv subdomains/*.list | rev | sort -u | rev > temp/all_subdomains.list
+log "Resolving locally known subdomains…"
+pv temp/all_subdomains.list | ./resolve_subdomains.py --output temp/all_resolved.json
 
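Note: after this patch, resolve_subdomains.py emits one JSON record per line, as built by
Orchestrator.format_rrset() above. As a rough, untested sketch of how a later step could
consume temp/all_resolved.json (the path used in resolve_subdomains.sh), something like the
following would work; only the field names and the file path come from this patch, the rest
is illustrative:

    #!/usr/bin/env python3
    # Sketch: print the A records from the newline-delimited JSON written by
    # resolve_subdomains.py. Assumes one object per line with the "timestamp",
    # "name", "type" and "value" fields produced by format_rrset().
    import json

    with open('temp/all_resolved.json') as records:
        for line in records:
            record = json.loads(line)
            if record['type'] == 'a':
                print(record['name'], record['value'], sep='\t')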