Integrated DNS resolving to workflow

Since the bigger datasets are only updated once a month,
this might help for quick updates.
This commit is contained in:
Geoffrey Frogeye 2019-12-13 13:38:23 +01:00
parent 9050a84670
commit 8d94b80fd0
Signed by: geoffrey
GPG key ID: D8A7ECA00A8CD3DD
2 changed files with 34 additions and 57 deletions

View file

@ -12,12 +12,11 @@ import queue
import sys import sys
import threading import threading
import typing import typing
import csv import time
import coloredlogs import coloredlogs
import dns.exception import dns.exception
import dns.resolver import dns.resolver
import progressbar
DNS_TIMEOUT = 5.0 DNS_TIMEOUT = 5.0
NUMBER_THREADS = 512 NUMBER_THREADS = 512
@ -26,8 +25,6 @@ NUMBER_TRIES = 5
# TODO Not all the domains get processed, # TODO Not all the domains get processed,
# so 4-5 subdomains are left unresolved at exit # so 4-5 subdomains are left unresolved at exit
glob = None
class Worker(threading.Thread): class Worker(threading.Thread):
""" """
@ -60,7 +57,7 @@ class Worker(threading.Thread):
def resolve_subdomain(self, subdomain: str) -> typing.Optional[ def resolve_subdomain(self, subdomain: str) -> typing.Optional[
typing.List[ typing.List[
str dns.rrset.RRset
] ]
]: ]:
""" """
@ -93,18 +90,7 @@ class Worker(threading.Thread):
except dns.name.EmptyLabel: except dns.name.EmptyLabel:
self.log.warning("Empty label for %s", subdomain) self.log.warning("Empty label for %s", subdomain)
return None return None
resolved = list() return query.response.answer
last = len(query.response.answer) - 1
for a, answer in enumerate(query.response.answer):
if answer.rdtype == dns.rdatatype.CNAME:
assert a < last
resolved.append(answer.items[0].to_text()[:-1])
elif answer.rdtype == dns.rdatatype.A:
assert a == last
resolved.append(answer.items[0].address)
else:
assert False
return resolved
def run(self) -> None: def run(self) -> None:
self.log.info("Started") self.log.info("Started")
@ -124,7 +110,6 @@ class Worker(threading.Thread):
self.log.error("Gave up on %s", subdomain) self.log.error("Gave up on %s", subdomain)
resolved = [] resolved = []
resolved.insert(0, subdomain)
assert isinstance(resolved, list) assert isinstance(resolved, list)
self.orchestrator.results_queue.put(resolved) self.orchestrator.results_queue.put(resolved)
@ -182,7 +167,23 @@ class Orchestrator():
for _ in range(NUMBER_THREADS): for _ in range(NUMBER_THREADS):
self.subdomains_queue.put(None) self.subdomains_queue.put(None)
def run(self) -> typing.Iterable[typing.List[str]]: @staticmethod
def format_rrset(rrset: dns.rrset.RRset) -> typing.Iterable[str]:
    """
    Serialize every record of an RRset as one JSON object per line.

    Each yielded string is a newline-terminated, compact JSON object with
    the keys "timestamp" (current Unix time in seconds, as a string),
    "name" (the RRset owner name with its last character, the trailing
    dot, removed), "type" ('cname' or 'a') and "value" (the record's text
    form, with the trailing dot removed for CNAME targets).

    Being a generator, it raises NotImplementedError on first iteration
    when the RRset type is neither CNAME nor A.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    import json

    if rrset.rdtype == dns.rdatatype.CNAME:
        dtype = 'cname'
    elif rrset.rdtype == dns.rdatatype.A:
        dtype = 'a'
    else:
        raise NotImplementedError
    name = rrset.name.to_text()[:-1]
    for item in rrset.items:
        value = item.to_text()
        if dtype == 'cname':
            # CNAME targets are absolute names: drop the final dot.
            value = value[:-1]
        record = {
            "timestamp": str(int(time.time())),
            "name": name,
            "type": dtype,
            "value": value,
        }
        # json.dumps escapes quotes/backslashes that plain concatenation
        # would emit verbatim (producing invalid JSON); the compact
        # separators keep the output identical for ordinary values.
        yield json.dumps(record, separators=(',', ':')) + '\n'
def run(self) -> typing.Iterable[str]:
""" """
Yield the results. Yield the results.
""" """
@ -197,9 +198,10 @@ class Orchestrator():
# Wait for one sentinel per worker # Wait for one sentinel per worker
# In the meantime output results # In the meantime output results
for _ in range(NUMBER_THREADS): for _ in range(NUMBER_THREADS):
result: typing.List[str] resolved: typing.List[dns.rrset.RRset]
for result in iter(self.results_queue.get, None): for resolved in iter(self.results_queue.get, None):
yield result for rrset in resolved:
yield from self.format_rrset(rrset)
self.log.info("Waiting for reader thread") self.log.info("Waiting for reader thread")
fill_thread.join() fill_thread.join()
@ -214,16 +216,14 @@ def main() -> None:
the last CNAME resolved and the IP address it resolves to. the last CNAME resolved and the IP address it resolves to.
Takes as an input a filename (or nothing, for stdin), Takes as an input a filename (or nothing, for stdin),
and as an output a filename (or nothing, for stdout). and as an output a filename (or nothing, for stdout).
The input must be a subdomain per line, the output is a comma-sep The input must be a subdomain per line, the output is one JSON object per line (keys: timestamp, name, type, value).
file with the columns source CNAME and A.
Use the file `nameservers` as the list of nameservers Use the file `nameservers` as the list of nameservers
to use, or else it will use the system defaults. to use, or else it will use the system defaults.
Also shows a nice progressbar.
""" """
# Initialization # Initialization
coloredlogs.install( coloredlogs.install(
level='DEBUG', # level='DEBUG',
fmt='%(asctime)s %(name)s %(levelname)s %(message)s' fmt='%(asctime)s %(name)s %(levelname)s %(message)s'
) )
@ -244,20 +244,6 @@ def main() -> None:
# help="Number of threads to use") # help="Number of threads to use")
args = parser.parse_args() args = parser.parse_args()
# Progress bar
widgets = [
progressbar.Percentage(),
' ', progressbar.SimpleProgress(),
' ', progressbar.Bar(),
' ', progressbar.Timer(),
' ', progressbar.AdaptiveTransferSpeed(unit='req'),
' ', progressbar.AdaptiveETA(),
]
progress = progressbar.ProgressBar(widgets=widgets)
if args.input.seekable():
progress.max_value = len(args.input.readlines())
args.input.seek(0)
# Cleaning input # Cleaning input
iterator = iter(args.input) iterator = iter(args.input)
iterator = map(str.strip, iterator) iterator = map(str.strip, iterator)
@ -269,15 +255,8 @@ def main() -> None:
servers = open('nameservers').readlines() servers = open('nameservers').readlines()
servers = list(filter(None, map(str.strip, servers))) servers = list(filter(None, map(str.strip, servers)))
writer = csv.writer(args.output) for resolved in Orchestrator(iterator, servers).run():
args.output.write(resolved)
progress.start()
global glob
glob = Orchestrator(iterator, servers)
for resolved in glob.run():
progress.update(progress.value + 1)
writer.writerow(resolved)
progress.finish()
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -4,11 +4,9 @@ function log() {
echo -e "\033[33m$@\033[0m" echo -e "\033[33m$@\033[0m"
} }
# Resolve the CNAME chain of all the known subdomains for later analysis log "Compiling locally known subdomain…"
log "Compiling subdomain lists..."
pv subdomains/*.list | sort -u > temp/all_subdomains.list
# Sort by last character to utilize the DNS server caching mechanism # Sort by last character to utilize the DNS server caching mechanism
pv temp/all_subdomains.list | rev | sort | rev > temp/all_subdomains_reversort.list pv subdomains/*.list | rev | sort -u | rev > temp/all_subdomains.list
./resolve_subdomains.py --input temp/all_subdomains_reversort.list --output temp/all_resolved.csv log "Resolving locally known subdomain…"
sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv pv temp/all_subdomains.list | ./resolve_subdomains.py --output temp/all_resolved.json