Integrated DNS resolving to workflow

Since the bigger datasets are only updated once a month,
this might help with quick updates.
Geoffrey Frogeye 2019-12-13 13:38:23 +01:00
parent 9050a84670
commit 8d94b80fd0
Signed by: geoffrey
GPG key ID: D8A7ECA00A8CD3DD
2 changed files with 34 additions and 57 deletions

View file

@@ -12,12 +12,11 @@ import queue
import sys
import threading
import typing
import csv
import time
import coloredlogs
import dns.exception
import dns.resolver
import progressbar
DNS_TIMEOUT = 5.0
NUMBER_THREADS = 512
@@ -26,8 +25,6 @@ NUMBER_TRIES = 5
# TODO All the domains don't get treated,
# so it leaves with 4-5 subdomains not resolved
glob = None
class Worker(threading.Thread):
"""
@@ -59,9 +56,9 @@ class Worker(threading.Thread):
self.change_nameserver()
def resolve_subdomain(self, subdomain: str) -> typing.Optional[
typing.List[
str
]
typing.List[
dns.rrset.RRset
]
]:
"""
Returns the resolution chain of the subdomain to an A record,
@@ -93,18 +90,7 @@ class Worker(threading.Thread):
except dns.name.EmptyLabel:
self.log.warning("Empty label for %s", subdomain)
return None
resolved = list()
last = len(query.response.answer) - 1
for a, answer in enumerate(query.response.answer):
if answer.rdtype == dns.rdatatype.CNAME:
assert a < last
resolved.append(answer.items[0].to_text()[:-1])
elif answer.rdtype == dns.rdatatype.A:
assert a == last
resolved.append(answer.items[0].address)
else:
assert False
return resolved
return query.response.answer
def run(self) -> None:
self.log.info("Started")
@@ -124,7 +110,6 @@ class Worker(threading.Thread):
self.log.error("Gave up on %s", subdomain)
resolved = []
resolved.insert(0, subdomain)
assert isinstance(resolved, list)
self.orchestrator.results_queue.put(resolved)
@@ -182,7 +167,23 @@ class Orchestrator():
for _ in range(NUMBER_THREADS):
self.subdomains_queue.put(None)
def run(self) -> typing.Iterable[typing.List[str]]:
@staticmethod
def format_rrset(rrset: dns.rrset.RRset) -> typing.Iterable[str]:
if rrset.rdtype == dns.rdatatype.CNAME:
dtype = 'cname'
elif rrset.rdtype == dns.rdatatype.A:
dtype = 'a'
else:
raise NotImplementedError
name = rrset.name.to_text()[:-1]
for item in rrset.items:
value = item.to_text()
if rrset.rdtype == dns.rdatatype.CNAME:
value = value[:-1]
yield '{"timestamp":"' + str(int(time.time())) + '","name":"' + \
name + '","type":"' + dtype + '","value":"' + value + '"}\n'
def run(self) -> typing.Iterable[str]:
"""
Yield the results.
"""
@@ -197,9 +198,10 @@ class Orchestrator():
# Wait for one sentinel per worker
# In the meantime output results
for _ in range(NUMBER_THREADS):
result: typing.List[str]
for result in iter(self.results_queue.get, None):
yield result
resolved: typing.List[dns.rrset.RRset]
for resolved in iter(self.results_queue.get, None):
for rrset in resolved:
yield from self.format_rrset(rrset)
self.log.info("Waiting for reader thread")
fill_thread.join()
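
The loop above drains the results queue until it has seen one None sentinel per worker, yielding records as they arrive. A self-contained sketch of that sentinel pattern, with illustrative names and a small thread count:

import queue
import threading

results: queue.Queue = queue.Queue()
NUMBER_THREADS = 4  # illustrative; the script uses 512

def worker(n: int) -> None:
    results.put(f'result-from-{n}')  # a worker's real payload
    results.put(None)                # sentinel: this worker is done

threads = [threading.Thread(target=worker, args=(i,)) for i in range(NUMBER_THREADS)]
for t in threads:
    t.start()

# Consume until one sentinel per worker has been seen, printing results meanwhile.
for _ in range(NUMBER_THREADS):
    for result in iter(results.get, None):
        print(result)

for t in threads:
    t.join()
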
@@ -214,16 +216,14 @@ def main() -> None:
the last CNAME resolved and the IP address it resolves to.
Takes as an input a filename (or nothing, for stdin),
and as an output a filename (or nothing, for stdout).
The input must be a subdomain per line, the output is a comma-sep
file with the columns source CNAME and A.
The input must be a subdomain per line, the output is a TODO
Use the file `nameservers` as the list of nameservers
to use, or else it will use the system defaults.
Also shows a nice progressbar.
"""
# Initialization
coloredlogs.install(
level='DEBUG',
# level='DEBUG',
fmt='%(asctime)s %(name)s %(levelname)s %(message)s'
)
@@ -244,20 +244,6 @@ def main() -> None:
# help="Number of threads to use")
args = parser.parse_args()
# Progress bar
widgets = [
progressbar.Percentage(),
' ', progressbar.SimpleProgress(),
' ', progressbar.Bar(),
' ', progressbar.Timer(),
' ', progressbar.AdaptiveTransferSpeed(unit='req'),
' ', progressbar.AdaptiveETA(),
]
progress = progressbar.ProgressBar(widgets=widgets)
if args.input.seekable():
progress.max_value = len(args.input.readlines())
args.input.seek(0)
# Cleaning input
iterator = iter(args.input)
iterator = map(str.strip, iterator)
@@ -269,15 +255,8 @@ def main() -> None:
servers = open('nameservers').readlines()
servers = list(filter(None, map(str.strip, servers)))
writer = csv.writer(args.output)
progress.start()
global glob
glob = Orchestrator(iterator, servers)
for resolved in glob.run():
progress.update(progress.value + 1)
writer.writerow(resolved)
progress.finish()
for resolved in Orchestrator(iterator, servers).run():
args.output.write(resolved)
if __name__ == '__main__':
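
With the CSV writer gone, each line that main() writes is a self-contained JSON object (the format the docstring still marks as TODO), so downstream steps can stream the file the workflow below saves as temp/all_resolved.json. A minimal consumer sketch, assuming only the timestamp/name/type/value keys emitted by format_rrset:

import json

# Illustrative: print the terminal A records per source name.
with open('temp/all_resolved.json') as resolved_file:
    for line in resolved_file:
        record = json.loads(line)
        if record['type'] == 'a':
            print(record['name'], record['value'])
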

View file

@@ -4,11 +4,9 @@ function log() {
echo -e "\033[33m$@\033[0m"
}
# Resolve the CNAME chain of all the known subdomains for later analysis
log "Compiling subdomain lists..."
pv subdomains/*.list | sort -u > temp/all_subdomains.list
log "Compiling locally known subdomain…"
# Sort by last character to utilize the DNS server caching mechanism
pv temp/all_subdomains.list | rev | sort | rev > temp/all_subdomains_reversort.list
./resolve_subdomains.py --input temp/all_subdomains_reversort.list --output temp/all_resolved.csv
sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv
pv subdomains/*.list | rev | sort -u | rev > temp/all_subdomains.list
log "Resolving locally known subdomain…"
pv temp/all_subdomains.list | ./resolve_subdomains.py --output temp/all_resolved.json
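
The rev | sort -u | rev pipeline orders the subdomains by their last character first, so names sharing a suffix (and therefore the same delegation path) end up next to each other, which is what lets the resolvers reuse their caches between consecutive queries. The same ordering, expressed in Python purely for illustration:

# Sorting by the reversed string groups e.g. a.example.com and b.example.com
# together, so consecutive queries hit the same cached delegation.
subdomains = ['a.example.com', 'mail.test.org', 'b.example.com']
for subdomain in sorted(set(subdomains), key=lambda name: name[::-1]):
    print(subdomain)
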