Separated DNS resolution from filtering
This effectively removes the parallelism of filtering, which doubles the processing time (5->8 hours), but this allows me to toy around with the performances of this step, which I aim to improve drastically.
This commit is contained in:
parent
7d01d016a5
commit
c23004fbff
|
@ -1,68 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
# pylint: disable=C0103
|
|
||||||
|
|
||||||
"""
|
|
||||||
From a list of subdomains to block,
|
|
||||||
filter out the ones explicitely matching a regex.
|
|
||||||
It should be already handled by the ad blocker.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import sys
|
|
||||||
import progressbar
|
|
||||||
|
|
||||||
import adblockparser
|
|
||||||
|
|
||||||
OPTIONS = {"third-party": True}
|
|
||||||
|
|
||||||
|
|
||||||
def explicitely_match(subdomain: str) -> bool:
|
|
||||||
url = f"https://{subdomain}/"
|
|
||||||
return rules.should_block(url, OPTIONS)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
|
|
||||||
# Parsing arguments
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Filter first-party trackers from a list of subdomains")
|
|
||||||
parser.add_argument(
|
|
||||||
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
|
||||||
help="Input file with one subdomain per line")
|
|
||||||
parser.add_argument(
|
|
||||||
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
|
|
||||||
help="Outptut file with one tracking subdomain per line")
|
|
||||||
parser.add_argument(
|
|
||||||
'-r', '--rules', type=argparse.FileType('r'), default='rules',
|
|
||||||
help="Rules file")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
# Reading rules
|
|
||||||
rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules)
|
|
||||||
|
|
||||||
# Progress bar
|
|
||||||
widgets = [
|
|
||||||
progressbar.Percentage(),
|
|
||||||
' ', progressbar.SimpleProgress(),
|
|
||||||
' ', progressbar.Bar(),
|
|
||||||
' ', progressbar.Timer(),
|
|
||||||
' ', progressbar.AdaptiveTransferSpeed(unit='req'),
|
|
||||||
' ', progressbar.AdaptiveETA(),
|
|
||||||
]
|
|
||||||
progress = progressbar.ProgressBar(widgets=widgets)
|
|
||||||
if args.input.seekable():
|
|
||||||
progress.max_value = len(args.input.readlines())
|
|
||||||
args.input.seek(0)
|
|
||||||
|
|
||||||
# Cleaning input
|
|
||||||
iterator = iter(args.input)
|
|
||||||
iterator = map(str.strip, iterator)
|
|
||||||
iterator = filter(None, iterator)
|
|
||||||
|
|
||||||
# Filtering
|
|
||||||
progress.start()
|
|
||||||
for subdomain in iterator:
|
|
||||||
progress.update(progress.value + 1)
|
|
||||||
if not explicitely_match(subdomain):
|
|
||||||
print(subdomain, file=args.output)
|
|
||||||
progress.finish()
|
|
|
@ -1,4 +1,5 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
# pylint: disable=C0103
|
||||||
|
|
||||||
"""
|
"""
|
||||||
From a list of subdomains, output only
|
From a list of subdomains, output only
|
||||||
|
@ -6,230 +7,56 @@ the ones resolving to a first-party tracker.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import queue
|
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
import threading
|
import progressbar
|
||||||
|
import csv
|
||||||
import typing
|
import typing
|
||||||
|
|
||||||
import adblockparser
|
import adblockparser
|
||||||
import coloredlogs
|
|
||||||
import dns.exception
|
|
||||||
import dns.resolver
|
|
||||||
import progressbar
|
|
||||||
|
|
||||||
import regexes
|
|
||||||
|
|
||||||
DNS_TIMEOUT = 10.0
|
|
||||||
NUMBER_THREADS = 64
|
|
||||||
NUMBER_TRIES = 5
|
|
||||||
|
|
||||||
|
|
||||||
class Worker(threading.Thread):
|
|
||||||
"""
|
|
||||||
Worker process for a DNS resolver.
|
|
||||||
Will resolve DNS to match first-party subdomains.
|
|
||||||
"""
|
|
||||||
OPTIONS = {"third-party": True}
|
OPTIONS = {"third-party": True}
|
||||||
|
|
||||||
def change_nameserver(self) -> None:
|
|
||||||
"""
|
|
||||||
Assign a this worker another nameserver from the queue.
|
|
||||||
"""
|
|
||||||
server = None
|
|
||||||
while server is None:
|
|
||||||
try:
|
|
||||||
server = self.orchestrator.nameservers_queue.get(block=False)
|
|
||||||
except queue.Empty:
|
|
||||||
self.orchestrator.refill_nameservers_queue()
|
|
||||||
self.log.debug("Using nameserver: %s", server)
|
|
||||||
self.resolver.nameservers = [server]
|
|
||||||
|
|
||||||
def __init__(self,
|
def subdomain_matching(subdomain: str) -> bool:
|
||||||
orchestrator: 'Orchestrator',
|
url = f"https://{subdomain}/"
|
||||||
index: int = 0):
|
return rules.should_block(url, OPTIONS)
|
||||||
super(Worker, self).__init__()
|
|
||||||
self.log = logging.getLogger(f'worker{index:03d}')
|
|
||||||
self.orchestrator = orchestrator
|
|
||||||
|
|
||||||
self.resolver = dns.resolver.Resolver()
|
|
||||||
self.change_nameserver()
|
|
||||||
|
|
||||||
def is_subdomain_matching(self, subdomain: str) -> typing.Optional[bool]:
|
|
||||||
"""
|
|
||||||
Indicates if the subdomain redirects to a first-party tracker.
|
|
||||||
Returns None if the nameserver was unable to satisfy the request.
|
|
||||||
"""
|
|
||||||
# TODO Look at the whole chain rather than the last one
|
|
||||||
# TODO Also match the ASN of the IP (caching the ASN subnetworks will do)
|
|
||||||
try:
|
|
||||||
query = self.resolver.query(subdomain, 'A', lifetime=DNS_TIMEOUT)
|
|
||||||
except dns.resolver.NXDOMAIN:
|
|
||||||
return False
|
|
||||||
except dns.resolver.NoAnswer:
|
|
||||||
return False
|
|
||||||
except dns.resolver.YXDOMAIN:
|
|
||||||
self.log.warning("Query name too long for %s", subdomain)
|
|
||||||
return None
|
|
||||||
except dns.resolver.NoNameservers:
|
|
||||||
# NOTE Most of the time this error message means that the domain
|
|
||||||
# does not exists, but sometimes it means the that the server
|
|
||||||
# itself is broken. So we count on the retry logic.
|
|
||||||
self.log.warning("All nameservers broken for %s", subdomain)
|
|
||||||
return None
|
|
||||||
except dns.exception.Timeout:
|
|
||||||
# NOTE Same as above
|
|
||||||
self.log.warning("Timeout for %s", subdomain)
|
|
||||||
return None
|
|
||||||
except dns.name.EmptyLabel:
|
|
||||||
self.log.warning("Empty label for %s", subdomain)
|
|
||||||
return None
|
|
||||||
canonical = query.canonical_name.to_text()
|
|
||||||
# for regex in regexes.REGEXES:
|
|
||||||
# if re.match(regex, canonical):
|
|
||||||
# return True
|
|
||||||
# return False
|
|
||||||
url = f"https://{canonical[:-1]}/"
|
|
||||||
return self.orchestrator.rules.should_block(url, Worker.OPTIONS)
|
|
||||||
|
|
||||||
def run(self) -> None:
|
|
||||||
self.log.info("Started")
|
|
||||||
for subdomain in iter(self.orchestrator.subdomains_queue.get, None):
|
|
||||||
|
|
||||||
for _ in range(NUMBER_TRIES):
|
|
||||||
matching = self.is_subdomain_matching(subdomain)
|
|
||||||
if matching is not None:
|
|
||||||
break
|
|
||||||
|
|
||||||
# If it wasn't found after multiple tries
|
|
||||||
if matching is None:
|
|
||||||
self.log.error("Gave up on %s", subdomain)
|
|
||||||
matching = False
|
|
||||||
|
|
||||||
result = (subdomain, matching)
|
|
||||||
self.orchestrator.results_queue.put(result)
|
|
||||||
|
|
||||||
self.orchestrator.results_queue.put(None)
|
|
||||||
self.log.info("Stopped")
|
|
||||||
|
|
||||||
|
|
||||||
class Orchestrator():
|
def get_matching(chain: typing.List[str], no_explicit: bool = False
|
||||||
"""
|
) -> typing.Iterable[str]:
|
||||||
Orchestrator of the different Worker threads.
|
initial = chain[0]
|
||||||
"""
|
cname_destinations = chain[1:-1]
|
||||||
|
# a_destination = chain[-1]
|
||||||
def refill_nameservers_queue(self) -> None:
|
initial_matching = subdomain_matching(initial)
|
||||||
"""
|
if no_explicit and initial_matching:
|
||||||
Re-fill the given nameservers into the nameservers queue.
|
return
|
||||||
Done every-time the queue is empty, making it
|
cname_matching = any(map(subdomain_matching, cname_destinations))
|
||||||
basically looping and infinite.
|
if cname_matching or initial_matching:
|
||||||
"""
|
yield initial
|
||||||
# Might be in a race condition but that's probably fine
|
|
||||||
for nameserver in self.nameservers:
|
|
||||||
self.nameservers_queue.put(nameserver)
|
|
||||||
self.log.info("Refilled nameserver queue")
|
|
||||||
|
|
||||||
def __init__(self, subdomains: typing.Iterable[str],
|
|
||||||
rules: typing.Iterable[str],
|
|
||||||
nameservers: typing.List[str] = None,
|
|
||||||
):
|
|
||||||
self.log = logging.getLogger('orchestrator')
|
|
||||||
self.subdomains = subdomains
|
|
||||||
|
|
||||||
# Use interal resolver by default
|
|
||||||
self.nameservers = nameservers or dns.resolver.Resolver().nameservers
|
|
||||||
|
|
||||||
self.subdomains_queue: queue.Queue = queue.Queue(
|
|
||||||
maxsize=NUMBER_THREADS)
|
|
||||||
self.results_queue: queue.Queue = queue.Queue()
|
|
||||||
self.nameservers_queue: queue.Queue = queue.Queue()
|
|
||||||
|
|
||||||
# Rules
|
|
||||||
self.rules = adblockparser.AdblockRules(rules)
|
|
||||||
|
|
||||||
self.refill_nameservers_queue()
|
|
||||||
|
|
||||||
def fill_subdomain_queue(self) -> None:
|
|
||||||
"""
|
|
||||||
Read the subdomains in input and put them into the queue.
|
|
||||||
Done in a thread so we can both:
|
|
||||||
- yield the results as they come
|
|
||||||
- not store all the subdomains at once
|
|
||||||
"""
|
|
||||||
self.log.info("Started reading subdomains")
|
|
||||||
# Send data to workers
|
|
||||||
for subdomain in self.subdomains:
|
|
||||||
self.subdomains_queue.put(subdomain)
|
|
||||||
|
|
||||||
self.log.info("Finished reading subdomains")
|
|
||||||
# Send sentinel to each worker
|
|
||||||
# sentinel = None ~= EOF
|
|
||||||
for _ in range(NUMBER_THREADS):
|
|
||||||
self.subdomains_queue.put(None)
|
|
||||||
|
|
||||||
def run(self) -> typing.Iterable[typing.Tuple[str, bool]]:
|
|
||||||
"""
|
|
||||||
Yield the results.
|
|
||||||
"""
|
|
||||||
# Create workers
|
|
||||||
self.log.info("Creating workers")
|
|
||||||
for i in range(NUMBER_THREADS):
|
|
||||||
Worker(self, i).start()
|
|
||||||
|
|
||||||
fill_thread = threading.Thread(target=self.fill_subdomain_queue)
|
|
||||||
fill_thread.start()
|
|
||||||
|
|
||||||
# Wait for one sentinel per worker
|
|
||||||
# In the meantime output results
|
|
||||||
for _ in range(NUMBER_THREADS):
|
|
||||||
for result in iter(self.results_queue.get, None):
|
|
||||||
yield result
|
|
||||||
|
|
||||||
self.log.info("Waiting for reader thread")
|
|
||||||
fill_thread.join()
|
|
||||||
|
|
||||||
self.log.info("Done!")
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
if __name__ == '__main__':
|
||||||
"""
|
|
||||||
Main function when used directly.
|
|
||||||
Takes as an input a filename (or nothing, for stdin)
|
|
||||||
that will be read and the ones that are a tracker
|
|
||||||
will be outputed on stdout.
|
|
||||||
Use the file `nameservers` as the list of nameservers
|
|
||||||
to use, or else it will use the system defaults.
|
|
||||||
Also shows a nice progressbar.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Initialization
|
|
||||||
coloredlogs.install(
|
|
||||||
level='DEBUG',
|
|
||||||
fmt='%(asctime)s %(name)s %(levelname)s %(message)s'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Parsing arguments
|
# Parsing arguments
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Filter first-party trackers from a list of subdomains")
|
description="Filter first-party trackers from a list of subdomains")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
||||||
help="Input file with one subdomain per line")
|
help="Input file with DNS chains")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
|
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
|
||||||
help="Outptut file with one tracking subdomain per line")
|
help="Outptut file with one tracking subdomain per line")
|
||||||
|
parser.add_argument(
|
||||||
|
'-n', '--no-explicit', action='store_true',
|
||||||
|
help="Don't output domains already blocked with rules without CNAME")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-r', '--rules', type=argparse.FileType('r'), default='rules',
|
'-r', '--rules', type=argparse.FileType('r'), default='rules',
|
||||||
help="Rules file")
|
help="Rules file")
|
||||||
# parser.add_argument(
|
|
||||||
# '-n', '--nameserver', type=argparse.FileType('r'),
|
|
||||||
# default='nameservers', help="File with one nameserver per line")
|
|
||||||
# parser.add_argument(
|
|
||||||
# '-j', '--workers', type=int, default=512,
|
|
||||||
# help="Number of threads to use")
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Reading rules
|
||||||
|
rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules)
|
||||||
|
|
||||||
# Progress bar
|
# Progress bar
|
||||||
widgets = [
|
widgets = [
|
||||||
progressbar.Percentage(),
|
progressbar.Percentage(),
|
||||||
|
@ -245,23 +72,12 @@ def main() -> None:
|
||||||
args.input.seek(0)
|
args.input.seek(0)
|
||||||
|
|
||||||
# Cleaning input
|
# Cleaning input
|
||||||
iterator = iter(args.input)
|
reader = csv.reader(args.input)
|
||||||
iterator = map(str.strip, iterator)
|
|
||||||
iterator = filter(None, iterator)
|
|
||||||
|
|
||||||
# Reading nameservers
|
|
||||||
servers: typing.List[str] = list()
|
|
||||||
if os.path.isfile('nameservers'):
|
|
||||||
servers = open('nameservers').readlines()
|
|
||||||
servers = list(filter(None, map(str.strip, servers)))
|
|
||||||
|
|
||||||
|
# Filtering
|
||||||
progress.start()
|
progress.start()
|
||||||
for subdomain, matching in Orchestrator(iterator, args.rules, servers).run():
|
for chain in reader:
|
||||||
|
for match in get_matching(chain, no_explicit=args.no_explicit):
|
||||||
|
print(match, file=args.output)
|
||||||
progress.update(progress.value + 1)
|
progress.update(progress.value + 1)
|
||||||
if matching:
|
|
||||||
print(subdomain, file=args.output)
|
|
||||||
progress.finish()
|
progress.finish()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
|
|
|
@ -1,14 +1,16 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
# Filter out the subdomains not pointing to a first-party tracker
|
# Resolve the CNAME chain of all the known subdomains for later analysis
|
||||||
cat subdomains/*.list | sort -u > temp/all_subdomains.list
|
cat subdomains/*.list | sort -u > temp/all_subdomains.list
|
||||||
|
./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv
|
||||||
|
sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv
|
||||||
|
|
||||||
|
# Filter out the subdomains not pointing to a first-party tracker
|
||||||
cat rules/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules.txt
|
cat rules/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules.txt
|
||||||
./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_subdomains.list --output temp/all_toblock.list
|
./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_resolved_sorted.csv --output dist/firstparty-trackers.txt
|
||||||
sort -u temp/all_toblock.list > dist/firstparty-trackers.txt
|
./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_resolved_sorted.csv --no-explicit --output dist/firstparty-only-trackers.txt
|
||||||
./filter_out_explicit.py --rules temp/all_rules.txt --input dist/firstparty-trackers.txt --output dist/firstparty-only-trackers.txt
|
|
||||||
|
|
||||||
# Format the blocklist so it can be used as a hostlist
|
# Format the blocklist so it can be used as a hostlist
|
||||||
|
|
||||||
function generate_hosts {
|
function generate_hosts {
|
||||||
basename="$1"
|
basename="$1"
|
||||||
description="$2"
|
description="$2"
|
||||||
|
|
277
resolve_subdomains.py
Executable file
277
resolve_subdomains.py
Executable file
|
@ -0,0 +1,277 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
From a list of subdomains, output only
|
||||||
|
the ones resolving to a first-party tracker.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import queue
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
import typing
|
||||||
|
import csv
|
||||||
|
|
||||||
|
import adblockparser
|
||||||
|
import coloredlogs
|
||||||
|
import dns.exception
|
||||||
|
import dns.resolver
|
||||||
|
import progressbar
|
||||||
|
|
||||||
|
DNS_TIMEOUT = 10.0
|
||||||
|
NUMBER_THREADS = 64
|
||||||
|
NUMBER_TRIES = 5
|
||||||
|
|
||||||
|
# TODO All the domains don't get treated,
|
||||||
|
# so it leaves with 4-5 subdomains not resolved
|
||||||
|
|
||||||
|
|
||||||
|
class Worker(threading.Thread):
|
||||||
|
"""
|
||||||
|
Worker process for a DNS resolver.
|
||||||
|
Will resolve DNS to match first-party subdomains.
|
||||||
|
"""
|
||||||
|
OPTIONS = {"third-party": True}
|
||||||
|
|
||||||
|
def change_nameserver(self) -> None:
|
||||||
|
"""
|
||||||
|
Assign a this worker another nameserver from the queue.
|
||||||
|
"""
|
||||||
|
server = None
|
||||||
|
while server is None:
|
||||||
|
try:
|
||||||
|
server = self.orchestrator.nameservers_queue.get(block=False)
|
||||||
|
except queue.Empty:
|
||||||
|
self.orchestrator.refill_nameservers_queue()
|
||||||
|
self.log.debug("Using nameserver: %s", server)
|
||||||
|
self.resolver.nameservers = [server]
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
orchestrator: 'Orchestrator',
|
||||||
|
index: int = 0):
|
||||||
|
super(Worker, self).__init__()
|
||||||
|
self.log = logging.getLogger(f'worker{index:03d}')
|
||||||
|
self.orchestrator = orchestrator
|
||||||
|
|
||||||
|
self.resolver = dns.resolver.Resolver()
|
||||||
|
self.change_nameserver()
|
||||||
|
|
||||||
|
def resolve_subdomain(self, subdomain: str) -> typing.Optional[
|
||||||
|
typing.List[
|
||||||
|
str
|
||||||
|
]
|
||||||
|
]:
|
||||||
|
"""
|
||||||
|
Returns the resolution chain of the subdomain to an A record,
|
||||||
|
including any intermediary CNAME.
|
||||||
|
The last element is an IP address.
|
||||||
|
Returns None if the nameserver was unable to satisfy the request.
|
||||||
|
Returns [] if the requests points to nothing.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
query = self.resolver.query(subdomain, 'A', lifetime=DNS_TIMEOUT)
|
||||||
|
except dns.resolver.NXDOMAIN:
|
||||||
|
return []
|
||||||
|
except dns.resolver.NoAnswer:
|
||||||
|
return []
|
||||||
|
except dns.resolver.YXDOMAIN:
|
||||||
|
self.log.warning("Query name too long for %s", subdomain)
|
||||||
|
return None
|
||||||
|
except dns.resolver.NoNameservers:
|
||||||
|
# NOTE Most of the time this error message means that the domain
|
||||||
|
# does not exists, but sometimes it means the that the server
|
||||||
|
# itself is broken. So we count on the retry logic.
|
||||||
|
self.log.warning("All nameservers broken for %s", subdomain)
|
||||||
|
return None
|
||||||
|
except dns.exception.Timeout:
|
||||||
|
# NOTE Same as above
|
||||||
|
self.log.warning("Timeout for %s", subdomain)
|
||||||
|
return None
|
||||||
|
except dns.name.EmptyLabel:
|
||||||
|
self.log.warning("Empty label for %s", subdomain)
|
||||||
|
return None
|
||||||
|
resolved = list()
|
||||||
|
last = len(query.response.answer) - 1
|
||||||
|
for a, answer in enumerate(query.response.answer):
|
||||||
|
if answer.rdtype == dns.rdatatype.CNAME:
|
||||||
|
assert a < last
|
||||||
|
resolved.append(answer.items[0].to_text()[:-1])
|
||||||
|
elif answer.rdtype == dns.rdatatype.A:
|
||||||
|
assert a == last
|
||||||
|
resolved.append(answer.items[0].address)
|
||||||
|
else:
|
||||||
|
assert False
|
||||||
|
return resolved
|
||||||
|
|
||||||
|
def run(self) -> None:
|
||||||
|
self.log.info("Started")
|
||||||
|
subdomain: str
|
||||||
|
for subdomain in iter(self.orchestrator.subdomains_queue.get, None):
|
||||||
|
|
||||||
|
for _ in range(NUMBER_TRIES):
|
||||||
|
resolved = self.resolve_subdomain(subdomain)
|
||||||
|
if resolved is not None:
|
||||||
|
break
|
||||||
|
|
||||||
|
# If it wasn't found after multiple tries
|
||||||
|
if resolved is None:
|
||||||
|
self.log.error("Gave up on %s", subdomain)
|
||||||
|
resolved = []
|
||||||
|
|
||||||
|
resolved.insert(0, subdomain)
|
||||||
|
self.orchestrator.results_queue.put(resolved)
|
||||||
|
|
||||||
|
self.orchestrator.results_queue.put(None)
|
||||||
|
self.log.info("Stopped")
|
||||||
|
|
||||||
|
|
||||||
|
class Orchestrator():
|
||||||
|
"""
|
||||||
|
Orchestrator of the different Worker threads.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def refill_nameservers_queue(self) -> None:
|
||||||
|
"""
|
||||||
|
Re-fill the given nameservers into the nameservers queue.
|
||||||
|
Done every-time the queue is empty, making it
|
||||||
|
basically looping and infinite.
|
||||||
|
"""
|
||||||
|
# Might be in a race condition but that's probably fine
|
||||||
|
for nameserver in self.nameservers:
|
||||||
|
self.nameservers_queue.put(nameserver)
|
||||||
|
self.log.info("Refilled nameserver queue")
|
||||||
|
|
||||||
|
def __init__(self, subdomains: typing.Iterable[str],
|
||||||
|
nameservers: typing.List[str] = None,
|
||||||
|
):
|
||||||
|
self.log = logging.getLogger('orchestrator')
|
||||||
|
self.subdomains = subdomains
|
||||||
|
|
||||||
|
# Use interal resolver by default
|
||||||
|
self.nameservers = nameservers or dns.resolver.Resolver().nameservers
|
||||||
|
|
||||||
|
self.subdomains_queue: queue.Queue = queue.Queue(
|
||||||
|
maxsize=NUMBER_THREADS)
|
||||||
|
self.results_queue: queue.Queue = queue.Queue()
|
||||||
|
self.nameservers_queue: queue.Queue = queue.Queue()
|
||||||
|
|
||||||
|
self.refill_nameservers_queue()
|
||||||
|
|
||||||
|
def fill_subdomain_queue(self) -> None:
|
||||||
|
"""
|
||||||
|
Read the subdomains in input and put them into the queue.
|
||||||
|
Done in a thread so we can both:
|
||||||
|
- yield the results as they come
|
||||||
|
- not store all the subdomains at once
|
||||||
|
"""
|
||||||
|
self.log.info("Started reading subdomains")
|
||||||
|
# Send data to workers
|
||||||
|
for subdomain in self.subdomains:
|
||||||
|
self.subdomains_queue.put(subdomain)
|
||||||
|
|
||||||
|
self.log.info("Finished reading subdomains")
|
||||||
|
# Send sentinel to each worker
|
||||||
|
# sentinel = None ~= EOF
|
||||||
|
for _ in range(NUMBER_THREADS):
|
||||||
|
self.subdomains_queue.put(None)
|
||||||
|
|
||||||
|
def run(self) -> typing.Iterable[typing.List[str]]:
|
||||||
|
"""
|
||||||
|
Yield the results.
|
||||||
|
"""
|
||||||
|
# Create workers
|
||||||
|
self.log.info("Creating workers")
|
||||||
|
for i in range(NUMBER_THREADS):
|
||||||
|
Worker(self, i).start()
|
||||||
|
|
||||||
|
fill_thread = threading.Thread(target=self.fill_subdomain_queue)
|
||||||
|
fill_thread.start()
|
||||||
|
|
||||||
|
# Wait for one sentinel per worker
|
||||||
|
# In the meantime output results
|
||||||
|
for _ in range(NUMBER_THREADS):
|
||||||
|
result: typing.List[str]
|
||||||
|
for result in iter(self.results_queue.get, None):
|
||||||
|
yield result
|
||||||
|
|
||||||
|
self.log.info("Waiting for reader thread")
|
||||||
|
fill_thread.join()
|
||||||
|
|
||||||
|
self.log.info("Done!")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
"""
|
||||||
|
Main function when used directly.
|
||||||
|
Read the subdomains provided and output it,
|
||||||
|
the last CNAME resolved and the IP adress it resolves to.
|
||||||
|
Takes as an input a filename (or nothing, for stdin),
|
||||||
|
and as an output a filename (or nothing, for stdout).
|
||||||
|
The input must be a subdomain per line, the output is a comma-sep
|
||||||
|
file with the columns source CNAME and A.
|
||||||
|
Use the file `nameservers` as the list of nameservers
|
||||||
|
to use, or else it will use the system defaults.
|
||||||
|
Also shows a nice progressbar.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Initialization
|
||||||
|
coloredlogs.install(
|
||||||
|
level='DEBUG',
|
||||||
|
fmt='%(asctime)s %(name)s %(levelname)s %(message)s'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Parsing arguments
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Massively resolves subdomains and store them in a file.")
|
||||||
|
parser.add_argument(
|
||||||
|
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
||||||
|
help="Input file with one subdomain per line")
|
||||||
|
parser.add_argument(
|
||||||
|
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
|
||||||
|
help="Outptut file with DNS chains")
|
||||||
|
# parser.add_argument(
|
||||||
|
# '-n', '--nameserver', type=argparse.FileType('r'),
|
||||||
|
# default='nameservers', help="File with one nameserver per line")
|
||||||
|
# parser.add_argument(
|
||||||
|
# '-j', '--workers', type=int, default=512,
|
||||||
|
# help="Number of threads to use")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Progress bar
|
||||||
|
widgets = [
|
||||||
|
progressbar.Percentage(),
|
||||||
|
' ', progressbar.SimpleProgress(),
|
||||||
|
' ', progressbar.Bar(),
|
||||||
|
' ', progressbar.Timer(),
|
||||||
|
' ', progressbar.AdaptiveTransferSpeed(unit='req'),
|
||||||
|
' ', progressbar.AdaptiveETA(),
|
||||||
|
]
|
||||||
|
progress = progressbar.ProgressBar(widgets=widgets)
|
||||||
|
if args.input.seekable():
|
||||||
|
progress.max_value = len(args.input.readlines())
|
||||||
|
args.input.seek(0)
|
||||||
|
|
||||||
|
# Cleaning input
|
||||||
|
iterator = iter(args.input)
|
||||||
|
iterator = map(str.strip, iterator)
|
||||||
|
iterator = filter(None, iterator)
|
||||||
|
|
||||||
|
# Reading nameservers
|
||||||
|
servers: typing.List[str] = list()
|
||||||
|
if os.path.isfile('nameservers'):
|
||||||
|
servers = open('nameservers').readlines()
|
||||||
|
servers = list(filter(None, map(str.strip, servers)))
|
||||||
|
|
||||||
|
writer = csv.writer(args.output)
|
||||||
|
|
||||||
|
progress.start()
|
||||||
|
for resolved in Orchestrator(iterator, servers).run():
|
||||||
|
progress.update(progress.value + 1)
|
||||||
|
writer.writerow(resolved)
|
||||||
|
progress.finish()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
1
temp/.gitignore
vendored
1
temp/.gitignore
vendored
|
@ -1,2 +1,3 @@
|
||||||
*.list
|
*.list
|
||||||
*.txt
|
*.txt
|
||||||
|
*.csv
|
||||||
|
|
Loading…
Reference in a new issue