#!/usr/bin/env python3
"""
From a list of subdomains, output only
the ones resolving to a first-party tracker.
"""

import logging
import multiprocessing
import os
import re
import sys
import typing

import coloredlogs
import dns.exception
import dns.name
import dns.resolver
import progressbar

import regexes

# Maximum time (seconds) allowed for a single DNS query.
DNS_TIMEOUT = 5.0
# Upper bound on the number of worker processes (one per nameserver).
MAX_NAMESERVERS = 512
# Optional local file listing one nameserver per line (listed in .gitignore).
NAMESERVERS_FILE = 'nameservers'

# TODO Retry failed requests


class DnsResolver(multiprocessing.Process):
    """
    Worker process for a DNS resolver.
    Will resolve DNS to match first-party subdomains.

    Each worker is bound to a single nameserver. It reads subdomains from
    ``in_queue`` until it receives ``None``, and writes one
    ``(subdomain, matching)`` tuple per input to ``out_queue``, followed by
    a single ``None`` once it is done.
    """

    def __init__(self,
                 in_queue: multiprocessing.Queue,
                 out_queue: multiprocessing.Queue,
                 server: str):
        super().__init__()
        # The logger is named after the nameserver so that log lines
        # can be attributed to a worker.
        self.log = logging.getLogger(server)

        self.in_queue = in_queue
        self.out_queue = out_queue

        self.resolver = dns.resolver.Resolver()
        self.resolver.nameservers = [server]

    def is_subdomain_matching(self, subdomain: str) -> bool:
        """
        Indicates if the subdomain redirects to a first-party tracker.

        The subdomain is resolved (A record) through this worker's
        nameserver, and the canonical name of the answer is matched against
        every pattern of ``regexes.REGEXES``.
        Any resolution failure is reported as "not matching".
        """
        # TODO Look at the whole chain rather than the last one
        # TODO Also match the ASN of the IP (caching the ASN subnetworks will do)
        try:
            query = self.resolver.query(subdomain, 'A', lifetime=DNS_TIMEOUT)
        except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer):
            # Expected outcomes for plenty of subdomains: stay silent.
            return False
        except dns.resolver.YXDOMAIN:
            self.log.warning("Query name too long for %s", subdomain)
            return False
        except dns.resolver.NoNameservers:
            self.log.warning("All nameservers broken for %s", subdomain)
            return False
        except dns.exception.Timeout:
            self.log.warning("Timeout for %s", subdomain)
            return False
        except dns.name.EmptyLabel:
            self.log.warning("Empty label for %s", subdomain)
            return False
        canonical = query.canonical_name.to_text()
        # re.match anchors at the start of the canonical name.
        return any(re.match(regex, canonical) for regex in regexes.REGEXES)

    def run(self) -> None:
        """
        Process subdomains from the input queue until the ``None`` sentinel.
        """
        self.log.info("Started")
        for subdomain in iter(self.in_queue.get, None):
            matching = self.is_subdomain_matching(subdomain)
            self.out_queue.put((subdomain, matching))
        # Tell the orchestrator that this worker will not produce more results.
        self.out_queue.put(None)
        self.log.info("Stopped")


def get_matching_subdomains(subdomains: typing.Iterable[str],
                            nameservers: typing.Optional[typing.List[str]] = None,
                            ) -> typing.Iterator[typing.Tuple[str, bool]]:
    """
    Orchestrator of the different DnsResolver processes.

    Starts one DnsResolver per nameserver (at most MAX_NAMESERVERS), feeds
    them every subdomain, then yields ``(subdomain, matching)`` tuples as
    the workers produce them. Results are not guaranteed to come back in
    input order. The input iterable is fully consumed before the first
    result is yielded.

    :param subdomains: Subdomains to test.
    :param nameservers: Nameservers to use, one worker each.
        When empty or None, the nameservers of the default
        ``dns.resolver.Resolver()`` are used.
    :raises ValueError: If no nameserver is available at all.
    """
    subdomains_queue: multiprocessing.Queue = multiprocessing.Queue()
    results_queue: multiprocessing.Queue = multiprocessing.Queue()

    # Use internal resolver by default
    servers = nameservers or dns.resolver.Resolver().nameservers
    servers = servers[:MAX_NAMESERVERS]
    if not servers:
        # Without any worker the queued subdomains would never be consumed.
        raise ValueError("No nameserver available")

    # Create workers
    workers = [DnsResolver(subdomains_queue, results_queue, server)
               for server in servers]
    for worker in workers:
        worker.start()

    # Send data to workers
    for subdomain in subdomains:
        subdomains_queue.put(subdomain)

    # Send sentinel to each worker
    # sentinel = None ~= EOF
    for _ in workers:
        subdomains_queue.put(None)
    subdomains_queue.close()

    # Wait for one sentinel per worker
    # In the meantime output results
    for _ in workers:
        yield from iter(results_queue.get, None)
    results_queue.close()

    # Every worker has sent its sentinel and the results queue has been
    # drained, so joining cannot block on unflushed queue data.
    for worker in workers:
        worker.join()


def _count_subdomains(filename: str) -> int:
    """
    Count the non-blank lines of a file, i.e. the entries that will
    actually be resolved.
    """
    with open(filename) as textio:
        return sum(1 for line in textio if line.strip())


def _read_nameservers(path: str) -> typing.List[str]:
    """
    Read one nameserver per line from ``path``, ignoring blank lines.
    Returns an empty list when the file does not exist.
    """
    if not os.path.isfile(path):
        return []
    with open(path) as textio:
        return [line.strip() for line in textio if line.strip()]


def main() -> None:
    """
    Read subdomains from the file given as first argument (or from the
    standard input when absent or ``-``) and print the matching ones.
    """
    coloredlogs.install(
        level='DEBUG',
        fmt='%(asctime)s %(name)s[%(process)d] %(levelname)s %(message)s'
    )

    # Progress bar
    widgets = [
        progressbar.Percentage(),
        ' ', progressbar.SimpleProgress(),
        ' ', progressbar.Bar(),
        ' ', progressbar.Timer(),
        ' ', progressbar.AdaptiveTransferSpeed(unit='req'),
        ' ', progressbar.AdaptiveETA(),
    ]
    progress = progressbar.ProgressBar(widgets=widgets)

    # Parsing arguments
    if len(sys.argv) > 2:
        sys.exit(f"Usage: {sys.argv[0]} [FILE|-]")
    filename = None
    if len(sys.argv) == 2 and sys.argv[1] != '-':
        filename = sys.argv[1]
        # Only non-blank lines are resolved, so only those are counted.
        progress.max_value = _count_subdomains(filename)

    # Reading nameservers
    servers = _read_nameservers(NAMESERVERS_FILE)

    textio = open(filename) if filename else sys.stdin
    try:
        # Cleaning input
        iterator = filter(None, map(str.strip, textio))

        progress.start()
        for subdomain, matching in get_matching_subdomains(iterator, servers):
            progress.update(progress.value + 1)
            if matching:
                print(subdomain)
        progress.finish()
    finally:
        # Never close the standard input, only the file we opened.
        if filename:
            textio.close()


if __name__ == '__main__':
    main()