
253 lines
8 KiB
Raw Normal View History

2019-11-10 17:14:25 +00:00
#!/usr/bin/env python3
"""
From a list of subdomains, output only
the ones resolving to a first-party tracker.
"""
import argparse
2019-11-14 09:45:06 +00:00
import logging
import os
2019-11-14 14:03:20 +00:00
import queue
2019-11-10 17:14:25 +00:00
import re
import sys
2019-11-14 14:03:20 +00:00
import threading
2019-11-14 09:45:06 +00:00
import typing
2019-11-10 17:14:25 +00:00
2019-11-14 09:45:06 +00:00
import coloredlogs
2019-11-10 21:18:27 +00:00
import dns.exception
2019-11-14 09:45:06 +00:00
import dns.resolver
2019-11-10 20:59:06 +00:00
import progressbar
2019-11-10 17:14:25 +00:00
import regexes
2019-11-14 12:10:14 +00:00
2019-11-14 14:03:20 +00:00
2019-11-14 09:45:06 +00:00
2019-11-14 11:57:06 +00:00
2019-11-14 14:03:20 +00:00
class Worker(threading.Thread):
    """
    Worker thread for a DNS resolver.

    Will resolve DNS to match first-party subdomains.
    Subdomains are read from the orchestrator's ``subdomains_queue`` and
    ``(subdomain, matching)`` tuples are written to its ``results_queue``.
    """

    def change_nameserver(self) -> None:
        """
        Assign this worker another nameserver from the queue.

        The nameserver is taken without blocking from the orchestrator's
        ``nameservers_queue``; when that queue is empty the orchestrator is
        asked to refill it and the lookup is retried.
        """
        server = None
        while server is None:
            try:
                server = self.orchestrator.nameservers_queue.get(block=False)
            except queue.Empty:
                # NOTE(review): the handler body was missing from the
                # reviewed text; refilling matches the documented purpose
                # of Orchestrator.refill_nameservers_queue — confirm.
                self.orchestrator.refill_nameservers_queue()
        self.log.debug("Using nameserver: %s", server)
        self.resolver.nameservers = [server]

    def __init__(self,
                 orchestrator: 'Orchestrator',
                 index: int = 0):
        """
        Create the worker.

        :param orchestrator: Object owning the queues this worker uses.
        :param index: Number used to name this worker's logger.
        """
        super().__init__()
        self.log = logging.getLogger(f'worker{index:03d}')
        self.orchestrator = orchestrator

        self.resolver = dns.resolver.Resolver()
        # NOTE(review): this call was missing from the reviewed text.
        # Without it nothing ever applies the orchestrator's nameservers
        # to the resolver — confirm against the original file.
        self.change_nameserver()

    def is_subdomain_matching(self, subdomain: str) -> typing.Optional[bool]:
        """
        Indicates if the subdomain redirects to a first-party tracker.

        Returns True when the canonical name of the answer matches one of
        ``regexes.REGEXES``, False when it does not or when the name has no
        answer, and None if the nameserver was unable to satisfy the
        request.
        """
        # TODO Look at the whole chain rather than the last one
        # TODO Also match the ASN of the IP (caching the ASN subnetworks will do)
        try:
            query = self.resolver.query(subdomain, 'A', lifetime=DNS_TIMEOUT)
        except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer):
            return False
        except dns.resolver.YXDOMAIN:
            self.log.warning("Query name too long for %s", subdomain)
            return None
        except dns.resolver.NoNameservers:
            # NOTE Most of the time this error message means that the domain
            # does not exist, but sometimes it means that the server
            # itself is broken. So we count on the retry logic.
            self.log.warning("All nameservers broken for %s", subdomain)
            return None
        except dns.exception.Timeout:
            # NOTE Same as above
            self.log.warning("Timeout for %s", subdomain)
            return None
        except dns.name.EmptyLabel:
            # NOTE(review): the `except` line was missing from the reviewed
            # text; the exception class is inferred from the log message
            # below — confirm.
            self.log.warning("Empty label for %s", subdomain)
            return None

        canonical = query.canonical_name.to_text()
        return any(re.match(regex, canonical) for regex in regexes.REGEXES)

    def run(self) -> None:
        """
        Resolve subdomains from the queue until a None sentinel is read.

        Each subdomain is tried up to NUMBER_TRIES times; if every try is
        inconclusive it is reported as not matching. A None sentinel is
        always put on the results queue when the worker ends, even on an
        unexpected error, so that the consumer waiting for one sentinel
        per worker cannot block forever.
        """
        self.log.info("Started")
        try:
            subdomains = iter(self.orchestrator.subdomains_queue.get, None)
            for subdomain in subdomains:
                # Stays None if NUMBER_TRIES is 0 or every try fails
                matching = None
                for _ in range(NUMBER_TRIES):
                    matching = self.is_subdomain_matching(subdomain)
                    if matching is not None:
                        break

                # If it wasn't found after multiple tries
                if matching is None:
                    self.log.error("Gave up on %s", subdomain)
                    matching = False

                result = (subdomain, matching)
                self.orchestrator.results_queue.put(result)
        finally:
            # sentinel = None ~= no more results from this worker
            self.orchestrator.results_queue.put(None)
        self.log.info("Stopped")
2019-11-10 17:14:25 +00:00
2019-11-14 14:03:20 +00:00
class Orchestrator():
    """
    Orchestrator of the different Worker threads.

    Owns the three queues shared with the workers: subdomains to resolve,
    results, and nameservers to use.
    """

    def refill_nameservers_queue(self) -> None:
        """
        Re-fill the given nameservers into the nameservers queue.

        Done every time the queue is empty, making it
        basically looping and infinite.
        """
        # Might be in a race condition but that's probably fine
        for nameserver in self.nameservers:
            self.nameservers_queue.put(nameserver)
        self.log.info("Refilled nameserver queue")

    def __init__(self, subdomains: typing.Iterable[str],
                 nameservers: typing.Optional[typing.List[str]] = None):
        """
        Create the orchestrator.

        :param subdomains: Subdomains to test; consumed lazily.
        :param nameservers: Nameservers to hand out to the workers. When
            empty or None, the nameservers of a default
            ``dns.resolver.Resolver`` are used.
        """
        self.log = logging.getLogger('orchestrator')
        self.subdomains = subdomains

        # Use internal resolver by default
        self.nameservers = nameservers or dns.resolver.Resolver().nameservers

        # NOTE(review): the argument of this call was missing from the
        # reviewed text. A bound is needed for the reader thread not to
        # store all the subdomains at once; its value is a guess — confirm.
        self.subdomains_queue: queue.Queue = queue.Queue(
            maxsize=NUMBER_THREADS)
        self.results_queue: queue.Queue = queue.Queue()
        self.nameservers_queue: queue.Queue = queue.Queue()

    def fill_subdomain_queue(self) -> None:
        """
        Read the subdomains in input and put them into the queue.

        Done in a thread so we can both:
        - yield the results as they come
        - not store all the subdomains at once
        """
        self.log.info("Started reading subdomains")
        # Send data to workers
        for subdomain in self.subdomains:
            self.subdomains_queue.put(subdomain)

        self.log.info("Finished reading subdomains")
        # Send sentinel to each worker
        # sentinel = None ~= EOF
        for _ in range(NUMBER_THREADS):
            self.subdomains_queue.put(None)

    def run(self) -> typing.Iterable[typing.Tuple[str, bool]]:
        """
        Yield the results.

        Starts NUMBER_THREADS workers and the thread feeding them, then
        yields every ``(subdomain, matching)`` tuple from the results queue
        until one None sentinel per worker has been read.
        """
        # Create workers
        self.log.info("Creating workers")
        for i in range(NUMBER_THREADS):
            Worker(self, i).start()

        fill_thread = threading.Thread(target=self.fill_subdomain_queue)
        fill_thread.start()

        # Wait for one sentinel per worker
        # In the meantime output results
        for _ in range(NUMBER_THREADS):
            yield from iter(self.results_queue.get, None)

        self.log.info("Waiting for reader thread")
        fill_thread.join()
def main() -> None:
    """
    Main function when used directly.

    Takes as an input a filename (or nothing, for stdin)
    that will be read; the subdomains that are trackers
    are written to the output (stdout by default).
    Uses the file `nameservers` as the list of nameservers
    to use, or else it will use the system defaults.
    Also shows a nice progressbar.
    """

    # Initialization
    # NOTE(review): only the `fmt=` argument of this call was present in
    # the reviewed text; other arguments may have been lost — confirm.
    coloredlogs.install(
        fmt='%(asctime)s %(name)s %(levelname)s %(message)s'
    )

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Filter first-party trackers from a list of subdomains")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Input file with one subdomain per line")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Outptut file with one tracking subdomain per line")
    # parser.add_argument(
    #     '-n', '--nameserver', type=argparse.FileType('r'),
    #     default='nameservers', help="File with one nameserver per line")
    # parser.add_argument(
    #     '-j', '--workers', type=int, default=512,
    #     help="Number of threads to use")
    args = parser.parse_args()

    # Progress bar
    # NOTE(review): the list is reproduced as found; a first widget may
    # have been lost before the leading separator — confirm.
    widgets = [
        ' ', progressbar.SimpleProgress(),
        ' ', progressbar.Bar(),
        ' ', progressbar.Timer(),
        ' ', progressbar.AdaptiveTransferSpeed(unit='req'),
        ' ', progressbar.AdaptiveETA(),
    ]
    progress = progressbar.ProgressBar(widgets=widgets)

    # Cleaning input: strip each line, then drop the empty ones
    iterator = iter(args.input)
    iterator = map(str.strip, iterator)
    iterator = filter(None, iterator)

    # Reading nameservers
    servers: typing.List[str] = []
    if os.path.isfile('nameservers'):
        # Context manager so the file is closed once read
        with open('nameservers') as nameservers_file:
            servers = list(filter(None, map(str.strip, nameservers_file)))

    for subdomain, matching in Orchestrator(iterator, servers).run():
        progress.update(progress.value + 1)
        if matching:
            print(subdomain, file=args.output)
2019-11-14 09:45:06 +00:00
2019-11-14 14:03:20 +00:00
if __name__ == '__main__':