eulaurarien/filter_subdomains.py

76 lines
2 KiB
Python
Executable file

#!/usr/bin/env python3
# pylint: disable=C0103
"""
From a list of subdomains, output only
the ones resolving to a first-party tracker.
"""
import re
import sys
import dns.resolver
import dns.exception
import progressbar
import regexes
# Upper bound handed to dns.resolver.query() as its `lifetime` argument,
# i.e. the total time allowed for resolving one subdomain
# (presumably seconds, per dnspython's convention -- confirm).
DNS_TIMEOUT = 5.0
def is_subdomain_matching(subdomain: str) -> bool:
    """
    Tell whether a subdomain resolves to a first-party tracker.

    The A record of `subdomain` is queried with DNS_TIMEOUT as the
    resolver lifetime, and the canonical name of the answer is tested
    against every pattern in regexes.REGEXES (anchored at the start,
    as re.match does).

    Returns True when at least one pattern matches. Returns False when
    none does, or when the lookup fails with one of the handled DNS
    errors; every handled error except NXDOMAIN and NoAnswer is also
    reported on stderr.
    """
    # TODO Look at the whole chain rather than the last one
    try:
        answer = dns.resolver.query(subdomain, 'A', lifetime=DNS_TIMEOUT)
    except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer):
        # These two failures are not reported, only treated as "no match".
        return False
    except dns.resolver.YXDOMAIN:
        print(f"Query name too long for {subdomain}", file=sys.stderr)
        return False
    except dns.resolver.NoNameservers:
        print(f"All nameservers broken for {subdomain}", file=sys.stderr)
        return False
    except dns.exception.Timeout:
        print(f"Timeout for {subdomain}", file=sys.stderr)
        return False
    except dns.name.EmptyLabel:
        print(f"Empty label for {subdomain}", file=sys.stderr)
        return False

    target = answer.canonical_name.to_text()
    return any(re.match(pattern, target) for pattern in regexes.REGEXES)
def is_subdomain_matching_standalone(subdomain: str) -> None:
    """
    Print the given subdomain when it resolves to a first-party tracker.

    Surrounding whitespace (including the trailing newline of an input
    line) is stripped first; input that is empty after stripping is
    ignored without performing any lookup.
    """
    candidate = subdomain.strip()
    # Short-circuit keeps blank lines from ever reaching the resolver.
    if candidate and is_subdomain_matching(candidate):
        print(candidate)
if __name__ == '__main__':
    # Usage: filter_subdomains.py [FILE|-]
    # Reads one subdomain per line from FILE, or from stdin when no
    # argument (or '-') is given, and prints the matching ones.

    # Validate explicitly: an assert would be stripped under `python -O`.
    if len(sys.argv) > 2:
        sys.exit(f"Usage: {sys.argv[0]} [FILE|-]")

    filename = None
    if len(sys.argv) == 2 and sys.argv[1] != '-':
        filename = sys.argv[1]

    if filename is None:
        # stdin has no known length, so no progress bar is shown.
        for line in sys.stdin:
            is_subdomain_matching_standalone(line)
    else:
        # First pass: count the lines so the progress bar knows its maximum.
        # The handle is closed as soon as the count is done.
        with open(filename) as counted_file:
            num_lines = sum(1 for _ in counted_file)

        # Second pass: do the actual filtering. The `with` block closes the
        # file itself; closing only the progress bar wrapper would not.
        with open(filename) as subdomains_file:
            iterator = progressbar.progressbar(subdomains_file,
                                               max_value=num_lines)
            for line in iterator:
                is_subdomain_matching_standalone(line)