eulaurarien/filter_subdomains.py

73 lines
1.9 KiB
Python
Raw Normal View History

2019-11-10 17:14:25 +00:00
#!/usr/bin/env python3
2019-11-10 20:59:06 +00:00
# pylint: disable=C0103
2019-11-10 17:14:25 +00:00
"""
From a list of subdomains, output only
the ones resolving to a first-party tracker.
"""
import re
import sys
import dns.resolver
2019-11-10 21:18:27 +00:00
import dns.exception
2019-11-10 20:59:06 +00:00
import progressbar
2019-11-10 17:14:25 +00:00
import regexes
2019-11-10 21:18:27 +00:00
# Passed as the `lifetime` argument of the DNS query made for each subdomain
# (presumably a duration in seconds -- confirm against the dnspython docs).
DNS_TIMEOUT = 5.0
2019-11-10 17:14:25 +00:00
def is_subdomain_matching(subdomain: str) -> bool:
    """
    Tell whether the subdomain redirects to a first-party tracker.

    An 'A' query is issued for the subdomain and the canonical name of
    the answer is tested against every pattern of regexes.REGEXES.
    Every handled resolution failure yields False; all of them except
    NXDOMAIN and NoAnswer are additionally reported on stderr.
    """
    # TODO Look at the whole chain rather than the last one
    try:
        answer = dns.resolver.query(subdomain, 'A', lifetime=DNS_TIMEOUT)
    except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer):
        # Nothing to inspect for this name: silently not a match
        return False
    except dns.resolver.YXDOMAIN:
        failure = f"Query name too long for {subdomain}"
    except dns.resolver.NoNameservers:
        failure = f"All nameservers broken for {subdomain}"
    except dns.exception.Timeout:
        failure = f"Timeout for {subdomain}"
    else:
        target = answer.canonical_name.to_text()
        return any(re.match(pattern, target) for pattern in regexes.REGEXES)

    # Only the reported failures reach this point
    print(failure, file=sys.stderr)
    return False
2019-11-10 20:59:06 +00:00
def is_subdomain_matching_standalone(subdomain: str) -> None:
    """
    Write the subdomain to stdout when it redirects to a first-party tracker.

    Surrounding whitespace is stripped beforehand, and an entry that is
    empty once stripped is ignored without being checked.
    """
    name = subdomain.strip()
    if name and is_subdomain_matching(name):
        print(name)
2019-11-10 17:14:25 +00:00
if __name__ == '__main__':
    # Usage: filter_subdomains.py [FILE|-]
    # Subdomains are read one per line from FILE, or from stdin when the
    # argument is missing or is '-'.
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(sys.argv) > 2:
        sys.exit(f"Usage: {sys.argv[0]} [FILE|-]")

    filename = None
    if len(sys.argv) == 2 and sys.argv[1] != '-':
        filename = sys.argv[1]

    if filename is not None:
        # First pass: count the lines so the progress bar knows its maximum.
        with open(filename) as counted_file:
            num_lines = sum(1 for _ in counted_file)
        # Second pass: process the lines. The `with` block closes the file
        # itself; closing only the progress-bar wrapper would not.
        with open(filename) as subdomains_file:
            for line in progressbar.progressbar(subdomains_file,
                                                max_value=num_lines):
                is_subdomain_matching_standalone(line)
    else:
        for line in sys.stdin:
            is_subdomain_matching_standalone(line)