diff --git a/README.md b/README.md
index b732fa9..d6e0a36 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,7 @@ Just to build the list, you can find an already-built list in the releases.
 - Selenium
 - seleniumwire
 - dnspython
+- [progressbar2](https://pypi.org/project/progressbar2/)
 
 And then just run `eulaurarien.sh`.
 
diff --git a/collect_subdomains.py b/collect_subdomains.py
index 4177812..26c50c7 100755
--- a/collect_subdomains.py
+++ b/collect_subdomains.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# pylint: disable=C0103
 
 """
 From a list of URLs, output the subdomains
@@ -9,6 +10,7 @@
 import sys
 import typing
 import urllib.parse
+import progressbar
 import selenium.webdriver.firefox.options
 import seleniumwire.webdriver
 
@@ -38,10 +40,31 @@ def collect_subdomains(url: str) -> typing.Iterable[str]:
     driver.close()
 
 
+def collect_subdomains_standalone(url: str) -> None:
+    """
+    Print the subdomains collected for the URL, skipping blank lines.
+    """
+    url = url.strip()
+    if not url:
+        return
+    for subdomain in collect_subdomains(url):
+        print(subdomain)
+
+
 if __name__ == '__main__':
-    for line in sys.stdin:
-        line = line.strip()
-        if not line:
-            continue
-        for subdomain in collect_subdomains(line):
-            print(subdomain)
+    # Validate explicitly: an assert would be stripped under `python -O`
+    if len(sys.argv) > 2:
+        sys.exit('usage: {} [FILE|-]'.format(sys.argv[0]))
+    if len(sys.argv) == 2 and sys.argv[1] != '-':
+        filename = sys.argv[1]
+        # Count the lines up front so the progress bar knows its total
+        with open(filename) as input_file:
+            num_lines = sum(1 for _ in input_file)
+        # The with block closes the file even if a line fails
+        with open(filename) as input_file:
+            for line in progressbar.progressbar(input_file,
+                                                max_value=num_lines):
+                collect_subdomains_standalone(line)
+    else:
+        for line in sys.stdin:
+            collect_subdomains_standalone(line)
diff --git a/eulaurarien.sh b/eulaurarien.sh
index 88cfc14..d6e3a0e 100755
--- a/eulaurarien.sh
+++ b/eulaurarien.sh
@@ -3,11 +3,11 @@
 # Main script for eulaurarien
 
 # Get all subdomains accessed by each website in the website list
-cat websites.list | ./collect_subdomains.py > subdomains.list
+./collect_subdomains.py websites.list > subdomains.list
 sort -u subdomains.list > subdomains.sorted.list
 
 # Filter out the subdomains not pointing to a first-party tracker
-cat subdomains.sorted.list | ./filter_subdomains.py > toblock.list
+./filter_subdomains.py subdomains.sorted.list > toblock.list
 sort -u toblock.list > toblock.sorted.list
 
 # Format the blocklist so it can be used as a hostlist
diff --git a/filter_subdomains.py b/filter_subdomains.py
index fd3a590..d630b1a 100755
--- a/filter_subdomains.py
+++ b/filter_subdomains.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# pylint: disable=C0103
 
 """
 From a list of subdomains, output only
@@ -9,6 +10,7 @@
 import re
 import sys
 import dns.resolver
+import progressbar
 
 import regexes
 
@@ -18,7 +20,10 @@ def is_subdomain_matching(subdomain: str) -> bool:
     Indicates if the subdomain redirects to a first-party tracker.
     """
     # TODO Look at the whole chain rather than the last one
-    query = dns.resolver.query(subdomain, 'A')
+    try:
+        query = dns.resolver.query(subdomain, 'A')
+    except dns.resolver.NXDOMAIN:
+        return False
     canonical = query.canonical_name.to_text()
     for regex in regexes.REGEXES:
         if re.match(regex, canonical):
@@ -26,10 +31,31 @@ def is_subdomain_matching(subdomain: str) -> bool:
     return False
 
 
+def is_subdomain_matching_standalone(subdomain: str) -> None:
+    """
+    Print the subdomain if it redirects to a first-party tracker.
+    """
+    subdomain = subdomain.strip()
+    if not subdomain:
+        return
+    if is_subdomain_matching(subdomain):
+        print(subdomain)
+
+
 if __name__ == '__main__':
-    for line in sys.stdin:
-        line = line.strip()
-        if not line:
-            continue
-        if is_subdomain_matching(line):
-            print(line)
+    # Validate explicitly: an assert would be stripped under `python -O`
+    if len(sys.argv) > 2:
+        sys.exit('usage: {} [FILE|-]'.format(sys.argv[0]))
+    if len(sys.argv) == 2 and sys.argv[1] != '-':
+        filename = sys.argv[1]
+        # Count the lines up front so the progress bar knows its total
+        with open(filename) as input_file:
+            num_lines = sum(1 for _ in input_file)
+        # The with block closes the file even if a line fails
+        with open(filename) as input_file:
+            for line in progressbar.progressbar(input_file,
+                                                max_value=num_lines):
+                is_subdomain_matching_standalone(line)
+    else:
+        for line in sys.stdin:
+            is_subdomain_matching_standalone(line)