#!/usr/bin/env python3
# pylint: disable=C0103
"""
From a list of URLs, output the subdomains accessed by the websites.

URLs are read one per line, either from the file given as the single
command-line argument or from standard input (no argument, or ``-``).
The network location of every request that received a response is printed
on standard output, one per line.
"""

import sys
import typing
import urllib.parse
import time

import progressbar
import selenium.webdriver.firefox.options
import seleniumwire.webdriver


def subdomain_from_url(url: str) -> str:
    """
    Extract the domain part from an url.

    Returns the ``netloc`` component as parsed by ``urllib.parse.urlparse``,
    or an empty string when the URL has none.
    """
    parsed = urllib.parse.urlparse(url)
    return parsed.netloc


def collect_subdomains(
        url: str, *, wait_seconds: float = 10.0) -> typing.Iterable[str]:
    """
    Load an URL into an headless browser and return all the domains it tried
    to access.

    This is a generator: one entry is yielded per captured request that
    received a response, so the same domain may be yielded several times.

    ``wait_seconds`` is how long to wait after the page load returns before
    reading the captured requests, to give late requests time to happen.
    """
    options = selenium.webdriver.firefox.options.Options()
    options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(
        executable_path='geckodriver', options=options)
    try:
        driver.get(url)
        time.sleep(wait_seconds)
        for request in driver.requests:
            if request.response:
                # NOTE(review): this relies on `request.path` holding a URL
                # that includes the network location; if the installed
                # selenium-wire exposes only the path component here, the
                # result is an empty string - confirm against the version
                # in use.
                yield subdomain_from_url(request.path)
    finally:
        # Always end the browser session, including when the page load fails
        # or the caller stops iterating early, so that no browser or driver
        # process is left behind.
        driver.quit()


def collect_subdomains_standalone(url: str) -> None:
    """
    Print the domains accessed when loading the given URL, one per line.

    Surrounding whitespace is stripped from ``url`` and blank input is
    ignored. Failures are reported on standard error and otherwise ignored,
    so that one broken URL does not stop the processing of a whole list.
    """
    url = url.strip()
    if not url:
        return
    try:
        for subdomain in collect_subdomains(url):
            print(subdomain)
    except Exception as error:  # pylint: disable=broad-except
        # Best effort: keep going, but do not hide the failure. Catching
        # Exception (not everything) lets Ctrl-C and sys.exit() through.
        print('Failed to process %s: %r' % (url, error), file=sys.stderr)


def main(argv: typing.List[str]) -> int:
    """
    Command-line entry point; returns the process exit code.

    ``argv`` is the full argument vector, program name included. At most one
    argument is accepted: the file containing the URLs, or ``-`` for
    standard input.
    """
    if len(argv) > 2:
        print('usage: %s [FILE|-]' % argv[0], file=sys.stderr)
        return 2

    if len(argv) == 2 and argv[1] != '-':
        filename = argv[1]
        # First pass only counts the lines so the progress bar knows its
        # maximum value.
        with open(filename) as url_file:
            num_lines = sum(1 for _ in url_file)
        with open(filename) as url_file:
            for line in progressbar.progressbar(
                    url_file, max_value=num_lines):
                collect_subdomains_standalone(line)
    else:
        for line in sys.stdin:
            collect_subdomains_standalone(line)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))