#!/usr/bin/env python3
# pylint: disable=C0103

"""
From a list of URLs, output the subdomains
accessed by the websites.

Usage: pass a file name as the only argument to read one URL per line
from that file (with a progress bar), or pass ``-`` / no argument to
read the URLs from standard input.
"""

import logging
import sys
import time
import typing
import urllib.parse

import progressbar
import selenium.webdriver.firefox.options
import seleniumwire.webdriver

log = logging.getLogger("cs")

# Lazily-created browser instance shared by every call to
# collect_subdomains(); reset to None after a failure so that the next
# call starts a fresh browser.
DRIVER = None

# Total time (seconds) spent scrolling each page, and the number of
# scroll steps that time is divided into.
SCROLL_TIME = 10.0
SCROLL_STEPS = 100
# JavaScript run at each step: scroll down by 1/SCROLL_STEPS of the
# page height.
SCROLL_CMD = f"window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})"


def new_driver() -> seleniumwire.webdriver.browser.Firefox:
    """
    Create a Selenium Wire Firefox driver.

    The profile switches off Firefox's tracking-protection preferences
    and sets ``network.cookie.cookieBehavior`` to 0 (presumably "accept
    all cookies" - confirm against the Firefox preference docs).
    """
    profile = selenium.webdriver.FirefoxProfile()
    profile.set_preference("privacy.trackingprotection.enabled", False)
    profile.set_preference("network.cookie.cookieBehavior", 0)
    profile.set_preference("privacy.trackingprotection.pbmode.enabled", False)
    profile.set_preference("privacy.trackingprotection.cryptomining.enabled", False)
    profile.set_preference("privacy.trackingprotection.fingerprinting.enabled", False)
    options = selenium.webdriver.firefox.options.Options()
    # Headless mode is currently disabled:
    # options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(
        profile, executable_path="geckodriver", options=options
    )
    return driver


def subdomain_from_url(url: str) -> str:
    """
    Extract the domain part from a URL.

    Returns the ``netloc`` component as parsed by ``urllib.parse``.
    """
    parsed = urllib.parse.urlparse(url)
    return parsed.netloc


def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load a URL into the shared browser and yield the domain of every
    request it made that received a response.

    The page is scrolled down in SCROLL_STEPS steps over SCROLL_TIME
    seconds before the captured requests are read.

    Errors are logged rather than raised; on error the browser is shut
    down and will be recreated by the next call.
    """
    global DRIVER
    if not DRIVER:
        DRIVER = new_driver()

    try:
        # Selenium Wire keeps every captured request until told
        # otherwise; clear them so that only this page's requests are
        # reported.
        del DRIVER.requests
        DRIVER.get(url)
        for _ in range(SCROLL_STEPS):
            DRIVER.execute_script(SCROLL_CMD)
            time.sleep(SCROLL_TIME / SCROLL_STEPS)
        for request in DRIVER.requests:
            if request.response:
                yield subdomain_from_url(request.path)
    except Exception:  # pylint: disable=broad-except
        # Deliberately broad (best-effort crawl), but not a bare
        # ``except``: GeneratorExit and KeyboardInterrupt must propagate.
        log.exception("Error")
        DRIVER.quit()
        DRIVER = None


def collect_subdomains_standalone(url: str) -> None:
    """
    Print, one per line, the domains accessed when loading ``url``.

    Surrounding whitespace is stripped; blank input is ignored.
    """
    url = url.strip()
    if not url:
        return
    for subdomain in collect_subdomains(url):
        print(subdomain)


def main(argv: typing.List[str]) -> None:
    """
    Command-line entry point: process every URL from a file or stdin.

    ``argv`` is the full argument vector (program name first).
    """
    # Explicit check instead of ``assert``, which is stripped under -O.
    if len(argv) > 2:
        sys.exit(f"usage: {argv[0]} [FILE|-]")

    try:
        if len(argv) == 2 and argv[1] != "-":
            # Read the file once, inside a context manager, so that the
            # line count for the progress bar does not leak a handle.
            with open(argv[1]) as url_file:
                lines = url_file.readlines()
            for line in progressbar.progressbar(lines, max_value=len(lines)):
                collect_subdomains_standalone(line)
        else:
            for line in sys.stdin:
                collect_subdomains_standalone(line)
    finally:
        # Always shut the browser down, even on error or Ctrl-C.
        if DRIVER:
            DRIVER.quit()


if __name__ == "__main__":
    main(sys.argv)