#!/usr/bin/env python3
# pylint: disable=C0103

"""
From a list of URLs, output the subdomains
accessed by the websites.
"""

import logging
import sys
import time
import typing
import urllib.parse

import progressbar
import selenium.webdriver.firefox.options
import seleniumwire.webdriver

log = logging.getLogger('cs')

# Browser instance shared by every call to collect_subdomains(); created
# lazily on first use and reset to None after a failure.
DRIVER = None
# Each page is scrolled in SCROLL_STEPS increments spread over SCROLL_TIME
# seconds (presumably to trigger lazily loaded resources).
SCROLL_TIME = 10.0
SCROLL_STEPS = 100
SCROLL_CMD = f'window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})'


def new_driver() -> seleniumwire.webdriver.browser.Firefox:
    """
    Create a request-capturing Firefox driver.

    The profile turns off Firefox's tracking-protection preferences and
    sets the cookie behaviour preference to 0, presumably so that
    third-party requests are not blocked before they can be recorded.
    """
    profile = selenium.webdriver.FirefoxProfile()
    preferences = (
        ('privacy.trackingprotection.enabled', False),
        ('network.cookie.cookieBehavior', 0),
        ('privacy.trackingprotection.pbmode.enabled', False),
        ('privacy.trackingprotection.cryptomining.enabled', False),
        ('privacy.trackingprotection.fingerprinting.enabled', False),
    )
    for name, value in preferences:
        profile.set_preference(name, value)
    options = selenium.webdriver.firefox.options.Options()
    # Headless mode is intentionally left off; enable the line below to run
    # without a visible browser window.
    # options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(profile,
                                            executable_path='geckodriver',
                                            options=options)
    return driver


def subdomain_from_url(url: str) -> str:
    """
    Extract the domain part (network location) from an URL.
    """
    parsed = urllib.parse.urlparse(url)
    return parsed.netloc


def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load an URL into a browser and yield
    all the domains it tried to access.

    The shared DRIVER is created on first use. Only requests that received
    a response are reported. Any error is logged and the driver is shut
    down, so that the next call starts with a fresh browser.
    """
    global DRIVER  # pylint: disable=global-statement
    if not DRIVER:
        DRIVER = new_driver()
    try:
        # The driver is reused between URLs and selenium-wire keeps every
        # captured request until told otherwise: clear them so that only
        # the requests made for this URL are reported.
        del DRIVER.requests
        DRIVER.get(url)
        for _ in range(SCROLL_STEPS):
            DRIVER.execute_script(SCROLL_CMD)
            time.sleep(SCROLL_TIME / SCROLL_STEPS)
        for request in DRIVER.requests:
            if request.response:
                # NOTE(review): relies on request.path holding the full URL,
                # as in older selenium-wire releases - confirm against the
                # installed version.
                yield subdomain_from_url(request.path)
    # Not a bare except: KeyboardInterrupt, SystemExit and GeneratorExit
    # must propagate instead of being logged as page errors.
    except Exception:  # pylint: disable=broad-except
        log.exception("Error")
        try:
            DRIVER.quit()
        finally:
            # Forget the driver even if quit() failed.
            DRIVER = None


def collect_subdomains_standalone(url: str) -> None:
    """
    Print, one per line, the domains accessed when loading an URL.

    Surrounding whitespace is stripped; blank input is ignored.
    """
    url = url.strip()
    if not url:
        return
    for subdomain in collect_subdomains(url):
        print(subdomain)


def main() -> None:
    """
    Process the URLs listed in the file given as sole argument,
    or read from standard input when the argument is absent or '-'.
    """
    if len(sys.argv) > 2:
        sys.exit(f'usage: {sys.argv[0]} [FILE|-]')
    filename = None
    if len(sys.argv) == 2 and sys.argv[1] != '-':
        filename = sys.argv[1]
    try:
        if filename:
            with open(filename) as urls_file:
                # First pass counts the lines so the progress bar knows
                # its maximum, then rewind for the real pass.
                num_lines = sum(1 for _ in urls_file)
                urls_file.seek(0)
                for line in progressbar.progressbar(urls_file,
                                                    max_value=num_lines):
                    collect_subdomains_standalone(line)
        else:
            for line in sys.stdin:
                collect_subdomains_standalone(line)
    finally:
        # Always shut the browser down, even when interrupted.
        if DRIVER:
            DRIVER.quit()


if __name__ == '__main__':
    main()