Generates a host list of first-party trackers for ad-blocking.

#!/usr/bin/env python3
# pylint: disable=C0103

"""
From a list of URLs, output the subdomains
accessed by the websites.
"""

import logging
import sys
import time
import typing
import urllib.parse

import progressbar
import selenium.webdriver.firefox.options
import seleniumwire.webdriver

log = logging.getLogger('cs')

DRIVER = None
SCROLL_TIME = 10.0
SCROLL_STEPS = 100
SCROLL_CMD = f'window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})'
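
# The page is scrolled in SCROLL_STEPS increments spread over SCROLL_TIME
# seconds, presumably so that lazily loaded resources (ads, trackers, images
# below the fold) get a chance to fire their network requests before the
# captured traffic is read back.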


def new_driver() -> seleniumwire.webdriver.browser.Firefox:
    """
    Create a Firefox driver with every tracking protection disabled,
    so that tracker requests are actually made and can be captured.
    """
    profile = selenium.webdriver.FirefoxProfile()
    profile.set_preference('privacy.trackingprotection.enabled', False)
    profile.set_preference('network.cookie.cookieBehavior', 0)
    profile.set_preference('privacy.trackingprotection.pbmode.enabled', False)
    profile.set_preference(
        'privacy.trackingprotection.cryptomining.enabled', False)
    profile.set_preference(
        'privacy.trackingprotection.fingerprinting.enabled', False)
    options = selenium.webdriver.firefox.options.Options()
    # options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(
        profile, executable_path='geckodriver', options=options)
    return driver
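
# Note: the '-headless' argument is left commented out above, so Firefox runs
# with a visible window while crawling; uncommenting it would run the browser
# headless, as the docstring of collect_subdomains() suggests.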


def subdomain_from_url(url: str) -> str:
    """
    Extract the domain part from a URL.
    """
    parsed = urllib.parse.urlparse(url)
    return parsed.netloc
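
# Example with a hypothetical URL:
#   subdomain_from_url('https://cdn.example.com/t.js') -> 'cdn.example.com'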


def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load a URL into a headless browser and return all the domains
    it tried to access.
    """
    global DRIVER
    if not DRIVER:
        DRIVER = new_driver()
    try:
        DRIVER.get(url)
        for _ in range(SCROLL_STEPS):
            DRIVER.execute_script(SCROLL_CMD)
            time.sleep(SCROLL_TIME / SCROLL_STEPS)
        for request in DRIVER.requests:
            if request.response:
                yield subdomain_from_url(request.path)
    except Exception:
        log.exception("Error")
        # On any failure, discard the browser; a fresh one is created
        # on the next call.
        DRIVER.quit()
        DRIVER = None
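
# `DRIVER.requests` is selenium-wire's list of captured network requests; in
# the selenium-wire versions this script appears to target, `request.path`
# holds the full request URL (newer releases expose it as `request.url`).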


def collect_subdomains_standalone(url: str) -> None:
    """
    Print the subdomains accessed by a single URL, one per line.
    """
    url = url.strip()
    if not url:
        return
    for subdomain in collect_subdomains(url):
        print(subdomain)


if __name__ == '__main__':
    assert len(sys.argv) <= 2
    filename = None
    if len(sys.argv) == 2 and sys.argv[1] != '-':
        # Read URLs from a file, with a progress bar sized to its line count.
        filename = sys.argv[1]
        with open(filename) as counted:
            num_lines = sum(1 for line in counted)
        iterator = progressbar.progressbar(open(filename), max_value=num_lines)
    else:
        # No argument (or '-'): read URLs from standard input.
        iterator = sys.stdin
    for line in iterator:
        collect_subdomains_standalone(line)
    if DRIVER:
        DRIVER.quit()
    if filename:
        iterator.close()
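
A minimal way to run the script, assuming it is saved as collect_subdomains.py,
that geckodriver is on the PATH, and that websites.txt (a hypothetical file
name) holds one URL per line:

    python3 collect_subdomains.py websites.txt > subdomains.txt
    cat websites.txt | python3 collect_subdomains.py -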