eulaurarien/collect_subdomains.py

#!/usr/bin/env python3
# pylint: disable=C0103

"""
From a list of URLs, output the subdomains
accessed by the websites.
"""

import sys
import typing
import urllib.parse
import time

import progressbar
import selenium.webdriver.firefox.options
import seleniumwire.webdriver
import logging

# Module logger; used to report failures while loading a page.
log = logging.getLogger("cs")
# Shared browser instance. Created lazily by collect_subdomains() and reset
# to None after an error so that the next call starts a fresh browser.
DRIVER = None
# Total time (in seconds) spent sleeping while scrolling through a page.
SCROLL_TIME = 10.0
# Number of scroll increments performed per page.
SCROLL_STEPS = 100
# JavaScript run at each step: scroll down by 1/SCROLL_STEPS of the body height.
SCROLL_CMD = f"window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})"


def new_driver() -> seleniumwire.webdriver.browser.Firefox:
    """
    Create the selenium-wire Firefox driver used to load pages.

    The Firefox profile has the tracking protection preferences listed
    below switched off and ``network.cookie.cookieBehavior`` set to 0, so
    that requests are not filtered by those browser features.
    """
    preferences = (
        ("privacy.trackingprotection.enabled", False),
        ("network.cookie.cookieBehavior", 0),
        ("privacy.trackingprotection.pbmode.enabled", False),
        ("privacy.trackingprotection.cryptomining.enabled", False),
        ("privacy.trackingprotection.fingerprinting.enabled", False),
    )
    firefox_profile = selenium.webdriver.FirefoxProfile()
    for pref_name, pref_value in preferences:
        firefox_profile.set_preference(pref_name, pref_value)

    firefox_options = selenium.webdriver.firefox.options.Options()
    # Headless mode is currently left disabled:
    # firefox_options.add_argument('-headless')
    return seleniumwire.webdriver.Firefox(
        firefox_profile, executable_path="geckodriver", options=firefox_options
    )


def subdomain_from_url(url: str) -> str:
    """
    Return the network location (``netloc``) component of an URL.

    This is the part between the scheme and the path; it is returned
    exactly as parsed, and is an empty string when the URL has none.
    """
    return urllib.parse.urlparse(url).netloc


def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load an URL into a browser and yield the domains it tried to access.

    The page is opened in the shared module-level ``DRIVER`` (created on
    first use), then scrolled down in ``SCROLL_STEPS`` increments spread
    over ``SCROLL_TIME`` seconds, so that resources loaded after the
    initial page load get a chance to be requested.

    One domain is yielded per captured request that received a response,
    so the same domain may be yielded several times.

    Any exception raised while loading or scrolling is logged and not
    re-raised; the driver is then quit and discarded so that the next call
    starts a fresh browser.
    """
    global DRIVER
    if not DRIVER:
        DRIVER = new_driver()

    try:
        # The driver is reused across URLs and selenium-wire keeps every
        # captured request until told otherwise. Clear the ones left by
        # earlier pages so only this page's requests are reported.
        del DRIVER.requests
        DRIVER.get(url)
        pause = SCROLL_TIME / SCROLL_STEPS
        for _ in range(SCROLL_STEPS):
            DRIVER.execute_script(SCROLL_CMD)
            time.sleep(pause)
        for request in DRIVER.requests:
            # Requests that never got a response are skipped.
            if request.response:
                # NOTE(review): assumes request.path holds the full request
                # URL in the installed selenium-wire version - confirm.
                yield subdomain_from_url(request.path)
    except Exception:
        log.exception("Error")
        try:
            DRIVER.quit()
        finally:
            # Drop the reference even if quit() itself fails, so a broken
            # driver is never reused by the next call.
            DRIVER = None


def collect_subdomains_standalone(url: str) -> None:
    """
    Print, one per line, the domains accessed when loading ``url``.

    Surrounding whitespace is stripped from ``url`` first; if nothing
    remains, the function returns without doing anything.
    """
    target = url.strip()
    if target:
        for found in collect_subdomains(target):
            print(found)


if __name__ == "__main__":
    # Usage: collect_subdomains.py [FILE|-]
    # URLs are read one per line from FILE, or from standard input when no
    # argument (or "-") is given.
    # Explicit check rather than assert, which is stripped under -O.
    if len(sys.argv) > 2:
        sys.exit(f"usage: {sys.argv[0]} [FILE|-]")

    filename = None
    input_file = None
    if len(sys.argv) == 2 and sys.argv[1] != "-":
        filename = sys.argv[1]
        # First pass: count the lines so the progress bar knows its maximum.
        with open(filename) as counted:
            num_lines = sum(1 for _ in counted)
        # Second pass: the handle actually iterated over; closed below.
        input_file = open(filename)
        iterator = progressbar.progressbar(input_file, max_value=num_lines)
    else:
        iterator = sys.stdin

    try:
        for line in iterator:
            collect_subdomains_standalone(line)
    finally:
        # Always shut the browser down and release the input file, even if
        # processing was interrupted.
        try:
            if DRIVER:
                DRIVER.quit()
        finally:
            if input_file:
                input_file.close()
Initial commit 2019-11-10 17:14:25 +00:00			`#!/usr/bin/env python3`
Added progressbar and ETA 2019-11-10 20:59:06 +00:00			`# pylint: disable=C0103`
Initial commit 2019-11-10 17:14:25 +00:00
			`"""`
			`From a list of URLs, output the subdomains`
			`accessed by the websites.`
			`"""`

			`import sys`
			`import typing`
			`import urllib.parse`
Added some delay for websites subdomains collecting Some websites load their trackers after the page is done loading. 2019-11-14 05:29:24 +00:00			`import time`
Initial commit 2019-11-10 17:14:25 +00:00
Added progressbar and ETA 2019-11-10 20:59:06 +00:00			`import progressbar`
Initial commit 2019-11-10 17:14:25 +00:00			`import selenium.webdriver.firefox.options`
			`import seleniumwire.webdriver`
Improvements to subdomain collection I use this for tracker identification so it's not perfect but still it's a bit better. 2020-01-03 21:08:06 +00:00			`import logging`

Black pass 2021-08-14 21:27:28 +00:00			`log = logging.getLogger("cs")`
Improvements to subdomain collection I use this for tracker identification so it's not perfect but still it's a bit better. 2020-01-03 21:08:06 +00:00			`DRIVER = None`
			`SCROLL_TIME = 10.0`
			`SCROLL_STEPS = 100`
Black pass 2021-08-14 21:27:28 +00:00			`SCROLL_CMD = f"window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})"`
Improvements to subdomain collection I use this for tracker identification so it's not perfect but still it's a bit better. 2020-01-03 21:08:06 +00:00

			`def new_driver() -> seleniumwire.webdriver.browser.Firefox:`
			`profile = selenium.webdriver.FirefoxProfile()`
Black pass 2021-08-14 21:27:28 +00:00			`profile.set_preference("privacy.trackingprotection.enabled", False)`
			`profile.set_preference("network.cookie.cookieBehavior", 0)`
			`profile.set_preference("privacy.trackingprotection.pbmode.enabled", False)`
			`profile.set_preference("privacy.trackingprotection.cryptomining.enabled", False)`
			`profile.set_preference("privacy.trackingprotection.fingerprinting.enabled", False)`
Improvements to subdomain collection I use this for tracker identification so it's not perfect but still it's a bit better. 2020-01-03 21:08:06 +00:00			`options = selenium.webdriver.firefox.options.Options()`
			`# options.add_argument('-headless')`
Black pass 2021-08-14 21:27:28 +00:00			`driver = seleniumwire.webdriver.Firefox(`
			`profile, executable_path="geckodriver", options=options`
			`)`
Improvements to subdomain collection I use this for tracker identification so it's not perfect but still it's a bit better. 2020-01-03 21:08:06 +00:00			`return driver`
Initial commit 2019-11-10 17:14:25 +00:00

			`def subdomain_from_url(url: str) -> str:`
			`"""`
			`Extract the domain part from an url.`
			`"""`
			`parsed = urllib.parse.urlparse(url)`
			`return parsed.netloc`


			`def collect_subdomains(url: str) -> typing.Iterable[str]:`
			`"""`
			`Load an URL into an headless browser and return all the domains`
			`it tried to access.`
			`"""`
Improvements to subdomain collection I use this for tracker identification so it's not perfect but still it's a bit better. 2020-01-03 21:08:06 +00:00			`global DRIVER`
			`if not DRIVER:`
			`DRIVER = new_driver()`
Initial commit 2019-11-10 17:14:25 +00:00
Improvements to subdomain collection I use this for tracker identification so it's not perfect but still it's a bit better. 2020-01-03 21:08:06 +00:00			`try:`
			`DRIVER.get(url)`
			`for s in range(SCROLL_STEPS):`
			`DRIVER.execute_script(SCROLL_CMD)`
Black pass 2021-08-14 21:27:28 +00:00			`time.sleep(SCROLL_TIME / SCROLL_STEPS)`
Improvements to subdomain collection I use this for tracker identification so it's not perfect but still it's a bit better. 2020-01-03 21:08:06 +00:00			`for request in DRIVER.requests:`
			`if request.response:`
			`yield subdomain_from_url(request.path)`
Fix (most) mypy / flake8 errors 2021-08-14 21:35:51 +00:00			`except Exception:`
Improvements to subdomain collection I use this for tracker identification so it's not perfect but still it's a bit better. 2020-01-03 21:08:06 +00:00			`log.exception("Error")`
			`DRIVER.quit()`
			`DRIVER = None`
Initial commit 2019-11-10 17:14:25 +00:00

Added progressbar and ETA 2019-11-10 20:59:06 +00:00			`def collect_subdomains_standalone(url: str) -> None:`
			`url = url.strip()`
			`if not url:`
			`return`
Improvements to subdomain collection I use this for tracker identification so it's not perfect but still it's a bit better. 2020-01-03 21:08:06 +00:00			`for subdomain in collect_subdomains(url):`
			`print(subdomain)`
Added progressbar and ETA 2019-11-10 20:59:06 +00:00

Black pass 2021-08-14 21:27:28 +00:00			`if __name__ == "__main__":`
Added progressbar and ETA 2019-11-10 20:59:06 +00:00			`assert len(sys.argv) <= 2`
			`filename = None`
Black pass 2021-08-14 21:27:28 +00:00			`if len(sys.argv) == 2 and sys.argv[1] != "-":`
Added progressbar and ETA 2019-11-10 20:59:06 +00:00			`filename = sys.argv[1]`
			`num_lines = sum(1 for line in open(filename))`
			`iterator = progressbar.progressbar(open(filename), max_value=num_lines)`
			`else:`
			`iterator = sys.stdin`

			`for line in iterator:`
			`collect_subdomains_standalone(line)`

Improvements to subdomain collection I use this for tracker identification so it's not perfect but still it's a bit better. 2020-01-03 21:08:06 +00:00			`if DRIVER:`
			`DRIVER.quit()`

Added progressbar and ETA 2019-11-10 20:59:06 +00:00			`if filename:`
			`iterator.close()`