eulaurarien/collect_subdomains.py

#!/usr/bin/env python3

"""
From a list of URLs, output the subdomains
accessed by the websites.
"""

import sys
import typing
import urllib.parse

import selenium.webdriver.firefox.options
import seleniumwire.webdriver


def subdomain_from_url(url: str) -> str:
    """
    Return the network-location (``netloc``) component of an URL.

    The value is taken as-is from ``urllib.parse.urlparse``, so it is an
    empty string when the URL has no ``//authority`` part, and it keeps
    any port or user-info that was present in the URL.
    """
    return urllib.parse.urlparse(url).netloc


def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load an URL into an headless browser and yield all the domains
    it tried to access.

    A headless Firefox instance is started through selenium-wire, the
    page is loaded, and for every captured request that received a
    response the domain part of its URL is yielded (duplicates are not
    filtered out).

    This is a generator: the browser is only started once iteration
    begins, and it is shut down when the generator finishes, raises,
    or is closed/garbage-collected before being exhausted.
    """
    options = selenium.webdriver.firefox.options.Options()
    options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(
        executable_path='geckodriver', options=options)

    try:
        driver.get(url)
        for request in driver.requests:
            # Requests without a response never reached a server
            # (or were still pending), so they are skipped.
            if request.response:
                # NOTE(review): this relies on request.path holding the
                # full URL; newer selenium-wire releases appear to expose
                # that as request.url -- confirm against the version in use.
                yield subdomain_from_url(request.path)
    finally:
        # quit() (rather than close()) ends the whole WebDriver session,
        # and the finally clause guarantees this also happens when loading
        # fails or the consumer stops iterating early.
        driver.quit()


if __name__ == '__main__':
    # Treat every non-blank line of standard input as an URL to visit,
    # and print each domain contacted while loading it, one per line.
    for raw_line in sys.stdin:
        target = raw_line.strip()
        if target:
            for domain in collect_subdomains(target):
                print(domain)
Initial commit 2019-11-10 17:14:25 +00:00			`#!/usr/bin/env python3`

			`"""`
			`From a list of URLs, output the subdomains`
			`accessed by the websites.`
			`"""`

			`import sys`
			`import typing`
			`import urllib.parse`

			`import selenium.webdriver.firefox.options`
			`import seleniumwire.webdriver`


			`def subdomain_from_url(url: str) -> str:`
			`"""`
			`Extract the domain part from an url.`
			`"""`
			`parsed = urllib.parse.urlparse(url)`
			`return parsed.netloc`


			`def collect_subdomains(url: str) -> typing.Iterable[str]:`
			`"""`
			`Load an URL into an headless browser and return all the domains`
			`it tried to access.`
			`"""`
			`options = selenium.webdriver.firefox.options.Options()`
			`options.add_argument('-headless')`
			`driver = seleniumwire.webdriver.Firefox(`
			`executable_path='geckodriver', options=options)`

			`driver.get(url)`
			`for request in driver.requests:`
			`if request.response:`
			`yield subdomain_from_url(request.path)`
			`driver.close()`


			`if __name__ == '__main__':`
			`for line in sys.stdin:`
			`line = line.strip()`
			`if not line:`
			`continue`
			`for subdomain in collect_subdomains(line):`
			`print(subdomain)`