eulaurarien/collect_subdomains.py

#!/usr/bin/env python3
# pylint: disable=C0103
"""
From a list of URLs, output the subdomains
accessed by the websites.
"""
import logging
import sys
import time
import typing
import urllib.parse

import progressbar
import selenium.webdriver.firefox.options
import seleniumwire.webdriver

log = logging.getLogger("cs")

# A single driver instance is created lazily and reused across URLs;
# it is discarded and recreated if a page load fails.
DRIVER = None
# Scroll for SCROLL_TIME seconds in SCROLL_STEPS increments, to trigger
# lazily-loaded resources.
SCROLL_TIME = 10.0
SCROLL_STEPS = 100
SCROLL_CMD = f"window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})"


def new_driver() -> seleniumwire.webdriver.browser.Firefox:
    """
    Create a Firefox driver (instrumented by selenium-wire) with
    tracking protection disabled, so third-party requests get through.
    """
    profile = selenium.webdriver.FirefoxProfile()
    profile.set_preference("privacy.trackingprotection.enabled", False)
    profile.set_preference("network.cookie.cookieBehavior", 0)
    profile.set_preference("privacy.trackingprotection.pbmode.enabled", False)
    profile.set_preference("privacy.trackingprotection.cryptomining.enabled", False)
    profile.set_preference("privacy.trackingprotection.fingerprinting.enabled", False)
    options = selenium.webdriver.firefox.options.Options()
    # options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(
        profile, executable_path="geckodriver", options=options
    )
    return driver


def subdomain_from_url(url: str) -> str:
    """
    Extract the domain part from a URL.
    """
    parsed = urllib.parse.urlparse(url)
    return parsed.netloc


def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load a URL into a headless browser and return all the domains
    it tried to access.
    """
    global DRIVER
    if not DRIVER:
        DRIVER = new_driver()
    try:
        DRIVER.get(url)
        for _ in range(SCROLL_STEPS):
            DRIVER.execute_script(SCROLL_CMD)
            time.sleep(SCROLL_TIME / SCROLL_STEPS)
        for request in DRIVER.requests:
            if request.response:
                # request.path held the full URL in the selenium-wire
                # releases this script targets; newer releases expose
                # it as request.url instead.
                yield subdomain_from_url(request.path)
    except Exception:
        log.exception("Error")
        DRIVER.quit()
        DRIVER = None


def collect_subdomains_standalone(url: str) -> None:
    """
    Collect the subdomains reached from a single URL and print them,
    one per line.
    """
    url = url.strip()
    if not url:
        return
    for subdomain in collect_subdomains(url):
        print(subdomain)


if __name__ == "__main__":
    assert len(sys.argv) <= 2
    filename = None
    if len(sys.argv) == 2 and sys.argv[1] != "-":
        # Read URLs from the given file, with a progress bar.
        filename = sys.argv[1]
        with open(filename) as count_file:
            num_lines = sum(1 for line in count_file)
        iterator = progressbar.progressbar(open(filename), max_value=num_lines)
    else:
        # Read URLs from standard input.
        iterator = sys.stdin

    for line in iterator:
        collect_subdomains_standalone(line)

    if DRIVER:
        DRIVER.quit()

    if filename:
        iterator.close()
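
# Example invocation (a sketch; "websites.list" is a hypothetical file
# containing one URL per line):
#   ./collect_subdomains.py websites.list > subdomains.list
#   cat websites.list | ./collect_subdomains.py -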