#!/usr/bin/env python3
# pylint: disable=C0103
"""
From a list of URLs, output the subdomains
accessed by the websites.
"""

import logging
import sys
import time
import typing
import urllib.parse

import progressbar
import selenium.webdriver.firefox.options
import seleniumwire.webdriver

log = logging.getLogger('cs')
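
# A single WebDriver instance is shared across URLs and is only recreated
# after a failure (see collect_subdomains). Each page is scrolled in
# SCROLL_STEPS increments spread over SCROLL_TIME seconds so that content
# fetched lazily on scroll also shows up in the captured requests.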
DRIVER = None
SCROLL_TIME = 10.0
SCROLL_STEPS = 100
SCROLL_CMD = f'window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})'


def new_driver() -> seleniumwire.webdriver.browser.Firefox:
    """
    Create a Firefox driver with tracking protection disabled, so that
    requests to trackers and fingerprinting scripts are not blocked.
    """
    profile = selenium.webdriver.FirefoxProfile()
    profile.set_preference('privacy.trackingprotection.enabled', False)
    profile.set_preference('network.cookie.cookieBehavior', 0)  # accept all cookies
    profile.set_preference('privacy.trackingprotection.pbmode.enabled', False)
    profile.set_preference(
        'privacy.trackingprotection.cryptomining.enabled', False)
    profile.set_preference(
        'privacy.trackingprotection.fingerprinting.enabled', False)
    options = selenium.webdriver.firefox.options.Options()
    # options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(
        profile, executable_path='geckodriver', options=options)
    return driver
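
# new_driver() expects the geckodriver binary to be reachable as 'geckodriver'
# on $PATH (executable_path='geckodriver'); selenium-wire wraps the driver so
# that every HTTP request the browser makes is recorded on driver.requests.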


def subdomain_from_url(url: str) -> str:
    """
    Extract the domain part from a URL.
    """
    parsed = urllib.parse.urlparse(url)
    return parsed.netloc
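
# For example, subdomain_from_url('https://cdn.example.com/app.js') returns
# 'cdn.example.com'; note that urlparse's netloc keeps an explicit port if
# there is one (e.g. 'example.com:8080').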


def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load a URL into a headless browser and yield all the domains
    it tried to access.
    """
    global DRIVER
    if not DRIVER:
        DRIVER = new_driver()

    try:
        DRIVER.get(url)
        # Scroll through the page step by step so that content loaded on
        # scroll also triggers its requests.
        for _ in range(SCROLL_STEPS):
            DRIVER.execute_script(SCROLL_CMD)
            time.sleep(SCROLL_TIME / SCROLL_STEPS)
        for request in DRIVER.requests:
            if request.response:
                yield subdomain_from_url(request.path)
    except Exception:
        log.exception("Error")
        DRIVER.quit()
        DRIVER = None
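
# On any failure the shared driver is quit and discarded; the next call to
# collect_subdomains() creates a fresh one, so a single bad URL does not
# abort the rest of the batch.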


def collect_subdomains_standalone(url: str) -> None:
    """
    Print the domains contacted while loading a single URL, one per line.
    """
    url = url.strip()
    if not url:
        return
    for subdomain in collect_subdomains(url):
        print(subdomain)
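
# Typical invocations (the script filename below is only illustrative):
#   ./collect_subdomains.py urls.txt        # read URLs from a file, with a progress bar
#   cat urls.txt | ./collect_subdomains.py  # read URLs from stdin ('-' also selects stdin)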


if __name__ == '__main__':
    assert len(sys.argv) <= 2
    filename = None
    if len(sys.argv) == 2 and sys.argv[1] != '-':
        filename = sys.argv[1]
        # Count the URLs up front so the progress bar knows its max_value.
        with open(filename) as url_file:
            num_lines = sum(1 for _ in url_file)
        iterator = progressbar.progressbar(open(filename), max_value=num_lines)
    else:
        iterator = sys.stdin

    for line in iterator:
        collect_subdomains_standalone(line)

    if DRIVER:
        DRIVER.quit()

    if filename:
        iterator.close()