From 808e36dde3ba52a427170fbe37bbedfb8f5b3314 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 3 Jan 2020 22:08:06 +0100 Subject: [PATCH] Improvements to subdomain collection I use this for tracker identification so it's not perfect but still it's a bit better. --- README.md | 2 +- collect_subdomains.py | 58 ++++++++++++++++++++++++++++++++----------- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 3fed9c1..68b9f4d 100644 --- a/README.md +++ b/README.md @@ -151,7 +151,7 @@ If you want to force re-importing, run `rm last_updates/rapid7_*.txt`. ### Export the lists -For the tracking list, use `./export_lists.sh`, the output will be in the `dist` forlder (please change the links before distributing them). +For the tracking list, use `./export_lists.sh`, the output will be in the `dist` folder (please change the links before distributing them). For other purposes, tinker with the `./export.py` program. #### Explanations diff --git a/collect_subdomains.py b/collect_subdomains.py index 4877d19..33879e2 100755 --- a/collect_subdomains.py +++ b/collect_subdomains.py @@ -14,6 +14,29 @@ import time import progressbar import selenium.webdriver.firefox.options import seleniumwire.webdriver +import logging + +log = logging.getLogger('cs') +DRIVER = None +SCROLL_TIME = 10.0 +SCROLL_STEPS = 100 +SCROLL_CMD = f'window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})' + + +def new_driver() -> seleniumwire.webdriver.browser.Firefox: + profile = selenium.webdriver.FirefoxProfile() + profile.set_preference('privacy.trackingprotection.enabled', False) + profile.set_preference('network.cookie.cookieBehavior', 0) + profile.set_preference('privacy.trackingprotection.pbmode.enabled', False) + profile.set_preference( + 'privacy.trackingprotection.cryptomining.enabled', False) + profile.set_preference( + 'privacy.trackingprotection.fingerprinting.enabled', False) + options = selenium.webdriver.firefox.options.Options() + # options.add_argument('-headless') + driver = seleniumwire.webdriver.Firefox(profile, + executable_path='geckodriver', options=options) + return driver def subdomain_from_url(url: str) -> str: @@ -29,28 +52,30 @@ def collect_subdomains(url: str) -> typing.Iterable[str]: Load an URL into an headless browser and return all the domains it tried to access. """ - options = selenium.webdriver.firefox.options.Options() - options.add_argument('-headless') - driver = seleniumwire.webdriver.Firefox( - executable_path='geckodriver', options=options) + global DRIVER + if not DRIVER: + DRIVER = new_driver() - driver.get(url) - time.sleep(10) - for request in driver.requests: - if request.response: - yield subdomain_from_url(request.path) - driver.close() + try: + DRIVER.get(url) + for s in range(SCROLL_STEPS): + DRIVER.execute_script(SCROLL_CMD) + time.sleep(SCROLL_TIME/SCROLL_STEPS) + for request in DRIVER.requests: + if request.response: + yield subdomain_from_url(request.path) + except: + log.exception("Error") + DRIVER.quit() + DRIVER = None def collect_subdomains_standalone(url: str) -> None: url = url.strip() if not url: return - try: - for subdomain in collect_subdomains(url): - print(subdomain) - except: - pass + for subdomain in collect_subdomains(url): + print(subdomain) if __name__ == '__main__': @@ -66,5 +91,8 @@ if __name__ == '__main__': for line in iterator: collect_subdomains_standalone(line) + if DRIVER: + DRIVER.quit() + if filename: iterator.close()