Improvements to subdomain collection

I use this for tracker identification, so it's not perfect, but it's still a bit better.
2020-01-03 22:08:06 +01:00 · 2020-01-03 22:08:06 +01:00 · 808e36dde3
parent 2b97ee4cb9
commit 808e36dde3
2 changed files with 44 additions and 16 deletions
--- a/README.md
+++ b/README.md
@ -151,7 +151,7 @@ If you want to force re-importing, run `rm last_updates/rapid7_*.txt`.

 ### Export the lists

-For the tracking list, use `./export_lists.sh`, the output will be in the `dist` forlder (please change the links before distributing them).
+For the tracking list, use `./export_lists.sh`, the output will be in the `dist` folder (please change the links before distributing them).
 For other purposes, tinker with the `./export.py` program.

 #### Explanations
--- a/collect_subdomains.py
+++ b/collect_subdomains.py
@ -14,6 +14,29 @@ import time
 import progressbar
 import selenium.webdriver.firefox.options
 import seleniumwire.webdriver
+import logging
+
+log = logging.getLogger('cs')
+DRIVER = None
+SCROLL_TIME = 10.0
+SCROLL_STEPS = 100
+SCROLL_CMD = f'window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})'
+
+
+def new_driver() -> seleniumwire.webdriver.browser.Firefox:
+    profile = selenium.webdriver.FirefoxProfile()
+    profile.set_preference('privacy.trackingprotection.enabled', False)
+    profile.set_preference('network.cookie.cookieBehavior', 0)
+    profile.set_preference('privacy.trackingprotection.pbmode.enabled', False)
+    profile.set_preference(
+        'privacy.trackingprotection.cryptomining.enabled', False)
+    profile.set_preference(
+        'privacy.trackingprotection.fingerprinting.enabled', False)
+    options = selenium.webdriver.firefox.options.Options()
+    # options.add_argument('-headless')
+    driver = seleniumwire.webdriver.Firefox(profile,
+                                            executable_path='geckodriver', options=options)
+    return driver


 def subdomain_from_url(url: str) -> str:
@ -29,28 +52,30 @@ def collect_subdomains(url: str) -> typing.Iterable[str]:
    Load an URL into an headless browser and return all the domains
    it tried to access.
    """
-    options = selenium.webdriver.firefox.options.Options()
-    options.add_argument('-headless')
-    driver = seleniumwire.webdriver.Firefox(
-        executable_path='geckodriver', options=options)
+    global DRIVER
+    if not DRIVER:
+        DRIVER = new_driver()

-    driver.get(url)
-    time.sleep(10)
-    for request in driver.requests:
-        if request.response:
-            yield subdomain_from_url(request.path)
-    driver.close()
+    try:
+        DRIVER.get(url)
+        for s in range(SCROLL_STEPS):
+            DRIVER.execute_script(SCROLL_CMD)
+            time.sleep(SCROLL_TIME/SCROLL_STEPS)
+        for request in DRIVER.requests:
+            if request.response:
+                yield subdomain_from_url(request.path)
+    except:
+        log.exception("Error")
+        DRIVER.quit()
+        DRIVER = None


 def collect_subdomains_standalone(url: str) -> None:
    url = url.strip()
    if not url:
        return
-    try:
-        for subdomain in collect_subdomains(url):
-            print(subdomain)
-    except:
-        pass
+    for subdomain in collect_subdomains(url):
+        print(subdomain)


 if __name__ == '__main__':
@ -66,5 +91,8 @@ if __name__ == '__main__':
    for line in iterator:
        collect_subdomains_standalone(line)

+    if DRIVER:
+        DRIVER.quit()
+
    if filename:
        iterator.close()