Browse Source

Improvements to subdomain collection

I use this for tracker identification, so it's not perfect, but it's
still a bit better.
master
Geoffrey Frogeye 1 month ago
parent
commit
808e36dde3
Signed by: Geoffrey “Frogeye” Preud'homme <geoffrey@frogeye.fr> GPG Key ID: D8A7ECA00A8CD3DD
2 changed files with 44 additions and 16 deletions
  1. +1
    -1
      README.md
  2. +43
    -15
      collect_subdomains.py

+ 1
- 1
README.md View File

@@ -151,7 +151,7 @@ If you want to force re-importing, run `rm last_updates/rapid7_*.txt`.

### Export the lists

For the tracking list, use `./export_lists.sh`, the output will be in the `dist` forlder (please change the links before distributing them).
For the tracking list, use `./export_lists.sh`, the output will be in the `dist` folder (please change the links before distributing them).
For other purposes, tinker with the `./export.py` program.

#### Explanations

+ 43
- 15
collect_subdomains.py View File

@@ -14,6 +14,29 @@ import time
import progressbar
import selenium.webdriver.firefox.options
import seleniumwire.webdriver
import logging

log = logging.getLogger('cs')
DRIVER = None
SCROLL_TIME = 10.0
SCROLL_STEPS = 100
SCROLL_CMD = f'window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})'


def new_driver() -> seleniumwire.webdriver.browser.Firefox:
    """
    Create a selenium-wire Firefox driver using a fresh profile in which
    the tracking-protection preferences are switched off and
    `network.cookie.cookieBehavior` is set to 0.
    """
    # Applied to the profile in this order, before the browser is started.
    preferences = (
        ('privacy.trackingprotection.enabled', False),
        ('network.cookie.cookieBehavior', 0),
        ('privacy.trackingprotection.pbmode.enabled', False),
        ('privacy.trackingprotection.cryptomining.enabled', False),
        ('privacy.trackingprotection.fingerprinting.enabled', False),
    )

    firefox_profile = selenium.webdriver.FirefoxProfile()
    for pref_name, pref_value in preferences:
        firefox_profile.set_preference(pref_name, pref_value)

    firefox_options = selenium.webdriver.firefox.options.Options()
    # Headless mode is currently left disabled; uncomment to enable it.
    # firefox_options.add_argument('-headless')

    return seleniumwire.webdriver.Firefox(
        firefox_profile,
        executable_path='geckodriver',
        options=firefox_options,
    )


def subdomain_from_url(url: str) -> str:
@@ -29,28 +52,30 @@ def collect_subdomains(url: str) -> typing.Iterable[str]:
Load an URL into an headless browser and return all the domains
it tried to access.
"""
options = selenium.webdriver.firefox.options.Options()
options.add_argument('-headless')
driver = seleniumwire.webdriver.Firefox(
executable_path='geckodriver', options=options)
global DRIVER
if not DRIVER:
DRIVER = new_driver()

driver.get(url)
time.sleep(10)
for request in driver.requests:
if request.response:
yield subdomain_from_url(request.path)
driver.close()
try:
DRIVER.get(url)
for s in range(SCROLL_STEPS):
DRIVER.execute_script(SCROLL_CMD)
time.sleep(SCROLL_TIME/SCROLL_STEPS)
for request in DRIVER.requests:
if request.response:
yield subdomain_from_url(request.path)
except:
log.exception("Error")
DRIVER.quit()
DRIVER = None


def collect_subdomains_standalone(url: str) -> None:
url = url.strip()
if not url:
return
try:
for subdomain in collect_subdomains(url):
print(subdomain)
except:
pass
for subdomain in collect_subdomains(url):
print(subdomain)


if __name__ == '__main__':
@@ -66,5 +91,8 @@ if __name__ == '__main__':
for line in iterator:
collect_subdomains_standalone(line)

if DRIVER:
DRIVER.quit()

if filename:
iterator.close()

Loading…
Cancel
Save