Improvements to subdomain collection

I use this for tracker identification, so it's not perfect, but it's still
a bit better.
This commit is contained in:
Geoffrey Frogeye 2020-01-03 22:08:06 +01:00
parent 2b97ee4cb9
commit 808e36dde3
Signed by: geoffrey
GPG key ID: D8A7ECA00A8CD3DD
2 changed files with 44 additions and 16 deletions

View file

@ -151,7 +151,7 @@ If you want to force re-importing, run `rm last_updates/rapid7_*.txt`.
### Export the lists ### Export the lists
For the tracking list, use `./export_lists.sh`, the output will be in the `dist` forlder (please change the links before distributing them). For the tracking list, use `./export_lists.sh`, the output will be in the `dist` folder (please change the links before distributing them).
For other purposes, tinker with the `./export.py` program. For other purposes, tinker with the `./export.py` program.
#### Explanations #### Explanations

View file

@ -14,6 +14,29 @@ import time
import progressbar import progressbar
import selenium.webdriver.firefox.options import selenium.webdriver.firefox.options
import seleniumwire.webdriver import seleniumwire.webdriver
import logging
# Module-level logger, registered under the name 'cs'.
log = logging.getLogger('cs')
# Shared browser instance: created on first use by collect_subdomains()
# and set back to None once it has been quit after a failure.
DRIVER = None
# Total sleep time, in seconds, spread over the scrolling of one page.
SCROLL_TIME = 10.0
# Number of scroll increments; SCROLL_TIME is split evenly between them.
SCROLL_STEPS = 100
# JavaScript run at each increment: scroll down by 1/SCROLL_STEPS of the
# document body's scroll height.
SCROLL_CMD = f'window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})'
def new_driver() -> seleniumwire.webdriver.browser.Firefox:
    """
    Build a selenium-wire Firefox driver whose profile has the tracking
    protection preferences switched off and cookie behavior set to 0.
    """
    # Preferences are applied in this order, one set_preference() call each.
    preferences = {
        'privacy.trackingprotection.enabled': False,
        'network.cookie.cookieBehavior': 0,
        'privacy.trackingprotection.pbmode.enabled': False,
        'privacy.trackingprotection.cryptomining.enabled': False,
        'privacy.trackingprotection.fingerprinting.enabled': False,
    }
    firefox_profile = selenium.webdriver.FirefoxProfile()
    for name, value in preferences.items():
        firefox_profile.set_preference(name, value)

    # Headless mode is currently disabled: the '-headless' argument is
    # not added to the options, so the browser window is visible.
    firefox_options = selenium.webdriver.firefox.options.Options()
    return seleniumwire.webdriver.Firefox(
        firefox_profile,
        executable_path='geckodriver',
        options=firefox_options,
    )
def subdomain_from_url(url: str) -> str: def subdomain_from_url(url: str) -> str:
@ -29,28 +52,30 @@ def collect_subdomains(url: str) -> typing.Iterable[str]:
Load an URL into an headless browser and return all the domains Load an URL into an headless browser and return all the domains
it tried to access. it tried to access.
""" """
options = selenium.webdriver.firefox.options.Options() global DRIVER
options.add_argument('-headless') if not DRIVER:
driver = seleniumwire.webdriver.Firefox( DRIVER = new_driver()
executable_path='geckodriver', options=options)
driver.get(url) try:
time.sleep(10) DRIVER.get(url)
for request in driver.requests: for s in range(SCROLL_STEPS):
DRIVER.execute_script(SCROLL_CMD)
time.sleep(SCROLL_TIME/SCROLL_STEPS)
for request in DRIVER.requests:
if request.response: if request.response:
yield subdomain_from_url(request.path) yield subdomain_from_url(request.path)
driver.close() except:
log.exception("Error")
DRIVER.quit()
DRIVER = None
def collect_subdomains_standalone(url: str) -> None: def collect_subdomains_standalone(url: str) -> None:
url = url.strip() url = url.strip()
if not url: if not url:
return return
try:
for subdomain in collect_subdomains(url): for subdomain in collect_subdomains(url):
print(subdomain) print(subdomain)
except:
pass
if __name__ == '__main__': if __name__ == '__main__':
@ -66,5 +91,8 @@ if __name__ == '__main__':
for line in iterator: for line in iterator:
collect_subdomains_standalone(line) collect_subdomains_standalone(line)
if DRIVER:
DRIVER.quit()
if filename: if filename:
iterator.close() iterator.close()