Improvements to subdomain collection
I use this for tracker identification, so it's not perfect, but it's still a bit better.
This commit is contained in:
parent
2b97ee4cb9
commit
808e36dde3
|
@ -151,7 +151,7 @@ If you want to force re-importing, run `rm last_updates/rapid7_*.txt`.
|
||||||
|
|
||||||
### Export the lists
|
### Export the lists
|
||||||
|
|
||||||
For the tracking list, use `./export_lists.sh`, the output will be in the `dist` forlder (please change the links before distributing them).
|
For the tracking list, use `./export_lists.sh`, the output will be in the `dist` folder (please change the links before distributing them).
|
||||||
For other purposes, tinker with the `./export.py` program.
|
For other purposes, tinker with the `./export.py` program.
|
||||||
|
|
||||||
#### Explanations
|
#### Explanations
|
||||||
|
|
|
@ -14,6 +14,29 @@ import time
|
||||||
import progressbar
|
import progressbar
|
||||||
import selenium.webdriver.firefox.options
|
import selenium.webdriver.firefox.options
|
||||||
import seleniumwire.webdriver
|
import seleniumwire.webdriver
|
||||||
|
import logging
|
||||||
|
|
||||||
|
log = logging.getLogger('cs')
|
||||||
|
DRIVER = None
|
||||||
|
SCROLL_TIME = 10.0
|
||||||
|
SCROLL_STEPS = 100
|
||||||
|
SCROLL_CMD = f'window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})'
|
||||||
|
|
||||||
|
|
||||||
|
def new_driver() -> seleniumwire.webdriver.browser.Firefox:
    """Build a selenium-wire Firefox driver with tracking protection off.

    A fresh Firefox profile is created and every
    ``privacy.trackingprotection.*`` preference listed below is set to
    ``False``; ``network.cookie.cookieBehavior`` is set to ``0``
    (presumably "accept all cookies" -- confirm against the Firefox
    preference documentation).

    Returns:
        The driver started through the ``geckodriver`` executable with
        that profile and a default set of Firefox options.
    """
    # (preference name, value) pairs, applied in this order.
    preferences = (
        ('privacy.trackingprotection.enabled', False),
        ('network.cookie.cookieBehavior', 0),
        ('privacy.trackingprotection.pbmode.enabled', False),
        ('privacy.trackingprotection.cryptomining.enabled', False),
        ('privacy.trackingprotection.fingerprinting.enabled', False),
    )

    firefox_profile = selenium.webdriver.FirefoxProfile()
    for pref_name, pref_value in preferences:
        firefox_profile.set_preference(pref_name, pref_value)

    # NOTE: the '-headless' argument is not added to the options here.
    firefox_options = selenium.webdriver.firefox.options.Options()

    return seleniumwire.webdriver.Firefox(
        firefox_profile,
        executable_path='geckodriver',
        options=firefox_options,
    )
|
||||||
|
|
||||||
|
|
||||||
def subdomain_from_url(url: str) -> str:
|
def subdomain_from_url(url: str) -> str:
|
||||||
|
@ -29,28 +52,30 @@ def collect_subdomains(url: str) -> typing.Iterable[str]:
|
||||||
Load an URL into an headless browser and return all the domains
|
Load an URL into an headless browser and return all the domains
|
||||||
it tried to access.
|
it tried to access.
|
||||||
"""
|
"""
|
||||||
options = selenium.webdriver.firefox.options.Options()
|
global DRIVER
|
||||||
options.add_argument('-headless')
|
if not DRIVER:
|
||||||
driver = seleniumwire.webdriver.Firefox(
|
DRIVER = new_driver()
|
||||||
executable_path='geckodriver', options=options)
|
|
||||||
|
|
||||||
driver.get(url)
|
try:
|
||||||
time.sleep(10)
|
DRIVER.get(url)
|
||||||
for request in driver.requests:
|
for s in range(SCROLL_STEPS):
|
||||||
|
DRIVER.execute_script(SCROLL_CMD)
|
||||||
|
time.sleep(SCROLL_TIME/SCROLL_STEPS)
|
||||||
|
for request in DRIVER.requests:
|
||||||
if request.response:
|
if request.response:
|
||||||
yield subdomain_from_url(request.path)
|
yield subdomain_from_url(request.path)
|
||||||
driver.close()
|
except:
|
||||||
|
log.exception("Error")
|
||||||
|
DRIVER.quit()
|
||||||
|
DRIVER = None
|
||||||
|
|
||||||
|
|
||||||
def collect_subdomains_standalone(url: str) -> None:
|
def collect_subdomains_standalone(url: str) -> None:
|
||||||
url = url.strip()
|
url = url.strip()
|
||||||
if not url:
|
if not url:
|
||||||
return
|
return
|
||||||
try:
|
|
||||||
for subdomain in collect_subdomains(url):
|
for subdomain in collect_subdomains(url):
|
||||||
print(subdomain)
|
print(subdomain)
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -66,5 +91,8 @@ if __name__ == '__main__':
|
||||||
for line in iterator:
|
for line in iterator:
|
||||||
collect_subdomains_standalone(line)
|
collect_subdomains_standalone(line)
|
||||||
|
|
||||||
|
if DRIVER:
|
||||||
|
DRIVER.quit()
|
||||||
|
|
||||||
if filename:
|
if filename:
|
||||||
iterator.close()
|
iterator.close()
|
||||||
|
|
Loading…
Reference in a new issue