Generates a host list of first-party trackers for ad-blocking.
You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
#!/usr/bin/env python3 # pylint: disable=C0103
"""
From a list of URLs, output the subdomains accessed by the websites. """
import sys import typing import urllib.parse import time
import progressbar import selenium.webdriver.firefox.options import seleniumwire.webdriver import logging
# Module logger, registered under the name "cs".
log = logging.getLogger("cs")
# Shared browser instance; created on first use by collect_subdomains() and
# reset to None there after an error.
DRIVER = None
# Divided by SCROLL_STEPS to get the sleep (in seconds) after each scroll
# step, i.e. the total time spent sleeping while scrolling one page.
SCROLL_TIME = 10.0
# Number of scroll steps performed on each page.
SCROLL_STEPS = 100
# JavaScript run at every step: scrolls down by 1/SCROLL_STEPS of the document
# body height.
SCROLL_CMD = f"window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})"
def new_driver() -> seleniumwire.webdriver.browser.Firefox:
    """
    Build a selenium-wire Firefox driver whose profile has the
    tracking-protection preferences switched off.

    The driver is started through the "geckodriver" executable.
    """
    # Preferences are applied in this order.
    # NOTE(review): cookieBehavior 0 presumably means "accept all cookies" --
    # confirm against the Firefox preference documentation.
    preferences = (
        ("privacy.trackingprotection.enabled", False),
        ("network.cookie.cookieBehavior", 0),
        ("privacy.trackingprotection.pbmode.enabled", False),
        ("privacy.trackingprotection.cryptomining.enabled", False),
        ("privacy.trackingprotection.fingerprinting.enabled", False),
    )
    firefox_profile = selenium.webdriver.FirefoxProfile()
    for pref_name, pref_value in preferences:
        firefox_profile.set_preference(pref_name, pref_value)

    # Headless mode is currently disabled: options.add_argument('-headless')
    firefox_options = selenium.webdriver.firefox.options.Options()
    return seleniumwire.webdriver.Firefox(
        firefox_profile, executable_path="geckodriver", options=firefox_options
    )
def subdomain_from_url(url: str) -> str:
    """
    Return the network-location part of a URL.

    This is the raw ``netloc`` component: it keeps any port or user-info
    present in the URL and is empty when the URL has no ``//`` authority part.
    """
    return urllib.parse.urlsplit(url).netloc
def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load a URL into the shared browser and yield the network location of
    every request the page made that received a response.

    The browser (module-level DRIVER) is created on first use and reused
    across calls. After loading, the page is scrolled down in SCROLL_STEPS
    increments, sleeping SCROLL_TIME seconds in total across the steps.

    Any exception raised while driving the browser is logged, then the
    browser is quit and DRIVER is reset so the next call starts a new one;
    the generator simply ends in that case.

    This is a generator: nothing happens until it is iterated.
    """
    global DRIVER
    if not DRIVER:
        DRIVER = new_driver()

    try:
        # selenium-wire keeps every captured request until explicitly
        # cleared. Since the driver is reused, drop the requests of previously
        # visited pages so they are not reported again for this URL.
        del DRIVER.requests
        DRIVER.get(url)
        for _ in range(SCROLL_STEPS):
            DRIVER.execute_script(SCROLL_CMD)
            time.sleep(SCROLL_TIME / SCROLL_STEPS)
        for request in DRIVER.requests:
            # Requests without a response are skipped.
            if request.response:
                # NOTE(review): relies on request.path holding the full URL;
                # confirm for the installed selenium-wire version.
                yield subdomain_from_url(request.path)
    except Exception:
        # Best effort: log, then discard the browser so the next URL gets a
        # fresh one.
        log.exception("Error")
        DRIVER.quit()
        DRIVER = None
def collect_subdomains_standalone(url: str) -> None:
    """
    Print, one per line, every subdomain collected for a single URL.

    Surrounding whitespace is stripped from the URL first; a blank URL is
    ignored without touching the browser.
    """
    target = url.strip()
    if target:
        for host in collect_subdomains(target):
            print(host)
if __name__ == "__main__":
    # Usage: script [FILE]. With no argument, or "-", URLs are read from
    # standard input, one per line.
    if len(sys.argv) > 2:
        # Explicit check instead of assert, which is stripped under -O.
        sys.exit(f"Usage: {sys.argv[0]} [FILE|-]")
    filename = None
    if len(sys.argv) == 2 and sys.argv[1] != "-":
        filename = sys.argv[1]

    try:
        if filename:
            # First pass only counts lines so the progress bar knows its
            # maximum; the context manager closes that handle right away.
            with open(filename) as counted_file:
                num_lines = sum(1 for _ in counted_file)
            # Second pass does the work; the file is closed even on error.
            with open(filename) as url_file:
                for line in progressbar.progressbar(url_file, max_value=num_lines):
                    collect_subdomains_standalone(line)
        else:
            for line in sys.stdin:
                collect_subdomains_standalone(line)
    finally:
        # Always shut the shared browser down, including on error or Ctrl-C.
        if DRIVER:
            DRIVER.quit()
|