Generates a host list of first-party trackers for ad-blocking.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

47 lines
1.1 KiB

#!/usr/bin/env python3
"""
From a list of URLs, output the subdomains
accessed by the websites.
"""
import sys
import typing
import urllib.parse
import selenium.webdriver.firefox.options
import seleniumwire.webdriver
def subdomain_from_url(url: str) -> str:
    """
    Extract the host name from a URL.

    Unlike ``netloc``, the ``hostname`` attribute leaves out any
    ``user:password@`` prefix and ``:port`` suffix and is lower-cased,
    so the same host always yields the same string.

    :param url: URL to parse.
    :return: Host name, or an empty string when the URL has none
        (for example a relative path).
    """
    parsed = urllib.parse.urlparse(url)
    # hostname is None when the URL has no network location; fall back
    # to an empty string so the declared str return type holds.
    return parsed.hostname or ''
def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load a URL into a headless Firefox and yield, for every captured
    request that has a response, the result of ``subdomain_from_url``.

    This is a generator: the browser is started on the first iteration
    and is shut down when iteration ends, when the generator is closed
    early, or when an error is raised while loading the page.

    :param url: Page to load.
    :return: One entry per answered request; duplicates are not removed.
    """
    options = selenium.webdriver.firefox.options.Options()
    options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(
        executable_path='geckodriver', options=options)
    try:
        driver.get(url)
        for request in driver.requests:
            # Skip requests for which no response was captured.
            if request.response:
                # NOTE(review): assumes request.path holds the full URL
                # in the installed selenium-wire version; newer releases
                # may expose that as request.url instead -- confirm.
                yield subdomain_from_url(request.path)
    finally:
        # Always end the browser session, even if loading failed or the
        # caller stopped iterating, so no browser/driver is left behind.
        driver.quit()
if __name__ == '__main__':
    # Each non-blank line on standard input is treated as one URL to visit.
    for raw_line in sys.stdin:
        target = raw_line.strip()
        if target:
            # Print one result per line, in the order they are yielded.
            for host in collect_subdomains(target):
                print(host)