Generates a host list of first-party trackers for ad-blocking.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

97 lines
2.6 KiB

2 years ago
2 years ago
2 years ago
2 years ago
4 months ago
4 months ago
4 months ago
4 months ago
2 years ago
2 years ago
4 months ago
2 years ago
4 months ago
4 months ago
  1. #!/usr/bin/env python3
  2. # pylint: disable=C0103
  3. """
  4. From a list of URLs, output the subdomains
  5. accessed by the websites.
  6. """
  7. import sys
  8. import typing
  9. import urllib.parse
  10. import time
  11. import progressbar
  12. import selenium.webdriver.firefox.options
  13. import seleniumwire.webdriver
  14. import logging
  15. log = logging.getLogger("cs")
  16. DRIVER = None
  17. SCROLL_TIME = 10.0
  18. SCROLL_STEPS = 100
  19. SCROLL_CMD = f"window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})"
  20. def new_driver() -> seleniumwire.webdriver.browser.Firefox:
  21. profile = selenium.webdriver.FirefoxProfile()
  22. profile.set_preference("privacy.trackingprotection.enabled", False)
  23. profile.set_preference("network.cookie.cookieBehavior", 0)
  24. profile.set_preference("privacy.trackingprotection.pbmode.enabled", False)
  25. profile.set_preference("privacy.trackingprotection.cryptomining.enabled", False)
  26. profile.set_preference("privacy.trackingprotection.fingerprinting.enabled", False)
  27. options = selenium.webdriver.firefox.options.Options()
  28. # options.add_argument('-headless')
  29. driver = seleniumwire.webdriver.Firefox(
  30. profile, executable_path="geckodriver", options=options
  31. )
  32. return driver
  33. def subdomain_from_url(url: str) -> str:
  34. """
  35. Extract the domain part from an url.
  36. """
  37. parsed = urllib.parse.urlparse(url)
  38. return parsed.netloc
  39. def collect_subdomains(url: str) -> typing.Iterable[str]:
  40. """
  41. Load an URL into an headless browser and return all the domains
  42. it tried to access.
  43. """
  44. global DRIVER
  45. if not DRIVER:
  46. DRIVER = new_driver()
  47. try:
  48. DRIVER.get(url)
  49. for s in range(SCROLL_STEPS):
  50. DRIVER.execute_script(SCROLL_CMD)
  51. time.sleep(SCROLL_TIME / SCROLL_STEPS)
  52. for request in DRIVER.requests:
  53. if request.response:
  54. yield subdomain_from_url(request.path)
  55. except Exception:
  56. log.exception("Error")
  57. DRIVER.quit()
  58. DRIVER = None
  59. def collect_subdomains_standalone(url: str) -> None:
  60. url = url.strip()
  61. if not url:
  62. return
  63. for subdomain in collect_subdomains(url):
  64. print(subdomain)
  65. if __name__ == "__main__":
  66. assert len(sys.argv) <= 2
  67. filename = None
  68. if len(sys.argv) == 2 and sys.argv[1] != "-":
  69. filename = sys.argv[1]
  70. num_lines = sum(1 for line in open(filename))
  71. iterator = progressbar.progressbar(open(filename), max_value=num_lines)
  72. else:
  73. iterator = sys.stdin
  74. for line in iterator:
  75. collect_subdomains_standalone(line)
  76. if DRIVER:
  77. DRIVER.quit()
  78. if filename:
  79. iterator.close()