Generates a host list of first-party trackers for ad-blocking.

#!/usr/bin/env python3
# pylint: disable=C0103

"""
From a list of URLs, output the subdomains
accessed by the websites.
"""

import logging
import sys
import time
import typing
import urllib.parse

import progressbar
import selenium.webdriver.firefox.options
import seleniumwire.webdriver

log = logging.getLogger('cs')

DRIVER = None
SCROLL_TIME = 10.0
SCROLL_STEPS = 100
SCROLL_CMD = f'window.scrollBy(0,document.body.scrollHeight/{SCROLL_STEPS})'
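
# The page is scrolled in SCROLL_STEPS increments spread over SCROLL_TIME
# seconds, presumably so that lazily loaded resources (ads, trackers, images
# below the fold) get a chance to fire their network requests before the
# captured traffic is read back.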


def new_driver() -> seleniumwire.webdriver.browser.Firefox:
    """
    Create a Firefox driver with every tracking protection disabled,
    so that tracker requests are actually made and can be captured.
    """
    profile = selenium.webdriver.FirefoxProfile()
    profile.set_preference('privacy.trackingprotection.enabled', False)
    profile.set_preference('network.cookie.cookieBehavior', 0)
    profile.set_preference('privacy.trackingprotection.pbmode.enabled', False)
    profile.set_preference(
        'privacy.trackingprotection.cryptomining.enabled', False)
    profile.set_preference(
        'privacy.trackingprotection.fingerprinting.enabled', False)
    options = selenium.webdriver.firefox.options.Options()
    # options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(
        profile, executable_path='geckodriver', options=options)
    return driver
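
# Note: the '-headless' argument is left commented out above, so Firefox runs
# with a visible window while crawling; uncommenting it would run the browser
# headless, as the docstring of collect_subdomains() suggests.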


def subdomain_from_url(url: str) -> str:
    """
    Extract the domain part from a URL.
    """
    parsed = urllib.parse.urlparse(url)
    return parsed.netloc
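
# Example with a hypothetical URL:
#   subdomain_from_url('https://cdn.example.com/t.js') -> 'cdn.example.com'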


def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load a URL into a headless browser and return all the domains
    it tried to access.
    """
    global DRIVER
    if not DRIVER:
        DRIVER = new_driver()
    try:
        DRIVER.get(url)
        for _ in range(SCROLL_STEPS):
            DRIVER.execute_script(SCROLL_CMD)
            time.sleep(SCROLL_TIME / SCROLL_STEPS)
        for request in DRIVER.requests:
            if request.response:
                yield subdomain_from_url(request.path)
    except Exception:
        log.exception("Error")
        # On any failure, discard the browser; a fresh one is created
        # on the next call.
        DRIVER.quit()
        DRIVER = None
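
# `DRIVER.requests` is selenium-wire's list of captured network requests; in
# the selenium-wire versions this script appears to target, `request.path`
# holds the full request URL (newer releases expose it as `request.url`).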


def collect_subdomains_standalone(url: str) -> None:
    """
    Print the subdomains accessed by a single URL, one per line.
    """
    url = url.strip()
    if not url:
        return
    for subdomain in collect_subdomains(url):
        print(subdomain)


if __name__ == '__main__':
    assert len(sys.argv) <= 2
    filename = None
    if len(sys.argv) == 2 and sys.argv[1] != '-':
        # Read URLs from a file, with a progress bar sized to its line count.
        filename = sys.argv[1]
        with open(filename) as counted:
            num_lines = sum(1 for line in counted)
        iterator = progressbar.progressbar(open(filename), max_value=num_lines)
    else:
        # No argument (or '-'): read URLs from standard input.
        iterator = sys.stdin
    for line in iterator:
        collect_subdomains_standalone(line)
    if DRIVER:
        DRIVER.quit()
    if filename:
        iterator.close()
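
A minimal way to run the script, assuming it is saved as collect_subdomains.py,
that geckodriver is on the PATH, and that websites.txt (a hypothetical file
name) holds one URL per line:

    python3 collect_subdomains.py websites.txt > subdomains.txt
    cat websites.txt | python3 collect_subdomains.py -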