Generates a host list of first-party trackers for ad-blocking.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

106 lines
3.1 KiB

2 years ago
2 years ago
2 years ago
2 years ago
  1. #!/usr/bin/env python3
  2. # pylint: disable=C0103
  3. """
  4. From a list of subdomains, output only
  5. the ones resolving to a first-party tracker.
  6. """
  7. import argparse
  8. import sys
  9. import progressbar
  10. import csv
  11. import typing
  12. # DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']]
  13. DomainRule = typing.Union[bool, typing.Dict]
  14. RULES_DICT: DomainRule = dict()
  15. def subdomain_matching(subdomain: str) -> bool:
  16. parts = subdomain.split('.')
  17. parts.reverse()
  18. dic = RULES_DICT
  19. for part in parts:
  20. if isinstance(dic, bool) or part not in dic:
  21. break
  22. dic = dic[part]
  23. if isinstance(dic, bool):
  24. return dic
  25. return False
  26. def get_matching(chain: typing.List[str], no_explicit: bool = False
  27. ) -> typing.Iterable[str]:
  28. initial = chain[0]
  29. cname_destinations = chain[1:-1]
  30. # a_destination = chain[-1]
  31. initial_matching = subdomain_matching(initial)
  32. if no_explicit and initial_matching:
  33. return
  34. cname_matching = any(map(subdomain_matching, cname_destinations))
  35. if cname_matching or initial_matching:
  36. yield initial
  37. def register_rule(subdomain: str) -> None:
  38. # Make a tree with domain parts
  39. parts = subdomain.split('.')
  40. parts.reverse()
  41. dic = RULES_DICT
  42. last_part = len(parts) - 1
  43. for p, part in enumerate(parts):
  44. if isinstance(dic, bool):
  45. return
  46. if p == last_part:
  47. dic[part] = True
  48. else:
  49. dic.setdefault(part, dict())
  50. dic = dic[part]
  51. if __name__ == '__main__':
  52. # Parsing arguments
  53. parser = argparse.ArgumentParser(
  54. description="Filter first-party trackers from a list of subdomains")
  55. parser.add_argument(
  56. '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
  57. help="Input file with DNS chains")
  58. parser.add_argument(
  59. '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
  60. help="Outptut file with one tracking subdomain per line")
  61. parser.add_argument(
  62. '-n', '--no-explicit', action='store_true',
  63. help="Don't output domains already blocked with rules without CNAME")
  64. parser.add_argument(
  65. '-r', '--rules', type=argparse.FileType('r'), default='rules',
  66. help="Rules file")
  67. args = parser.parse_args()
  68. # Progress bar
  69. widgets = [
  70. progressbar.Percentage(),
  71. ' ', progressbar.SimpleProgress(),
  72. ' ', progressbar.Bar(),
  73. ' ', progressbar.Timer(),
  74. ' ', progressbar.AdaptiveTransferSpeed(unit='req'),
  75. ' ', progressbar.AdaptiveETA(),
  76. ]
  77. progress = progressbar.ProgressBar(widgets=widgets)
  78. # Reading rules
  79. for rule in args.rules:
  80. register_rule(rule.strip())
  81. # Reading domains to filter
  82. if args.input.seekable():
  83. progress.max_value = len(args.input.readlines())
  84. args.input.seek(0)
  85. reader = csv.reader(args.input)
  86. progress.start()
  87. for chain in reader:
  88. for match in get_matching(chain, no_explicit=args.no_explicit):
  89. print(match, file=args.output)
  90. progress.update(progress.value + 1)
  91. progress.finish()