Generates a host list of first-party trackers for ad-blocking.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

160 lines
4.5 KiB

2 years ago
2 years ago
2 years ago
2 years ago
  1. #!/usr/bin/env python3
  2. # pylint: disable=C0103
  3. """
  4. From a list of subdomains, output only
  5. the ones resolving to a first-party tracker.
  6. """
  7. import argparse
  8. import sys
  9. import progressbar
  10. import csv
  11. import typing
  12. import ipaddress
  13. # DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']]
  14. DomainRule = typing.Union[bool, typing.Dict]
  15. # IpRule = typing.Union[bool, typing.Dict[int, 'DomainRule']]
  16. IpRule = typing.Union[bool, typing.Dict]
  17. RULES_DICT: DomainRule = dict()
  18. RULES_IP_DICT: IpRule = dict()
  19. def get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]:
  20. for char in address.packed:
  21. for i in range(7, -1, -1):
  22. yield (char >> i) & 0b1
  23. def subdomain_matching(subdomain: str) -> bool:
  24. parts = subdomain.split('.')
  25. parts.reverse()
  26. dic = RULES_DICT
  27. for part in parts:
  28. if isinstance(dic, bool) or part not in dic:
  29. break
  30. dic = dic[part]
  31. if isinstance(dic, bool):
  32. return dic
  33. return False
  34. def ip_matching(ip_str: str) -> bool:
  35. ip = ipaddress.ip_address(ip_str)
  36. dic = RULES_IP_DICT
  37. i = 0
  38. for bit in get_bits(ip):
  39. i += 1
  40. if isinstance(dic, bool) or bit not in dic:
  41. break
  42. dic = dic[bit]
  43. if isinstance(dic, bool):
  44. return dic
  45. return False
  46. def get_matching(chain: typing.List[str], no_explicit: bool = False
  47. ) -> typing.Iterable[str]:
  48. if len(chain) <= 1:
  49. return
  50. initial = chain[0]
  51. cname_destinations = chain[1:-1]
  52. a_destination = chain[-1]
  53. initial_matching = subdomain_matching(initial)
  54. if no_explicit and initial_matching:
  55. return
  56. cname_matching = any(map(subdomain_matching, cname_destinations))
  57. if cname_matching or initial_matching or ip_matching(a_destination):
  58. yield initial
  59. def register_rule(subdomain: str) -> None:
  60. # Make a tree with domain parts
  61. parts = subdomain.split('.')
  62. parts.reverse()
  63. dic = RULES_DICT
  64. last_part = len(parts) - 1
  65. for p, part in enumerate(parts):
  66. if isinstance(dic, bool):
  67. return
  68. if p == last_part:
  69. dic[part] = True
  70. else:
  71. dic.setdefault(part, dict())
  72. dic = dic[part]
  73. def register_rule_ip(network: str) -> None:
  74. net = ipaddress.ip_network(network)
  75. ip = net.network_address
  76. dic = RULES_IP_DICT
  77. last_bit = net.prefixlen - 1
  78. for b, bit in enumerate(get_bits(ip)):
  79. if isinstance(dic, bool):
  80. return
  81. if b == last_bit:
  82. dic[bit] = True
  83. else:
  84. dic.setdefault(bit, dict())
  85. dic = dic[bit]
  86. if __name__ == '__main__':
  87. # Parsing arguments
  88. parser = argparse.ArgumentParser(
  89. description="Filter first-party trackers from a list of subdomains")
  90. parser.add_argument(
  91. '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
  92. help="Input file with DNS chains")
  93. parser.add_argument(
  94. '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
  95. help="Outptut file with one tracking subdomain per line")
  96. parser.add_argument(
  97. '-n', '--no-explicit', action='store_true',
  98. help="Don't output domains already blocked with rules without CNAME")
  99. parser.add_argument(
  100. '-r', '--rules', type=argparse.FileType('r'),
  101. help="List of domains domains to block (with their subdomains)")
  102. parser.add_argument(
  103. '-p', '--rules-ip', type=argparse.FileType('r'),
  104. help="List of IPs ranges to block")
  105. args = parser.parse_args()
  106. # Progress bar
  107. widgets = [
  108. progressbar.Percentage(),
  109. ' ', progressbar.SimpleProgress(),
  110. ' ', progressbar.Bar(),
  111. ' ', progressbar.Timer(),
  112. ' ', progressbar.AdaptiveTransferSpeed(unit='req'),
  113. ' ', progressbar.AdaptiveETA(),
  114. ]
  115. progress = progressbar.ProgressBar(widgets=widgets)
  116. # Reading rules
  117. if args.rules:
  118. for rule in args.rules:
  119. register_rule(rule.strip())
  120. if args.rules_ip:
  121. for rule in args.rules_ip:
  122. register_rule_ip(rule.strip())
  123. # Approximating line count
  124. if args.input.seekable():
  125. lines = 0
  126. for line in args.input:
  127. lines += 1
  128. progress.max_value = lines
  129. args.input.seek(0)
  130. # Reading domains to filter
  131. reader = csv.reader(args.input)
  132. progress.start()
  133. for chain in reader:
  134. for match in get_matching(chain, no_explicit=args.no_explicit):
  135. print(match, file=args.output)
  136. progress.update(progress.value + 1)
  137. progress.finish()