Generates a host list of first-party trackers for ad-blocking.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

280 lines
8.2 KiB

4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
4 months ago
  1. #!/usr/bin/env python3
  2. import argparse
  3. import database
  4. import logging
  5. import sys
  6. import typing
  7. import multiprocessing
  8. import time
# A fully-parsed DNS record, ready to be executed against the database:
# (select function, write function, updated timestamp, name, value).
Record = typing.Tuple[typing.Callable, typing.Callable, int, str, str]

# Per record type: (select, write) pair of unbound database.Database methods.
# `select` is called as select(db, value) and yields sources; `write` is
# called as write(db, name, updated, source=source) for each source
# (see Writer.exec_record below).
FUNCTION_MAP: typing.Any = {
    "a": (
        database.Database.get_ip4,
        database.Database.set_hostname,
    ),
    "cname": (
        database.Database.get_domain,
        database.Database.set_hostname,
    ),
    "ptr": (
        database.Database.get_domain,
        database.Database.set_ip4address,
    ),
}
  25. class Writer(multiprocessing.Process):
  26. def __init__(
  27. self,
  28. recs_queue: multiprocessing.Queue = None,
  29. autosave_interval: int = 0,
  30. ip4_cache: int = 0,
  31. ):
  32. if recs_queue: # MP
  33. super(Writer, self).__init__()
  34. self.recs_queue = recs_queue
  35. self.log = logging.getLogger("wr")
  36. self.autosave_interval = autosave_interval
  37. self.ip4_cache = ip4_cache
  38. if not recs_queue: # No MP
  39. self.open_db()
  40. def open_db(self) -> None:
  41. self.db = database.Database()
  42. self.db.log = logging.getLogger("wr")
  43. self.db.fill_ip4cache(max_size=self.ip4_cache)
  44. def exec_record(self, record: Record) -> None:
  45. self.db.enter_step("exec_record")
  46. select, write, updated, name, value = record
  47. try:
  48. for source in select(self.db, value):
  49. write(self.db, name, updated, source=source)
  50. except (ValueError, IndexError):
  51. # ValueError: non-number in IP
  52. # IndexError: IP too big
  53. self.log.exception("Cannot execute: %s", record)
  54. def end(self) -> None:
  55. self.db.enter_step("end")
  56. self.db.save()
  57. def run(self) -> None:
  58. self.open_db()
  59. if self.autosave_interval > 0:
  60. next_save = time.time() + self.autosave_interval
  61. else:
  62. next_save = 0
  63. self.db.enter_step("block_wait")
  64. block: typing.List[Record]
  65. for block in iter(self.recs_queue.get, None):
  66. assert block
  67. record: Record
  68. for record in block:
  69. self.exec_record(record)
  70. if next_save > 0 and time.time() > next_save:
  71. self.log.info("Saving database...")
  72. self.db.save()
  73. self.log.info("Done!")
  74. next_save = time.time() + self.autosave_interval
  75. self.db.enter_step("block_wait")
  76. self.end()
  77. class Parser:
  78. def __init__(
  79. self,
  80. buf: typing.Any,
  81. recs_queue: multiprocessing.Queue = None,
  82. block_size: int = 0,
  83. writer: Writer = None,
  84. ):
  85. assert bool(writer) ^ bool(block_size and recs_queue)
  86. self.buf = buf
  87. self.log = logging.getLogger("pr")
  88. self.recs_queue = recs_queue
  89. if writer: # No MP
  90. self.prof: database.Profiler = writer.db
  91. self.register = writer.exec_record
  92. else: # MP
  93. self.block: typing.List[Record] = list()
  94. self.block_size = block_size
  95. self.prof = database.Profiler()
  96. self.prof.log = logging.getLogger("pr")
  97. self.register = self.add_to_queue
  98. def add_to_queue(self, record: Record) -> None:
  99. self.prof.enter_step("register")
  100. self.block.append(record)
  101. if len(self.block) >= self.block_size:
  102. self.prof.enter_step("put_block")
  103. assert self.recs_queue
  104. self.recs_queue.put(self.block)
  105. self.block = list()
  106. def run(self) -> None:
  107. self.consume()
  108. if self.recs_queue:
  109. self.recs_queue.put(self.block)
  110. self.prof.profile()
  111. def consume(self) -> None:
  112. raise NotImplementedError
  113. class Rapid7Parser(Parser):
  114. def consume(self) -> None:
  115. data = dict()
  116. for line in self.buf:
  117. self.prof.enter_step("parse_rapid7")
  118. split = line.split('"')
  119. try:
  120. for k in range(1, 14, 4):
  121. key = split[k]
  122. val = split[k + 2]
  123. data[key] = val
  124. select, writer = FUNCTION_MAP[data["type"]]
  125. record = (
  126. select,
  127. writer,
  128. int(data["timestamp"]),
  129. data["name"],
  130. data["value"],
  131. )
  132. except (IndexError, KeyError):
  133. # IndexError: missing field
  134. # KeyError: Unknown type field
  135. self.log.exception("Cannot parse: %s", line)
  136. self.register(record)
  137. class MassDnsParser(Parser):
  138. # massdns --output Snrql
  139. # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
  140. TYPES = {
  141. "A": (FUNCTION_MAP["a"][0], FUNCTION_MAP["a"][1], -1, None),
  142. # 'AAAA': (FUNCTION_MAP['aaaa'][0], FUNCTION_MAP['aaaa'][1], -1, None),
  143. "CNAME": (FUNCTION_MAP["cname"][0], FUNCTION_MAP["cname"][1], -1, -1),
  144. }
  145. def consume(self) -> None:
  146. self.prof.enter_step("parse_massdns")
  147. timestamp = 0
  148. header = True
  149. for line in self.buf:
  150. line = line[:-1]
  151. if not line:
  152. header = True
  153. continue
  154. split = line.split(" ")
  155. try:
  156. if header:
  157. timestamp = int(split[1])
  158. header = False
  159. else:
  160. select, write, name_offset, value_offset = MassDnsParser.TYPES[
  161. split[1]
  162. ]
  163. record = (
  164. select,
  165. write,
  166. timestamp,
  167. split[0][:name_offset].lower(),
  168. split[2][:value_offset].lower(),
  169. )
  170. self.register(record)
  171. self.prof.enter_step("parse_massdns")
  172. except KeyError:
  173. continue
# Input formats selectable via the positional `parser` CLI argument.
PARSERS = {
    "rapid7": Rapid7Parser,
    "massdns": MassDnsParser,
}
  178. if __name__ == "__main__":
  179. # Parsing arguments
  180. log = logging.getLogger("feed_dns")
  181. args_parser = argparse.ArgumentParser(
  182. description="Read DNS records and import "
  183. "tracking-relevant data into the database"
  184. )
  185. args_parser.add_argument("parser", choices=PARSERS.keys(), help="Input format")
  186. args_parser.add_argument(
  187. "-i",
  188. "--input",
  189. type=argparse.FileType("r"),
  190. default=sys.stdin,
  191. help="Input file",
  192. )
  193. args_parser.add_argument(
  194. "-b", "--block-size", type=int, default=1024, help="Performance tuning value"
  195. )
  196. args_parser.add_argument(
  197. "-q", "--queue-size", type=int, default=128, help="Performance tuning value"
  198. )
  199. args_parser.add_argument(
  200. "-a",
  201. "--autosave-interval",
  202. type=int,
  203. default=900,
  204. help="Interval to which the database will save in seconds. " "0 to disable.",
  205. )
  206. args_parser.add_argument(
  207. "-s",
  208. "--single-process",
  209. action="store_true",
  210. help="Only use one process. " "Might be useful for single core computers.",
  211. )
  212. args_parser.add_argument(
  213. "-4",
  214. "--ip4-cache",
  215. type=int,
  216. default=0,
  217. help="RAM cache for faster IPv4 lookup. "
  218. "Maximum useful value: 512 MiB (536870912). "
  219. "Warning: Depending on the rules, this might already "
  220. "be a memory-heavy process, even without the cache.",
  221. )
  222. args = args_parser.parse_args()
  223. parser_cls = PARSERS[args.parser]
  224. if args.single_process:
  225. writer = Writer(
  226. autosave_interval=args.autosave_interval, ip4_cache=args.ip4_cache
  227. )
  228. parser = parser_cls(args.input, writer=writer)
  229. parser.run()
  230. writer.end()
  231. else:
  232. recs_queue: multiprocessing.Queue = multiprocessing.Queue(
  233. maxsize=args.queue_size
  234. )
  235. writer = Writer(
  236. recs_queue,
  237. autosave_interval=args.autosave_interval,
  238. ip4_cache=args.ip4_cache,
  239. )
  240. writer.start()
  241. parser = parser_cls(
  242. args.input, recs_queue=recs_queue, block_size=args.block_size
  243. )
  244. parser.run()
  245. recs_queue.put(None)
  246. writer.join()