Generates a host list of first-party trackers for ad-blocking.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

215 lines
6.2 KiB

  1. #!/usr/bin/env python3
  2. import argparse
  3. import database
  4. import logging
  5. import sys
  6. import typing
  7. import multiprocessing
  8. import time
  9. Record = typing.Tuple[typing.Callable, typing.Callable, int, str, str]
  10. # select, write
  11. FUNCTION_MAP: typing.Any = {
  12. 'a': (
  13. database.Database.get_ip4,
  14. database.Database.set_hostname,
  15. ),
  16. 'cname': (
  17. database.Database.get_domain,
  18. database.Database.set_hostname,
  19. ),
  20. 'ptr': (
  21. database.Database.get_domain,
  22. database.Database.set_ip4address,
  23. ),
  24. }
  25. class Writer(multiprocessing.Process):
  26. def __init__(self,
  27. recs_queue: multiprocessing.Queue,
  28. autosave_interval: int = 0):
  29. super(Writer, self).__init__()
  30. self.log = logging.getLogger(f'wr')
  31. self.recs_queue = recs_queue
  32. self.autosave_interval = autosave_interval
  33. def run(self) -> None:
  34. self.db = database.Database()
  35. self.db.log = logging.getLogger(f'wr')
  36. if self.autosave_interval > 0:
  37. next_save = time.time() + self.autosave_interval
  38. else:
  39. next_save = 0
  40. self.db.enter_step('block_wait')
  41. block: typing.List[Record]
  42. for block in iter(self.recs_queue.get, None):
  43. record: Record
  44. for record in block:
  45. select, write, updated, name, value = record
  46. self.db.enter_step('feed_switch')
  47. try:
  48. for source in select(self.db, value):
  49. write(self.db, name, updated, source=source)
  50. except ValueError:
  51. self.log.exception("Cannot execute: %s", record)
  52. if next_save > 0 and time.time() > next_save:
  53. self.log.info("Saving database...")
  54. self.db.save()
  55. self.log.info("Done!")
  56. next_save = time.time() + self.autosave_interval
  57. self.db.enter_step('block_wait')
  58. self.db.enter_step('end')
  59. self.db.save()
  60. class Parser():
  61. def __init__(self,
  62. buf: typing.Any,
  63. recs_queue: multiprocessing.Queue,
  64. block_size: int,
  65. ):
  66. super(Parser, self).__init__()
  67. self.buf = buf
  68. self.log = logging.getLogger('pr')
  69. self.recs_queue = recs_queue
  70. self.block: typing.List[Record] = list()
  71. self.block_size = block_size
  72. self.prof = database.Profiler()
  73. self.prof.log = logging.getLogger('pr')
  74. def register(self, record: Record) -> None:
  75. self.prof.enter_step('register')
  76. self.block.append(record)
  77. if len(self.block) >= self.block_size:
  78. self.prof.enter_step('put_block')
  79. self.recs_queue.put(self.block)
  80. self.block = list()
  81. def run(self) -> None:
  82. self.consume()
  83. self.recs_queue.put(self.block)
  84. self.prof.profile()
  85. def consume(self) -> None:
  86. raise NotImplementedError
  87. class Rapid7Parser(Parser):
  88. def consume(self) -> None:
  89. data = dict()
  90. for line in self.buf:
  91. self.prof.enter_step('parse_rapid7')
  92. split = line.split('"')
  93. try:
  94. for k in range(1, 14, 4):
  95. key = split[k]
  96. val = split[k+2]
  97. data[key] = val
  98. select, writer = FUNCTION_MAP[data['type']]
  99. record = (
  100. select,
  101. writer,
  102. int(data['timestamp']),
  103. data['name'],
  104. data['value']
  105. )
  106. except IndexError:
  107. self.log.exception("Cannot parse: %s", line)
  108. self.register(record)
  109. class MassDnsParser(Parser):
  110. # massdns --output Snrql
  111. # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
  112. TYPES = {
  113. 'A': (FUNCTION_MAP['a'][0], FUNCTION_MAP['a'][1], -1, None),
  114. # 'AAAA': (FUNCTION_MAP['aaaa'][0], FUNCTION_MAP['aaaa'][1], -1, None),
  115. 'CNAME': (FUNCTION_MAP['cname'][0], FUNCTION_MAP['cname'][1], -1, -1),
  116. }
  117. def consume(self) -> None:
  118. self.prof.enter_step('parse_massdns')
  119. timestamp = 0
  120. header = True
  121. for line in self.buf:
  122. line = line[:-1]
  123. if not line:
  124. header = True
  125. continue
  126. split = line.split(' ')
  127. try:
  128. if header:
  129. timestamp = int(split[1])
  130. header = False
  131. else:
  132. select, write, name_offset, value_offset = \
  133. MassDnsParser.TYPES[split[1]]
  134. record = (
  135. select,
  136. write,
  137. timestamp,
  138. split[0][:name_offset],
  139. split[2][:value_offset],
  140. )
  141. self.register(record)
  142. self.prof.enter_step('parse_massdns')
  143. except KeyError:
  144. continue
  145. PARSERS = {
  146. 'rapid7': Rapid7Parser,
  147. 'massdns': MassDnsParser,
  148. }
  149. if __name__ == '__main__':
  150. # Parsing arguments
  151. log = logging.getLogger('feed_dns')
  152. args_parser = argparse.ArgumentParser(
  153. description="TODO")
  154. args_parser.add_argument(
  155. 'parser',
  156. choices=PARSERS.keys(),
  157. help="TODO")
  158. args_parser.add_argument(
  159. '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
  160. help="TODO")
  161. args_parser.add_argument(
  162. '-j', '--workers', type=int, default=4,
  163. help="TODO")
  164. args_parser.add_argument(
  165. '-b', '--block-size', type=int, default=1024,
  166. help="TODO")
  167. args_parser.add_argument(
  168. '-q', '--queue-size', type=int, default=128,
  169. help="TODO")
  170. args_parser.add_argument(
  171. '-a', '--autosave-interval', type=int, default=900,
  172. help="TODO seconds")
  173. args = args_parser.parse_args()
  174. recs_queue: multiprocessing.Queue = multiprocessing.Queue(
  175. maxsize=args.queue_size)
  176. writer = Writer(recs_queue, autosave_interval=args.autosave_interval)
  177. writer.start()
  178. parser = PARSERS[args.parser](args.input, recs_queue, args.block_size)
  179. parser.run()
  180. recs_queue.put(None)
  181. writer.join()