diff --git a/database.py b/database.py index bff6638..6e4ca3a 100644 --- a/database.py +++ b/database.py @@ -79,6 +79,7 @@ class Match(): def __init__(self) -> None: self.source: typing.Optional[Path] = None self.updated: int = 0 + self.dupplicate: bool = False # Cache self.level: int = 0 @@ -148,7 +149,7 @@ class Profiler(): class Database(Profiler): - VERSION = 17 + VERSION = 18 PATH = "blocking.p" def initialize(self) -> None: @@ -411,6 +412,7 @@ class Database(Profiler): def export(self, first_party_only: bool = False, end_chain_only: bool = False, + no_dupplicates: bool = False, explain: bool = False, ) -> typing.Iterable[str]: @@ -423,6 +425,8 @@ class Database(Profiler): return if end_chain_only and match.references > 0: return + if no_dupplicates and match.dupplicate: + return if explain: yield self.explain(path) else: @@ -448,18 +452,19 @@ class Database(Profiler): yield from self.exec_each(list_rules_cb) def count_records(self, - first_party_only: bool = False, - rules_only: bool = False, - ) -> str: + first_party_only: bool = False, + rules_only: bool = False, + no_dupplicates: bool = False, + ) -> str: memo: typing.Dict[str, int] = dict() def count_records_cb(path: Path, match: Match) -> None: if first_party_only and not match.first_party: return - # if isinstance(path, ZonePath) \ - # or (isinstance(path, Ip4Path) and path.prefixlen < 32): if rules_only and match.level > 1: return + if no_dupplicates and match.dupplicate: + return try: memo[path.__class__.__name__] += 1 except KeyError: @@ -518,6 +523,7 @@ class Database(Profiler): updated: int, source: Path, source_match: Match = None, + dupplicate: bool = False, ) -> None: # source_match is in parameters because most of the time # its parent function needs it too, @@ -536,6 +542,7 @@ class Database(Profiler): match.first_party = source_match.first_party match.source = source source_match.references += 1 + match.dupplicate = dupplicate def _set_domain(self, hostname: bool, @@ -549,13 +556,13 @@ class 
Database(Profiler): is_first_party = source_match.first_party self.enter_step('set_domain_brws') dic = self.domtree + dupplicate = False for part in domain.parts: if part not in dic.children: dic.children[part] = DomainTreeNode() dic = dic.children[part] if dic.match_zone.active(is_first_party): - # Refuse to add domain whose zone is already matching - return + dupplicate = True if hostname: match = dic.match_hostname else: @@ -565,6 +572,7 @@ class Database(Profiler): updated, source, source_match=source_match, + dupplicate=dupplicate, ) def set_hostname(self, @@ -603,6 +611,7 @@ class Database(Profiler): is_first_party = source_match.first_party self.enter_step('set_ip4_brws') dic = self.ip4tree + dupplicate = False for i in range(31, 31-ip4.prefixlen, -1): bit = (ip4.value >> i) & 0b1 next_dic = dic.one if bit else dic.zero @@ -614,13 +623,13 @@ class Database(Profiler): dic.zero = next_dic dic = next_dic if dic.active(is_first_party): - # Refuse to add ip4* whose network is already matching - return + dupplicate = True self._set_match( dic, updated, source, source_match=source_match, + dupplicate=dupplicate, ) def set_ip4address(self, diff --git a/export.py b/export.py index 91f7193..8befd77 100755 --- a/export.py +++ b/export.py @@ -25,6 +25,9 @@ if __name__ == '__main__': parser.add_argument( '-r', '--rules', action='store_true', help="TODO") + parser.add_argument( + '-d', '--no-dupplicates', action='store_true', + help="TODO") parser.add_argument( '-c', '--count', action='store_true', help="TODO") @@ -35,7 +38,9 @@ if __name__ == '__main__': if args.count: print(DB.count_records( first_party_only=args.first_party, - rules_only=args.rules)) + rules_only=args.rules, + no_dupplicates=args.no_dupplicates, + )) else: if args.rules: for line in DB.list_rules(): @@ -43,6 +48,7 @@ if __name__ == '__main__': for domain in DB.export( first_party_only=args.first_party, end_chain_only=args.end_chain, + no_dupplicates=args.no_dupplicates, explain=args.explain, ): 
print(domain, file=args.output) diff --git a/export_lists.sh b/export_lists.sh index 20a34cb..7ef8156 100755 --- a/export_lists.sh +++ b/export_lists.sh @@ -6,9 +6,9 @@ function log() { log "Exporting lists…" ./export.py --first-party --output dist/firstparty-trackers.txt -./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt +./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt ./export.py --output dist/multiparty-trackers.txt -./export.py --end-chain --output dist/multiparty-only-trackers.txt +./export.py --end-chain --no-dupplicates --output dist/multiparty-only-trackers.txt log "Generating statistics…" ./export.py --count --first-party > temp/count_recs_firstparty.txt diff --git a/feed_asn.py b/feed_asn.py index aa28dfe..6acfba7 100755 --- a/feed_asn.py +++ b/feed_asn.py @@ -48,7 +48,8 @@ if __name__ == '__main__': assert isinstance(match, database.AsnNode) asn_str = database.Database.unpack_asn(path) DB.enter_step('asn_get_name') - match.name = get_name(asn_str) + name = get_name(asn_str) + match.name = name DB.enter_step('asn_get_ranges') for prefix in get_ranges(asn_str): parsed_prefix: IPNetwork = ipaddress.ip_network(prefix) @@ -58,7 +59,7 @@ if __name__ == '__main__': source=path, updated=int(time.time()) ) - log.info('Added %s from %s (%s)', prefix, asn_str, path) + log.info('Added %s from %s (%s)', prefix, path, name) elif parsed_prefix.version == 6: log.warning('Unimplemented prefix version: %s', prefix) else: diff --git a/feed_dns.old.py b/feed_dns.old.py deleted file mode 100755 index b106968..0000000 --- a/feed_dns.old.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import database -import logging -import sys -import typing -import enum - -RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR') -Record = typing.Tuple[RecordType, int, str, str] - -# select, write -FUNCTION_MAP: typing.Any = { - RecordType.A: ( - database.Database.get_ip4, -
database.Database.set_hostname, - ), - RecordType.CNAME: ( - database.Database.get_domain, - database.Database.set_hostname, - ), - RecordType.PTR: ( - database.Database.get_domain, - database.Database.set_ip4address, - ), -} - - -class Parser(): - def __init__(self, buf: typing.Any) -> None: - self.buf = buf - self.log = logging.getLogger('parser') - self.db = database.Database() - - def end(self) -> None: - self.db.save() - - def register(self, - rtype: RecordType, - updated: int, - name: str, - value: str - ) -> None: - - self.db.enter_step('register') - select, write = FUNCTION_MAP[rtype] - for source in select(self.db, value): - # write(self.db, name, updated, source=source) - write(self.db, name, updated) - - def consume(self) -> None: - raise NotImplementedError - - -class Rapid7Parser(Parser): - TYPES = { - 'a': RecordType.A, - 'aaaa': RecordType.AAAA, - 'cname': RecordType.CNAME, - 'ptr': RecordType.PTR, - } - - def consume(self) -> None: - data = dict() - for line in self.buf: - self.db.enter_step('parse_rapid7') - split = line.split('"') - - for k in range(1, 14, 4): - key = split[k] - val = split[k+2] - data[key] = val - - self.register( - Rapid7Parser.TYPES[data['type']], - int(data['timestamp']), - data['name'], - data['value'] - ) - - -class DnsMassParser(Parser): - # dnsmass --output Snrql - # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4 - TYPES = { - 'A': (RecordType.A, -1, None), - 'AAAA': (RecordType.AAAA, -1, None), - 'CNAME': (RecordType.CNAME, -1, -1), - } - - def consume(self) -> None: - self.db.enter_step('parse_dnsmass') - timestamp = 0 - header = True - for line in self.buf: - line = line[:-1] - if not line: - header = True - continue - - split = line.split(' ') - try: - if header: - timestamp = int(split[1]) - header = False - else: - dtype, name_offset, value_offset = \ - DnsMassParser.TYPES[split[1]] - self.register( - dtype, - timestamp, - split[0][:name_offset], - split[2][:value_offset], - ) - 
self.db.enter_step('parse_dnsmass') - except KeyError: - continue - - -PARSERS = { - 'rapid7': Rapid7Parser, - 'dnsmass': DnsMassParser, -} - -if __name__ == '__main__': - - # Parsing arguments - log = logging.getLogger('feed_dns') - args_parser = argparse.ArgumentParser( - description="TODO") - args_parser.add_argument( - 'parser', - choices=PARSERS.keys(), - help="TODO") - args_parser.add_argument( - '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, - help="TODO") - args = args_parser.parse_args() - - parser = PARSERS[args.parser](args.input) - try: - parser.consume() - except KeyboardInterrupt: - pass - parser.end() - diff --git a/feed_dns.py b/feed_dns.py index 43df1fd..58529fd 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -181,10 +181,10 @@ if __name__ == '__main__': '-j', '--workers', type=int, default=4, help="TODO") args_parser.add_argument( - '-b', '--block-size', type=int, default=100, + '-b', '--block-size', type=int, default=1024, help="TODO") args_parser.add_argument( - '-q', '--queue-size', type=int, default=10, + '-q', '--queue-size', type=int, default=128, help="TODO") args = args_parser.parse_args()