Save dupplicates too

Maybe I won't publish them, but this will help me track trackers.
newworkflow
Geoffrey Frogeye 2019-12-17 14:09:06 +01:00
parent ea0855bd00
commit d65107f849
Signed by: geoffrey
GPG Key ID: D8A7ECA00A8CD3DD
6 changed files with 33 additions and 164 deletions

View File

@ -79,6 +79,7 @@ class Match():
def __init__(self) -> None:
self.source: typing.Optional[Path] = None
self.updated: int = 0
self.dupplicate: bool = False
# Cache
self.level: int = 0
@ -148,7 +149,7 @@ class Profiler():
class Database(Profiler):
VERSION = 17
VERSION = 18
PATH = "blocking.p"
def initialize(self) -> None:
@ -411,6 +412,7 @@ class Database(Profiler):
def export(self,
first_party_only: bool = False,
end_chain_only: bool = False,
no_dupplicates: bool = False,
explain: bool = False,
) -> typing.Iterable[str]:
@ -423,6 +425,8 @@ class Database(Profiler):
return
if end_chain_only and match.references > 0:
return
if no_dupplicates and match.dupplicate:
return
if explain:
yield self.explain(path)
else:
@ -448,18 +452,19 @@ class Database(Profiler):
yield from self.exec_each(list_rules_cb)
def count_records(self,
first_party_only: bool = False,
rules_only: bool = False,
) -> str:
first_party_only: bool = False,
rules_only: bool = False,
no_dupplicates: bool = False,
) -> str:
memo: typing.Dict[str, int] = dict()
def count_records_cb(path: Path, match: Match) -> None:
if first_party_only and not match.first_party:
return
# if isinstance(path, ZonePath) \
# or (isinstance(path, Ip4Path) and path.prefixlen < 32):
if rules_only and match.level > 1:
return
if no_dupplicates and match.dupplicate:
return
try:
memo[path.__class__.__name__] += 1
except KeyError:
@ -518,6 +523,7 @@ class Database(Profiler):
updated: int,
source: Path,
source_match: Match = None,
dupplicate: bool = False,
) -> None:
# source_match is in parameters because most of the time
# its parent function needs it too,
@ -536,6 +542,7 @@ class Database(Profiler):
match.first_party = source_match.first_party
match.source = source
source_match.references += 1
match.dupplicate = dupplicate
def _set_domain(self,
hostname: bool,
@ -549,13 +556,13 @@ class Database(Profiler):
is_first_party = source_match.first_party
self.enter_step('set_domain_brws')
dic = self.domtree
dupplicate = False
for part in domain.parts:
if part not in dic.children:
dic.children[part] = DomainTreeNode()
dic = dic.children[part]
if dic.match_zone.active(is_first_party):
# Refuse to add domain whose zone is already matching
return
dupplicate = True
if hostname:
match = dic.match_hostname
else:
@ -565,6 +572,7 @@ class Database(Profiler):
updated,
source,
source_match=source_match,
dupplicate=dupplicate,
)
def set_hostname(self,
@ -603,6 +611,7 @@ class Database(Profiler):
is_first_party = source_match.first_party
self.enter_step('set_ip4_brws')
dic = self.ip4tree
dupplicate = False
for i in range(31, 31-ip4.prefixlen, -1):
bit = (ip4.value >> i) & 0b1
next_dic = dic.one if bit else dic.zero
@ -614,13 +623,13 @@ class Database(Profiler):
dic.zero = next_dic
dic = next_dic
if dic.active(is_first_party):
# Refuse to add ip4* whose network is already matching
return
dupplicate = True
self._set_match(
dic,
updated,
source,
source_match=source_match,
dupplicate=dupplicate,
)
def set_ip4address(self,

View File

@ -25,6 +25,9 @@ if __name__ == '__main__':
parser.add_argument(
'-r', '--rules', action='store_true',
help="TODO")
parser.add_argument(
'-d', '--no-dupplicates', action='store_true',
help="TODO")
parser.add_argument(
'-c', '--count', action='store_true',
help="TODO")
@ -35,7 +38,9 @@ if __name__ == '__main__':
if args.count:
print(DB.count_records(
first_party_only=args.first_party,
rules_only=args.rules))
rules_only=args.rules,
no_dupplicates=args.no_dupplicates,
))
else:
if args.rules:
for line in DB.list_rules():
@ -43,6 +48,7 @@ if __name__ == '__main__':
for domain in DB.export(
first_party_only=args.first_party,
end_chain_only=args.end_chain,
no_dupplicates=args.no_dupplicates,
explain=args.explain,
):
print(domain, file=args.output)

View File

@ -6,9 +6,9 @@ function log() {
log "Exporting lists…"
./export.py --first-party --output dist/firstparty-trackers.txt
./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt
./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt
./export.py --output dist/multiparty-trackers.txt
./export.py --end-chain --output dist/multiparty-only-trackers.txt
./export.py --end-chain --no-dupplicates --output dist/multiparty-only-trackers.txt
log "Generating statistics…"
./export.py --count --first-party > temp/count_recs_firstparty.txt

View File

@ -48,7 +48,8 @@ if __name__ == '__main__':
assert isinstance(match, database.AsnNode)
asn_str = database.Database.unpack_asn(path)
DB.enter_step('asn_get_name')
match.name = get_name(asn_str)
name = get_name(asn_str)
match.name = name
DB.enter_step('asn_get_ranges')
for prefix in get_ranges(asn_str):
parsed_prefix: IPNetwork = ipaddress.ip_network(prefix)
@ -58,7 +59,7 @@ if __name__ == '__main__':
source=path,
updated=int(time.time())
)
log.info('Added %s from %s (%s)', prefix, asn_str, path)
log.info('Added %s from %s (%s)', prefix, path, name)
elif parsed_prefix.version == 6:
log.warning('Unimplemented prefix version: %s', prefix)
else:

View File

@ -1,147 +0,0 @@
#!/usr/bin/env python3
import argparse
import database
import logging
import sys
import typing
import enum
# DNS record types accepted from the input feeds.
RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR')
# A parsed feed entry: (record type, timestamp, name, value).
Record = typing.Tuple[RecordType, int, str, str]

# Maps a record type to a (select, write) pair of Database methods:
# `select` yields matching sources for the record's value,
# `write` stores the record's name against the database.
# NOTE(review): AAAA has no entry here, so FUNCTION_MAP[RecordType.AAAA]
# raises KeyError — confirm AAAA records are meant to be dropped.
FUNCTION_MAP: typing.Any = {
    RecordType.A: (
        database.Database.get_ip4,
        database.Database.set_hostname,
    ),
    RecordType.CNAME: (
        database.Database.get_domain,
        database.Database.set_hostname,
    ),
    RecordType.PTR: (
        database.Database.get_domain,
        database.Database.set_ip4address,
    ),
}
class Parser():
    """Base class for DNS feed parsers.

    Subclasses implement consume() to read records from self.buf and
    push each one through register(), which stores matching entries
    into the blocking database.
    """

    def __init__(self, buf: typing.Any) -> None:
        self.log = logging.getLogger('parser')
        self.buf = buf
        self.db = database.Database()

    def end(self) -> None:
        """Persist the database once the whole feed has been consumed."""
        self.db.save()

    def register(self,
                 rtype: RecordType,
                 updated: int,
                 name: str,
                 value: str
                 ) -> None:
        """Store `name` for every database source matching `value`.

        The (select, write) method pair is looked up by record type;
        see FUNCTION_MAP.
        """
        db = self.db
        db.enter_step('register')
        select, write = FUNCTION_MAP[rtype]
        for _source in select(db, value):
            # NOTE(review): the matched source is currently discarded;
            # a `source=` keyword was used here at some point.
            write(db, name, updated)

    def consume(self) -> None:
        """Read and register every record from self.buf (abstract)."""
        raise NotImplementedError
class Rapid7Parser(Parser):
    """Parser for Rapid7 forward/reverse DNS JSON-lines dumps.

    Uses a fast quote-split instead of a JSON parser: keys sit at
    tokens 1, 5, 9, 13 and each value two tokens after its key.
    """

    # Feed 'type' field -> internal record type.
    TYPES = {
        'a': RecordType.A,
        'aaaa': RecordType.AAAA,
        'cname': RecordType.CNAME,
        'ptr': RecordType.PTR,
    }

    def consume(self) -> None:
        fields = dict()
        for raw in self.buf:
            self.db.enter_step('parse_rapid7')
            tokens = raw.split('"')
            # range(1, 14, 4) in the original: key positions 1, 5, 9, 13.
            for pos in (1, 5, 9, 13):
                fields[tokens[pos]] = tokens[pos + 2]
            self.register(
                Rapid7Parser.TYPES[fields['type']],
                int(fields['timestamp']),
                fields['name'],
                fields['value'],
            )
class DnsMassParser(Parser):
    """Parser for dnsmass output."""
    # dnsmass --output Snrql
    # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
    # Record-type token -> (record type, name slice end, value slice end);
    # the slice ends trim trailing punctuation from the raw tokens.
    TYPES = {
        'A': (RecordType.A, -1, None),
        'AAAA': (RecordType.AAAA, -1, None),
        'CNAME': (RecordType.CNAME, -1, -1),
    }

    def consume(self) -> None:
        # Blank lines separate blocks; the first line after a blank
        # (the header) carries the timestamp applied to the block's
        # records. Unknown record types fall into the KeyError handler
        # and are skipped.
        self.db.enter_step('parse_dnsmass')
        timestamp = 0
        header = True
        for line in self.buf:
            line = line[:-1]  # strip the trailing newline
            if not line:
                header = True
                continue
            split = line.split(' ')
            try:
                if header:
                    timestamp = int(split[1])
                    header = False
                else:
                    dtype, name_offset, value_offset = \
                        DnsMassParser.TYPES[split[1]]
                    self.register(
                        dtype,
                        timestamp,
                        split[0][:name_offset],
                        split[2][:value_offset],
                    )
                    self.db.enter_step('parse_dnsmass')
            except KeyError:
                # Not a known record type (or malformed line): skip it.
                continue
# Command-line parser name -> parser implementation.
PARSERS = {
    'rapid7': Rapid7Parser,
    'dnsmass': DnsMassParser,
}

if __name__ == '__main__':
    # Entry point: pick a parser, feed it the input stream, and
    # flush the database when done (even on Ctrl-C).
    log = logging.getLogger('feed_dns')

    argparser = argparse.ArgumentParser(
        description="TODO")
    argparser.add_argument(
        'parser',
        choices=PARSERS.keys(),
        help="TODO")
    argparser.add_argument(
        '-i', '--input',
        type=argparse.FileType('r'), default=sys.stdin,
        help="TODO")
    args = argparser.parse_args()

    feed = PARSERS[args.parser](args.input)
    try:
        feed.consume()
    except KeyboardInterrupt:
        # Partial runs are still worth saving.
        pass
    feed.end()

View File

@ -181,10 +181,10 @@ if __name__ == '__main__':
'-j', '--workers', type=int, default=4,
help="TODO")
args_parser.add_argument(
'-b', '--block-size', type=int, default=100,
'-b', '--block-size', type=int, default=1024,
help="TODO")
args_parser.add_argument(
'-q', '--queue-size', type=int, default=10,
'-q', '--queue-size', type=int, default=128,
help="TODO")
args = args_parser.parse_args()