Save duplicates too

Maybe I won't publish them, but this will help me track trackers.

parent ea0855bd00
commit d65107f849

database.py (29 changes)
@@ -79,6 +79,7 @@ class Match():
     def __init__(self) -> None:
         self.source: typing.Optional[Path] = None
         self.updated: int = 0
+        self.dupplicate: bool = False
 
         # Cache
         self.level: int = 0
@@ -148,7 +149,7 @@ class Profiler():
 
 
 class Database(Profiler):
-    VERSION = 17
+    VERSION = 18
     PATH = "blocking.p"
 
     def initialize(self) -> None:
@@ -411,6 +412,7 @@ class Database(Profiler):
     def export(self,
                first_party_only: bool = False,
                end_chain_only: bool = False,
+               no_dupplicates: bool = False,
                explain: bool = False,
                ) -> typing.Iterable[str]:
 
@@ -423,6 +425,8 @@ class Database(Profiler):
                 return
             if end_chain_only and match.references > 0:
                 return
+            if no_dupplicates and match.dupplicate:
+                return
             if explain:
                 yield self.explain(path)
             else:
@@ -448,18 +452,19 @@ class Database(Profiler):
         yield from self.exec_each(list_rules_cb)
 
     def count_records(self,
                       first_party_only: bool = False,
                       rules_only: bool = False,
-                      ) -> str:
+                      no_dupplicates: bool = False,
+                      ) -> str:
         memo: typing.Dict[str, int] = dict()
 
         def count_records_cb(path: Path, match: Match) -> None:
             if first_party_only and not match.first_party:
                 return
-            # if isinstance(path, ZonePath) \
-            #         or (isinstance(path, Ip4Path) and path.prefixlen < 32):
             if rules_only and match.level > 1:
                 return
+            if no_dupplicates and match.dupplicate:
+                return
             try:
                 memo[path.__class__.__name__] += 1
             except KeyError:
@@ -518,6 +523,7 @@ class Database(Profiler):
                    updated: int,
                    source: Path,
                    source_match: Match = None,
+                   dupplicate: bool = False,
                    ) -> None:
         # source_match is in parameters because most of the time
         # its parent function needs it too,
@@ -536,6 +542,7 @@ class Database(Profiler):
             match.first_party = source_match.first_party
             match.source = source
             source_match.references += 1
+        match.dupplicate = dupplicate
 
     def _set_domain(self,
                     hostname: bool,
@@ -549,13 +556,13 @@ class Database(Profiler):
         is_first_party = source_match.first_party
         self.enter_step('set_domain_brws')
         dic = self.domtree
+        dupplicate = False
         for part in domain.parts:
             if part not in dic.children:
                 dic.children[part] = DomainTreeNode()
             dic = dic.children[part]
             if dic.match_zone.active(is_first_party):
-                # Refuse to add domain whose zone is already matching
-                return
+                dupplicate = True
         if hostname:
             match = dic.match_hostname
         else:
@@ -565,6 +572,7 @@ class Database(Profiler):
             updated,
             source,
             source_match=source_match,
+            dupplicate=dupplicate,
         )
 
     def set_hostname(self,
@@ -603,6 +611,7 @@ class Database(Profiler):
         is_first_party = source_match.first_party
         self.enter_step('set_ip4_brws')
         dic = self.ip4tree
+        dupplicate = False
         for i in range(31, 31-ip4.prefixlen, -1):
             bit = (ip4.value >> i) & 0b1
             next_dic = dic.one if bit else dic.zero
@@ -614,13 +623,13 @@ class Database(Profiler):
                     dic.zero = next_dic
             dic = next_dic
             if dic.active(is_first_party):
-                # Refuse to add ip4* whose network is already matching
-                return
+                dupplicate = True
         self._set_match(
             dic,
             updated,
             source,
             source_match=source_match,
+            dupplicate=dupplicate,
        )
 
     def set_ip4address(self,
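
The database.py changes above stop silently dropping a domain or IPv4 entry whose parent zone or network already matches; the entry is now stored with `dupplicate = True`, and `export()` / `count_records()` can filter it out on demand. Below is a minimal, self-contained sketch of that idea; `Node`, `set_domain` and `matched` are illustrative stand-ins, not the project's actual `DomainTreeNode`/`Match` classes.

# Minimal sketch of "flag instead of skip" duplicate handling.
# Illustrative only: simplified stand-ins for DomainTreeNode/Match.
import typing


class Node:
    def __init__(self) -> None:
        self.children: typing.Dict[str, 'Node'] = dict()
        self.matched: bool = False      # stands in for match_zone.active()
        self.dupplicate: bool = False   # same spelling as in the diff


def set_domain(root: Node, domain: str) -> Node:
    dic = root
    dupplicate = False
    # Walk from the TLD towards the leaf: "com", "example", "tracker"
    for part in reversed(domain.split('.')):
        dic = dic.children.setdefault(part, Node())
        if dic.matched:
            # Previously the entry was dropped here with an early return;
            # now it is kept and merely flagged as a duplicate.
            dupplicate = True
    dic.matched = True
    dic.dupplicate = dupplicate
    return dic


root = Node()
set_domain(root, 'example.com')
leaf = set_domain(root, 'tracker.example.com')
print(leaf.dupplicate)  # True: its zone example.com already matches
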
@@ -25,6 +25,9 @@ if __name__ == '__main__':
     parser.add_argument(
         '-r', '--rules', action='store_true',
         help="TODO")
+    parser.add_argument(
+        '-d', '--no-dupplicates', action='store_true',
+        help="TODO")
     parser.add_argument(
         '-c', '--count', action='store_true',
         help="TODO")
@@ -35,7 +38,9 @@ if __name__ == '__main__':
     if args.count:
         print(DB.count_records(
             first_party_only=args.first_party,
-            rules_only=args.rules))
+            rules_only=args.rules,
+            no_dupplicates=args.no_dupplicates,
+        ))
     else:
         if args.rules:
             for line in DB.list_rules():
@@ -43,6 +48,7 @@ if __name__ == '__main__':
         for domain in DB.export(
             first_party_only=args.first_party,
             end_chain_only=args.end_chain,
+            no_dupplicates=args.no_dupplicates,
            explain=args.explain,
         ):
             print(domain, file=args.output)
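
With `no_dupplicates` wired through `count_records()` and `export()`, the filtering is reachable from Python as well as via the new `-d`/`--no-dupplicates` switch. A short sketch, mirroring what export.py now does and assuming a database that has already been populated and saved by the feed scripts:

# Sketch: calling the new keyword arguments directly.
# Assumes an existing, already-populated database file (e.g. blocking.p).
import database

DB = database.Database()

# Per-type record counts, excluding entries flagged as duplicates
print(DB.count_records(first_party_only=True, no_dupplicates=True))

# First-party end-of-chain rules, again skipping duplicates
for line in DB.export(first_party_only=True,
                      end_chain_only=True,
                      no_dupplicates=True):
    print(line)
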
@@ -6,9 +6,9 @@ function log() {
 
 log "Exporting lists…"
 ./export.py --first-party --output dist/firstparty-trackers.txt
-./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt
+./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt
 ./export.py --output dist/multiparty-trackers.txt
-./export.py --end-chain --output dist/multiparty-only-trackers.txt
+./export.py --end-chain --no-dupplicates --output dist/multiparty-only-trackers.txt
 
 log "Generating statistics…"
 ./export.py --count --first-party > temp/count_recs_firstparty.txt
@@ -48,7 +48,8 @@ if __name__ == '__main__':
         assert isinstance(match, database.AsnNode)
         asn_str = database.Database.unpack_asn(path)
         DB.enter_step('asn_get_name')
-        match.name = get_name(asn_str)
+        name = get_name(asn_str)
+        match.name = name
         DB.enter_step('asn_get_ranges')
         for prefix in get_ranges(asn_str):
             parsed_prefix: IPNetwork = ipaddress.ip_network(prefix)
@@ -58,7 +59,7 @@ if __name__ == '__main__':
                     source=path,
                     updated=int(time.time())
                 )
-                log.info('Added %s from %s (%s)', prefix, asn_str, path)
+                log.info('Added %s from %s (%s)', prefix, path, name)
             elif parsed_prefix.version == 6:
                 log.warning('Unimplemented prefix version: %s', prefix)
             else:
feed_dns.old.py (147 deletions)

@@ -1,147 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import database
-import logging
-import sys
-import typing
-import enum
-
-RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR')
-Record = typing.Tuple[RecordType, int, str, str]
-
-# select, write
-FUNCTION_MAP: typing.Any = {
-    RecordType.A: (
-        database.Database.get_ip4,
-        database.Database.set_hostname,
-    ),
-    RecordType.CNAME: (
-        database.Database.get_domain,
-        database.Database.set_hostname,
-    ),
-    RecordType.PTR: (
-        database.Database.get_domain,
-        database.Database.set_ip4address,
-    ),
-}
-
-
-class Parser():
-    def __init__(self, buf: typing.Any) -> None:
-        self.buf = buf
-        self.log = logging.getLogger('parser')
-        self.db = database.Database()
-
-    def end(self) -> None:
-        self.db.save()
-
-    def register(self,
-                 rtype: RecordType,
-                 updated: int,
-                 name: str,
-                 value: str
-                 ) -> None:
-
-        self.db.enter_step('register')
-        select, write = FUNCTION_MAP[rtype]
-        for source in select(self.db, value):
-            # write(self.db, name, updated, source=source)
-            write(self.db, name, updated)
-
-    def consume(self) -> None:
-        raise NotImplementedError
-
-
-class Rapid7Parser(Parser):
-    TYPES = {
-        'a': RecordType.A,
-        'aaaa': RecordType.AAAA,
-        'cname': RecordType.CNAME,
-        'ptr': RecordType.PTR,
-    }
-
-    def consume(self) -> None:
-        data = dict()
-        for line in self.buf:
-            self.db.enter_step('parse_rapid7')
-            split = line.split('"')
-
-            for k in range(1, 14, 4):
-                key = split[k]
-                val = split[k+2]
-                data[key] = val
-
-            self.register(
-                Rapid7Parser.TYPES[data['type']],
-                int(data['timestamp']),
-                data['name'],
-                data['value']
-            )
-
-
-class DnsMassParser(Parser):
-    # dnsmass --output Snrql
-    # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
-    TYPES = {
-        'A': (RecordType.A, -1, None),
-        'AAAA': (RecordType.AAAA, -1, None),
-        'CNAME': (RecordType.CNAME, -1, -1),
-    }
-
-    def consume(self) -> None:
-        self.db.enter_step('parse_dnsmass')
-        timestamp = 0
-        header = True
-        for line in self.buf:
-            line = line[:-1]
-            if not line:
-                header = True
-                continue
-
-            split = line.split(' ')
-            try:
-                if header:
-                    timestamp = int(split[1])
-                    header = False
-                else:
-                    dtype, name_offset, value_offset = \
-                        DnsMassParser.TYPES[split[1]]
-                    self.register(
-                        dtype,
-                        timestamp,
-                        split[0][:name_offset],
-                        split[2][:value_offset],
-                    )
-                    self.db.enter_step('parse_dnsmass')
-            except KeyError:
-                continue
-
-
-PARSERS = {
-    'rapid7': Rapid7Parser,
-    'dnsmass': DnsMassParser,
-}
-
-if __name__ == '__main__':
-
-    # Parsing arguments
-    log = logging.getLogger('feed_dns')
-    args_parser = argparse.ArgumentParser(
-        description="TODO")
-    args_parser.add_argument(
-        'parser',
-        choices=PARSERS.keys(),
-        help="TODO")
-    args_parser.add_argument(
-        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
-        help="TODO")
-    args = args_parser.parse_args()
-
-    parser = PARSERS[args.parser](args.input)
-    try:
-        parser.consume()
-    except KeyboardInterrupt:
-        pass
-    parser.end()
-
@@ -181,10 +181,10 @@ if __name__ == '__main__':
         '-j', '--workers', type=int, default=4,
         help="TODO")
     args_parser.add_argument(
-        '-b', '--block-size', type=int, default=100,
+        '-b', '--block-size', type=int, default=1024,
         help="TODO")
     args_parser.add_argument(
-        '-q', '--queue-size', type=int, default=10,
+        '-q', '--queue-size', type=int, default=128,
         help="TODO")
     args = args_parser.parse_args()
 
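
The last hunk only raises the defaults for `--block-size` (100 → 1024) and `--queue-size` (10 → 128). The worker code itself is not part of this diff, so the sketch below is an assumption about how such options are typically used: records get batched into blocks before being put on a bounded multiprocessing queue, trading a little memory per in-flight item for far fewer queue round-trips. The `producer` helper and its names are hypothetical, not the real feed_dns.py implementation.

# Illustrative batching pattern for -b/--block-size and -q/--queue-size.
import multiprocessing
import typing


def producer(lines: typing.Iterable[str],
             queue: 'multiprocessing.Queue',
             block_size: int = 1024,
             ) -> None:
    block: typing.List[str] = []
    for line in lines:
        block.append(line)
        if len(block) >= block_size:
            queue.put(block)   # one queue operation per block, not per line
            block = []
    if block:
        queue.put(block)


if __name__ == '__main__':
    # maxsize mirrors the new --queue-size default; it bounds how many
    # blocks can wait between the reader and the worker processes.
    queue = multiprocessing.Queue(maxsize=128)
    producer(('record %d' % i for i in range(300)), queue, block_size=128)
    for _ in range(3):
        print(len(queue.get()))  # 128, 128, 44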