Reworked rule export

This commit is contained in:
Geoffrey Frogeye 2019-12-17 13:29:02 +01:00
parent 8f6e01c857
commit 7851b038f5
Signed by: geoffrey
GPG key ID: D8A7ECA00A8CD3DD
4 changed files with 75 additions and 44 deletions

View file

@ -92,7 +92,9 @@ class Match():
class AsnNode(Match): class AsnNode(Match):
pass def __init__(self) -> None:
Match.__init__(self)
self.name = ''
class DomainTreeNode(): class DomainTreeNode():
@ -111,8 +113,7 @@ class IpTreeNode(Match):
Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode] Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode]
MatchCallable = typing.Callable[[Path, MatchCallable = typing.Callable[[Path,
Match, Match],
typing.Optional[typing.Any]],
typing.Any] typing.Any]
@ -284,7 +285,6 @@ class Database(Profiler):
def exec_each_asn(self, def exec_each_asn(self,
callback: MatchCallable, callback: MatchCallable,
arg: typing.Any = None,
) -> typing.Any: ) -> typing.Any:
for asn in self.asns: for asn in self.asns:
match = self.asns[asn] match = self.asns[asn]
@ -292,7 +292,6 @@ class Database(Profiler):
c = callback( c = callback(
AsnPath(asn), AsnPath(asn),
match, match,
arg
) )
try: try:
yield from c yield from c
@ -301,7 +300,6 @@ class Database(Profiler):
def exec_each_domain(self, def exec_each_domain(self,
callback: MatchCallable, callback: MatchCallable,
arg: typing.Any = None,
_dic: DomainTreeNode = None, _dic: DomainTreeNode = None,
_par: DomainPath = None, _par: DomainPath = None,
) -> typing.Any: ) -> typing.Any:
@ -311,7 +309,6 @@ class Database(Profiler):
c = callback( c = callback(
HostnamePath(_par.parts), HostnamePath(_par.parts),
_dic.match_hostname, _dic.match_hostname,
arg
) )
try: try:
yield from c yield from c
@ -321,7 +318,6 @@ class Database(Profiler):
c = callback( c = callback(
ZonePath(_par.parts), ZonePath(_par.parts),
_dic.match_zone, _dic.match_zone,
arg
) )
try: try:
yield from c yield from c
@ -331,14 +327,12 @@ class Database(Profiler):
dic = _dic.children[part] dic = _dic.children[part]
yield from self.exec_each_domain( yield from self.exec_each_domain(
callback, callback,
arg,
_dic=dic, _dic=dic,
_par=DomainPath(_par.parts + [part]) _par=DomainPath(_par.parts + [part])
) )
def exec_each_ip4(self, def exec_each_ip4(self,
callback: MatchCallable, callback: MatchCallable,
arg: typing.Any = None,
_dic: IpTreeNode = None, _dic: IpTreeNode = None,
_par: Ip4Path = None, _par: Ip4Path = None,
) -> typing.Any: ) -> typing.Any:
@ -348,7 +342,6 @@ class Database(Profiler):
c = callback( c = callback(
_par, _par,
_dic, _dic,
arg
) )
try: try:
yield from c yield from c
@ -363,7 +356,6 @@ class Database(Profiler):
assert addr0 == _par.value assert addr0 == _par.value
yield from self.exec_each_ip4( yield from self.exec_each_ip4(
callback, callback,
arg,
_dic=dic, _dic=dic,
_par=Ip4Path(addr0, pref) _par=Ip4Path(addr0, pref)
) )
@ -373,14 +365,12 @@ class Database(Profiler):
addr1 = _par.value | (1 << (32-pref)) addr1 = _par.value | (1 << (32-pref))
yield from self.exec_each_ip4( yield from self.exec_each_ip4(
callback, callback,
arg,
_dic=dic, _dic=dic,
_par=Ip4Path(addr1, pref) _par=Ip4Path(addr1, pref)
) )
def exec_each(self, def exec_each(self,
callback: MatchCallable, callback: MatchCallable,
arg: typing.Any = None,
) -> typing.Any: ) -> typing.Any:
yield from self.exec_each_domain(callback) yield from self.exec_each_domain(callback)
yield from self.exec_each_ip4(callback) yield from self.exec_each_ip4(callback)
@ -390,19 +380,19 @@ class Database(Profiler):
# Should be correctly calculated normally, # Should be correctly calculated normally,
# keeping this just in case # keeping this just in case
def reset_references_cb(path: Path, def reset_references_cb(path: Path,
match: Match, _: typing.Any match: Match
) -> None: ) -> None:
match.references = 0 match.references = 0
for _ in self.exec_each(reset_references_cb, None): for _ in self.exec_each(reset_references_cb):
pass pass
def increment_references_cb(path: Path, def increment_references_cb(path: Path,
match: Match, _: typing.Any match: Match
) -> None: ) -> None:
if match.source: if match.source:
source = self.get_match(match.source) source = self.get_match(match.source)
source.references += 1 source.references += 1
for _ in self.exec_each(increment_references_cb, None): for _ in self.exec_each(increment_references_cb):
pass pass
def prune(self, before: int, base_only: bool = False) -> None: def prune(self, before: int, base_only: bool = False) -> None:
@ -410,6 +400,9 @@ class Database(Profiler):
def explain(self, path: Path) -> str: def explain(self, path: Path) -> str:
match = self.get_match(path) match = self.get_match(path)
if isinstance(match, AsnNode):
string = f'{path} ({match.name}) #{match.references}'
else:
string = f'{path} #{match.references}' string = f'{path} #{match.references}'
if match.source: if match.source:
string += f'{self.explain(match.source)}' string += f'{self.explain(match.source)}'
@ -421,7 +414,7 @@ class Database(Profiler):
explain: bool = False, explain: bool = False,
) -> typing.Iterable[str]: ) -> typing.Iterable[str]:
def export_cb(path: Path, match: Match, _: typing.Any def export_cb(path: Path, match: Match
) -> typing.Iterable[str]: ) -> typing.Iterable[str]:
assert isinstance(path, DomainPath) assert isinstance(path, DomainPath)
if not isinstance(path, HostnamePath): if not isinstance(path, HostnamePath):
@ -435,27 +428,49 @@ class Database(Profiler):
else: else:
yield self.unpack_domain(path) yield self.unpack_domain(path)
yield from self.exec_each_domain(export_cb, None) yield from self.exec_each_domain(export_cb)
def list_rules(self, def list_rules(self,
first_party_only: bool = False, first_party_only: bool = False,
) -> typing.Iterable[str]: ) -> typing.Iterable[str]:
def list_rules_cb(path: Path, match: Match, _: typing.Any def list_rules_cb(path: Path, match: Match
) -> typing.Iterable[str]: ) -> typing.Iterable[str]:
if first_party_only and not match.first_party: if first_party_only and not match.first_party:
return return
if isinstance(path, ZonePath) \ if isinstance(path, ZonePath) \
or (isinstance(path, Ip4Path) and path.prefixlen < 32): or (isinstance(path, Ip4Path) and path.prefixlen < 32):
# if match.level == 0: # if match.level == 1:
# It should be the latter condition but it is more
# useful when using the former
yield self.explain(path) yield self.explain(path)
yield from self.exec_each(list_rules_cb, None) yield from self.exec_each(list_rules_cb)
def count_rules(self, def count_records(self,
first_party_only: bool = False, first_party_only: bool = False,
rules_only: bool = False,
) -> str: ) -> str:
raise NotImplementedError memo: typing.Dict[str, int] = dict()
def count_records_cb(path: Path, match: Match) -> None:
    """Tally one record in ``memo`` under the class name of its path.

    A record is ignored when ``first_party_only`` is set and the match
    is not first-party, or when ``rules_only`` is set and the match
    level is greater than 1.
    """
    # Both filters are evaluated lazily and in this order.
    if (first_party_only and not match.first_party) \
            or (rules_only and match.level > 1):
        return
    kind = path.__class__.__name__
    memo[kind] = memo.get(kind, 0) + 1
for _ in self.exec_each(count_records_cb):
pass
split: typing.List[str] = list()
for key, value in sorted(memo.items(), key=lambda s: s[0]):
split.append(f'{key[:-4]}: {value}')
return ', '.join(split)
def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]: def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:
self.enter_step('get_domain_pack') self.enter_step('get_domain_pack')
@ -486,7 +501,6 @@ class Database(Profiler):
dic = self.ip4tree dic = self.ip4tree
for i in range(31, 31-ip4.prefixlen, -1): for i in range(31, 31-ip4.prefixlen, -1):
bit = (ip4.value >> i) & 0b1 bit = (ip4.value >> i) & 0b1
# TODO PERF copy value and slide once every loop
if dic.active(): if dic.active():
self.enter_step('get_ip4_yield') self.enter_step('get_ip4_yield')
yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i) yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i)

View file

@ -32,15 +32,14 @@ if __name__ == '__main__':
DB = database.Database() DB = database.Database()
if args.rules:
if args.count: if args.count:
print(DB.count_rules(first_party_only=args.first_party)) print(DB.count_records(
first_party_only=args.first_party,
rules_only=args.rules))
else: else:
if args.rules:
for line in DB.list_rules(): for line in DB.list_rules():
print(line) print(line)
else:
if args.count:
raise NotImplementedError
for domain in DB.export( for domain in DB.export(
first_party_only=args.first_party, first_party_only=args.first_party,
end_chain_only=args.end_chain, end_chain_only=args.end_chain,

View file

@ -4,21 +4,25 @@ function log() {
echo -e "\033[33m$@\033[0m" echo -e "\033[33m$@\033[0m"
} }
log "Pruning old data…"
./database.py --prune
log "Recounting references…"
./database.py --references
log "Exporting lists…" log "Exporting lists…"
./export.py --first-party --output dist/firstparty-trackers.txt ./export.py --first-party --output dist/firstparty-trackers.txt
./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt ./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt
./export.py --output dist/multiparty-trackers.txt ./export.py --output dist/multiparty-trackers.txt
./export.py --end-chain --output dist/multiparty-only-trackers.txt ./export.py --end-chain --output dist/multiparty-only-trackers.txt
log "Generating hosts lists…" log "Generating statistics…"
./export.py --count --first-party > temp/count_recs_firstparty.txt
./export.py --count > temp/count_recs_multiparty.txt
./export.py --rules --count --first-party > temp/count_rules_firstparty.txt ./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
./export.py --rules --count > temp/count_rules_multiparty.txt ./export.py --rules --count > temp/count_rules_multiparty.txt
log "Sorting lists…"
sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt
sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt
sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt
sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt
log "Generating hosts lists…"
function generate_hosts { function generate_hosts {
basename="$1" basename="$1"
description="$2" description="$2"
@ -46,13 +50,15 @@ function generate_hosts {
echo "# Generation software: eulaurarien $(git describe --tags)" echo "# Generation software: eulaurarien $(git describe --tags)"
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of source DNS records: ~2M + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO
echo "#" echo "#"
echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)" echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)"
echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
echo "#" echo "#"
echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)" echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)"
echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
echo echo

View file

@ -21,6 +21,15 @@ def get_ranges(asn: str) -> typing.Iterable[str]:
yield pref['prefix'] yield pref['prefix']
def get_name(asn: str, timeout: float = 30.0) -> str:
    """Return the holder name RIPEstat reports for an autonomous system.

    Calls the RIPEstat ``as-overview`` data endpoint and returns the
    ``data.holder`` field of the JSON response.

    Args:
        asn: AS identifier, sent as the ``resource`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the whole feed run.

    Returns:
        The value of ``data.holder`` from the response.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        KeyError: If the response JSON has no ``data.holder`` entry.
    """
    req = requests.get(
        'https://stat.ripe.net/data/as-overview/data.json',
        params={'resource': asn},
        timeout=timeout,
    )
    # Surface HTTP errors here instead of as an obscure JSON/KeyError below.
    req.raise_for_status()
    data = req.json()
    return data['data']['holder']
if __name__ == '__main__': if __name__ == '__main__':
log = logging.getLogger('feed_asn') log = logging.getLogger('feed_asn')
@ -34,9 +43,12 @@ if __name__ == '__main__':
def add_ranges(path: database.Path, def add_ranges(path: database.Path,
match: database.Match, match: database.Match,
_: typing.Any) -> None: ) -> None:
assert isinstance(path, database.AsnPath) assert isinstance(path, database.AsnPath)
assert isinstance(match, database.AsnNode)
asn_str = database.Database.unpack_asn(path) asn_str = database.Database.unpack_asn(path)
DB.enter_step('asn_get_name')
match.name = get_name(asn_str)
DB.enter_step('asn_get_ranges') DB.enter_step('asn_get_ranges')
for prefix in get_ranges(asn_str): for prefix in get_ranges(asn_str):
parsed_prefix: IPNetwork = ipaddress.ip_network(prefix) parsed_prefix: IPNetwork = ipaddress.ip_network(prefix)
@ -52,7 +64,7 @@ if __name__ == '__main__':
else: else:
log.error('Unknown prefix version: %s', prefix) log.error('Unknown prefix version: %s', prefix)
for _ in DB.exec_each_asn(add_ranges, None): for _ in DB.exec_each_asn(add_ranges):
pass pass
DB.save() DB.save()