Reworked rule export
This commit is contained in:
parent
8f6e01c857
commit
7851b038f5
68
database.py
68
database.py
|
@ -92,7 +92,9 @@ class Match():
|
||||||
|
|
||||||
|
|
||||||
class AsnNode(Match):
    """Match entry for an autonomous system, extended with a name."""

    def __init__(self) -> None:
        super().__init__()
        # Name of the AS; starts out as an empty string.
        self.name = ''
|
||||||
|
|
||||||
|
|
||||||
class DomainTreeNode():
|
class DomainTreeNode():
|
||||||
|
@ -111,8 +113,7 @@ class IpTreeNode(Match):
|
||||||
|
|
||||||
Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode]
|
Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode]
|
||||||
MatchCallable = typing.Callable[[Path,
|
MatchCallable = typing.Callable[[Path,
|
||||||
Match,
|
Match],
|
||||||
typing.Optional[typing.Any]],
|
|
||||||
typing.Any]
|
typing.Any]
|
||||||
|
|
||||||
|
|
||||||
|
@ -284,7 +285,6 @@ class Database(Profiler):
|
||||||
|
|
||||||
def exec_each_asn(self,
|
def exec_each_asn(self,
|
||||||
callback: MatchCallable,
|
callback: MatchCallable,
|
||||||
arg: typing.Any = None,
|
|
||||||
) -> typing.Any:
|
) -> typing.Any:
|
||||||
for asn in self.asns:
|
for asn in self.asns:
|
||||||
match = self.asns[asn]
|
match = self.asns[asn]
|
||||||
|
@ -292,7 +292,6 @@ class Database(Profiler):
|
||||||
c = callback(
|
c = callback(
|
||||||
AsnPath(asn),
|
AsnPath(asn),
|
||||||
match,
|
match,
|
||||||
arg
|
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
yield from c
|
yield from c
|
||||||
|
@ -301,7 +300,6 @@ class Database(Profiler):
|
||||||
|
|
||||||
def exec_each_domain(self,
|
def exec_each_domain(self,
|
||||||
callback: MatchCallable,
|
callback: MatchCallable,
|
||||||
arg: typing.Any = None,
|
|
||||||
_dic: DomainTreeNode = None,
|
_dic: DomainTreeNode = None,
|
||||||
_par: DomainPath = None,
|
_par: DomainPath = None,
|
||||||
) -> typing.Any:
|
) -> typing.Any:
|
||||||
|
@ -311,7 +309,6 @@ class Database(Profiler):
|
||||||
c = callback(
|
c = callback(
|
||||||
HostnamePath(_par.parts),
|
HostnamePath(_par.parts),
|
||||||
_dic.match_hostname,
|
_dic.match_hostname,
|
||||||
arg
|
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
yield from c
|
yield from c
|
||||||
|
@ -321,7 +318,6 @@ class Database(Profiler):
|
||||||
c = callback(
|
c = callback(
|
||||||
ZonePath(_par.parts),
|
ZonePath(_par.parts),
|
||||||
_dic.match_zone,
|
_dic.match_zone,
|
||||||
arg
|
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
yield from c
|
yield from c
|
||||||
|
@ -331,14 +327,12 @@ class Database(Profiler):
|
||||||
dic = _dic.children[part]
|
dic = _dic.children[part]
|
||||||
yield from self.exec_each_domain(
|
yield from self.exec_each_domain(
|
||||||
callback,
|
callback,
|
||||||
arg,
|
|
||||||
_dic=dic,
|
_dic=dic,
|
||||||
_par=DomainPath(_par.parts + [part])
|
_par=DomainPath(_par.parts + [part])
|
||||||
)
|
)
|
||||||
|
|
||||||
def exec_each_ip4(self,
|
def exec_each_ip4(self,
|
||||||
callback: MatchCallable,
|
callback: MatchCallable,
|
||||||
arg: typing.Any = None,
|
|
||||||
_dic: IpTreeNode = None,
|
_dic: IpTreeNode = None,
|
||||||
_par: Ip4Path = None,
|
_par: Ip4Path = None,
|
||||||
) -> typing.Any:
|
) -> typing.Any:
|
||||||
|
@ -348,7 +342,6 @@ class Database(Profiler):
|
||||||
c = callback(
|
c = callback(
|
||||||
_par,
|
_par,
|
||||||
_dic,
|
_dic,
|
||||||
arg
|
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
yield from c
|
yield from c
|
||||||
|
@ -363,7 +356,6 @@ class Database(Profiler):
|
||||||
assert addr0 == _par.value
|
assert addr0 == _par.value
|
||||||
yield from self.exec_each_ip4(
|
yield from self.exec_each_ip4(
|
||||||
callback,
|
callback,
|
||||||
arg,
|
|
||||||
_dic=dic,
|
_dic=dic,
|
||||||
_par=Ip4Path(addr0, pref)
|
_par=Ip4Path(addr0, pref)
|
||||||
)
|
)
|
||||||
|
@ -373,14 +365,12 @@ class Database(Profiler):
|
||||||
addr1 = _par.value | (1 << (32-pref))
|
addr1 = _par.value | (1 << (32-pref))
|
||||||
yield from self.exec_each_ip4(
|
yield from self.exec_each_ip4(
|
||||||
callback,
|
callback,
|
||||||
arg,
|
|
||||||
_dic=dic,
|
_dic=dic,
|
||||||
_par=Ip4Path(addr1, pref)
|
_par=Ip4Path(addr1, pref)
|
||||||
)
|
)
|
||||||
|
|
||||||
def exec_each(self,
|
def exec_each(self,
|
||||||
callback: MatchCallable,
|
callback: MatchCallable,
|
||||||
arg: typing.Any = None,
|
|
||||||
) -> typing.Any:
|
) -> typing.Any:
|
||||||
yield from self.exec_each_domain(callback)
|
yield from self.exec_each_domain(callback)
|
||||||
yield from self.exec_each_ip4(callback)
|
yield from self.exec_each_ip4(callback)
|
||||||
|
@ -390,19 +380,19 @@ class Database(Profiler):
|
||||||
# Should be correctly calculated normally,
|
# Should be correctly calculated normally,
|
||||||
# keeping this just in case
|
# keeping this just in case
|
||||||
def reset_references_cb(path: Path,
|
def reset_references_cb(path: Path,
|
||||||
match: Match, _: typing.Any
|
match: Match
|
||||||
) -> None:
|
) -> None:
|
||||||
match.references = 0
|
match.references = 0
|
||||||
for _ in self.exec_each(reset_references_cb, None):
|
for _ in self.exec_each(reset_references_cb):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def increment_references_cb(path: Path,
|
def increment_references_cb(path: Path,
|
||||||
match: Match, _: typing.Any
|
match: Match
|
||||||
) -> None:
|
) -> None:
|
||||||
if match.source:
|
if match.source:
|
||||||
source = self.get_match(match.source)
|
source = self.get_match(match.source)
|
||||||
source.references += 1
|
source.references += 1
|
||||||
for _ in self.exec_each(increment_references_cb, None):
|
for _ in self.exec_each(increment_references_cb):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def prune(self, before: int, base_only: bool = False) -> None:
|
def prune(self, before: int, base_only: bool = False) -> None:
|
||||||
|
@ -410,7 +400,10 @@ class Database(Profiler):
|
||||||
|
|
||||||
def explain(self, path: Path) -> str:
    """Describe ``path`` and the chain of sources that led to it.

    Each link is rendered as ``<path> #<references>``; AS nodes also show
    their name in parentheses. When the match has a source, the
    explanation of that source is appended after a `` ← `` separator.
    """
    node = self.get_match(path)
    # Only AS nodes carry a name worth displaying.
    label = f' ({node.name})' if isinstance(node, AsnNode) else ''
    links = [f'{path}{label} #{node.references}']
    if node.source:
        links.append(self.explain(node.source))
    return ' ← '.join(links)
|
||||||
|
@ -421,7 +414,7 @@ class Database(Profiler):
|
||||||
explain: bool = False,
|
explain: bool = False,
|
||||||
) -> typing.Iterable[str]:
|
) -> typing.Iterable[str]:
|
||||||
|
|
||||||
def export_cb(path: Path, match: Match, _: typing.Any
|
def export_cb(path: Path, match: Match
|
||||||
) -> typing.Iterable[str]:
|
) -> typing.Iterable[str]:
|
||||||
assert isinstance(path, DomainPath)
|
assert isinstance(path, DomainPath)
|
||||||
if not isinstance(path, HostnamePath):
|
if not isinstance(path, HostnamePath):
|
||||||
|
@ -435,27 +428,49 @@ class Database(Profiler):
|
||||||
else:
|
else:
|
||||||
yield self.unpack_domain(path)
|
yield self.unpack_domain(path)
|
||||||
|
|
||||||
yield from self.exec_each_domain(export_cb, None)
|
yield from self.exec_each_domain(export_cb)
|
||||||
|
|
||||||
def list_rules(self,
               first_party_only: bool = False,
               ) -> typing.Iterable[str]:
    """Yield an explanation line for every zone and IPv4 network entry.

    Entries whose match is not first-party are skipped when
    ``first_party_only`` is set.
    """

    def list_rules_cb(path: Path, match: Match
                      ) -> typing.Iterable[str]:
        if first_party_only and not match.first_party:
            return
        is_zone = isinstance(path, ZonePath)
        is_network = isinstance(path, Ip4Path) and path.prefixlen < 32
        # NOTE: testing `match.level == 1` would be the stricter criterion,
        # but the path-type test is kept because its output is more useful.
        if not (is_zone or is_network):
            return
        yield self.explain(path)

    yield from self.exec_each(list_rules_cb)
|
||||||
|
|
||||||
def count_records(self,
                  first_party_only: bool = False,
                  rules_only: bool = False,
                  ) -> str:
    """Summarise how many entries exist per path type.

    The result is a comma-separated list of ``<Type>: <count>`` pairs,
    sorted by type, where ``<Type>`` is the path class name with its last
    four characters (the ``Path`` suffix) removed.

    ``first_party_only`` skips matches that are not first-party;
    ``rules_only`` skips matches whose level is greater than 1.
    """
    tally: typing.Dict[str, int] = {}

    def count_records_cb(path: Path, match: Match) -> None:
        if first_party_only and not match.first_party:
            return
        if rules_only and match.level > 1:
            return
        kind = path.__class__.__name__
        tally[kind] = tally.get(kind, 0) + 1

    # The callback yields nothing; iterating only drives the traversal.
    for _ in self.exec_each(count_records_cb):
        pass

    # Keys are unique, so sorting the items orders them by class name.
    return ', '.join(
        f'{kind[:-4]}: {count}' for kind, count in sorted(tally.items())
    )
|
||||||
|
|
||||||
def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:
|
def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:
|
||||||
self.enter_step('get_domain_pack')
|
self.enter_step('get_domain_pack')
|
||||||
|
@ -486,7 +501,6 @@ class Database(Profiler):
|
||||||
dic = self.ip4tree
|
dic = self.ip4tree
|
||||||
for i in range(31, 31-ip4.prefixlen, -1):
|
for i in range(31, 31-ip4.prefixlen, -1):
|
||||||
bit = (ip4.value >> i) & 0b1
|
bit = (ip4.value >> i) & 0b1
|
||||||
# TODO PERF copy value and slide once every loop
|
|
||||||
if dic.active():
|
if dic.active():
|
||||||
self.enter_step('get_ip4_yield')
|
self.enter_step('get_ip4_yield')
|
||||||
yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i)
|
yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i)
|
||||||
|
|
13
export.py
13
export.py
|
@ -32,15 +32,14 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
DB = database.Database()
|
DB = database.Database()
|
||||||
|
|
||||||
if args.rules:
|
if args.count:
|
||||||
if args.count:
|
print(DB.count_records(
|
||||||
print(DB.count_rules(first_party_only=args.first_party))
|
first_party_only=args.first_party,
|
||||||
else:
|
rules_only=args.rules))
|
||||||
|
else:
|
||||||
|
if args.rules:
|
||||||
for line in DB.list_rules():
|
for line in DB.list_rules():
|
||||||
print(line)
|
print(line)
|
||||||
else:
|
|
||||||
if args.count:
|
|
||||||
raise NotImplementedError
|
|
||||||
for domain in DB.export(
|
for domain in DB.export(
|
||||||
first_party_only=args.first_party,
|
first_party_only=args.first_party,
|
||||||
end_chain_only=args.end_chain,
|
end_chain_only=args.end_chain,
|
||||||
|
|
|
@ -4,21 +4,25 @@ function log() {
|
||||||
echo -e "\033[33m$@\033[0m"
|
echo -e "\033[33m$@\033[0m"
|
||||||
}
|
}
|
||||||
|
|
||||||
log "Pruning old data…"
|
|
||||||
./database.py --prune
|
|
||||||
|
|
||||||
log "Recounting references…"
|
|
||||||
./database.py --references
|
|
||||||
|
|
||||||
log "Exporting lists…"
|
log "Exporting lists…"
|
||||||
./export.py --first-party --output dist/firstparty-trackers.txt
|
./export.py --first-party --output dist/firstparty-trackers.txt
|
||||||
./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt
|
./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt
|
||||||
./export.py --output dist/multiparty-trackers.txt
|
./export.py --output dist/multiparty-trackers.txt
|
||||||
./export.py --end-chain --output dist/multiparty-only-trackers.txt
|
./export.py --end-chain --output dist/multiparty-only-trackers.txt
|
||||||
|
|
||||||
log "Generating hosts lists…"
|
log "Generating statistics…"
|
||||||
|
./export.py --count --first-party > temp/count_recs_firstparty.txt
|
||||||
|
./export.py --count > temp/count_recs_multiparty.txt
|
||||||
./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
|
./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
|
||||||
./export.py --rules --count > temp/count_rules_multiparty.txt
|
./export.py --rules --count > temp/count_rules_multiparty.txt
|
||||||
|
|
||||||
|
log "Sorting lists…"
|
||||||
|
sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt
|
||||||
|
sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt
|
||||||
|
sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt
|
||||||
|
sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt
|
||||||
|
|
||||||
|
log "Generating hosts lists…"
|
||||||
function generate_hosts {
|
function generate_hosts {
|
||||||
basename="$1"
|
basename="$1"
|
||||||
description="$2"
|
description="$2"
|
||||||
|
@ -46,13 +50,15 @@ function generate_hosts {
|
||||||
echo "# Generation software: eulaurarien $(git describe --tags)"
|
echo "# Generation software: eulaurarien $(git describe --tags)"
|
||||||
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
|
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
|
||||||
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
|
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
|
||||||
echo "# Number of source DNS records: ~2M + $(wc -l temp/all_resolved.json | cut -d' ' -f1)"
|
echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO
|
||||||
echo "#"
|
echo "#"
|
||||||
echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
|
echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
|
||||||
|
echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)"
|
||||||
echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
|
echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
|
||||||
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
|
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
|
||||||
echo "#"
|
echo "#"
|
||||||
echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
|
echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
|
||||||
|
echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)"
|
||||||
echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
|
echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
|
||||||
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
|
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
|
||||||
echo
|
echo
|
16
feed_asn.py
16
feed_asn.py
|
@ -21,6 +21,15 @@ def get_ranges(asn: str) -> typing.Iterable[str]:
|
||||||
yield pref['prefix']
|
yield pref['prefix']
|
||||||
|
|
||||||
|
|
||||||
|
def get_name(asn: str) -> str:
    """Return the holder name that RIPEstat reports for ``asn``.

    Queries the RIPEstat ``as-overview`` data call with ``asn`` as the
    resource and returns the ``holder`` field of the response's ``data``
    object.

    Raises:
        requests.HTTPError: the service answered with an error status.
        requests.Timeout: the service did not answer in time.
        KeyError: the response lacks the expected ``data``/``holder`` keys.
    """
    req = requests.get(
        'https://stat.ripe.net/data/as-overview/data.json',
        params={'resource': asn},
        # requests never times out by default; a stalled connection would
        # otherwise block the whole feed run.
        timeout=30,
    )
    # Surface HTTP errors here rather than as a confusing JSON or KeyError
    # further down.
    req.raise_for_status()
    data = req.json()
    return data['data']['holder']
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
log = logging.getLogger('feed_asn')
|
log = logging.getLogger('feed_asn')
|
||||||
|
@ -34,9 +43,12 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
def add_ranges(path: database.Path,
|
def add_ranges(path: database.Path,
|
||||||
match: database.Match,
|
match: database.Match,
|
||||||
_: typing.Any) -> None:
|
) -> None:
|
||||||
assert isinstance(path, database.AsnPath)
|
assert isinstance(path, database.AsnPath)
|
||||||
|
assert isinstance(match, database.AsnNode)
|
||||||
asn_str = database.Database.unpack_asn(path)
|
asn_str = database.Database.unpack_asn(path)
|
||||||
|
DB.enter_step('asn_get_name')
|
||||||
|
match.name = get_name(asn_str)
|
||||||
DB.enter_step('asn_get_ranges')
|
DB.enter_step('asn_get_ranges')
|
||||||
for prefix in get_ranges(asn_str):
|
for prefix in get_ranges(asn_str):
|
||||||
parsed_prefix: IPNetwork = ipaddress.ip_network(prefix)
|
parsed_prefix: IPNetwork = ipaddress.ip_network(prefix)
|
||||||
|
@ -52,7 +64,7 @@ if __name__ == '__main__':
|
||||||
else:
|
else:
|
||||||
log.error('Unknown prefix version: %s', prefix)
|
log.error('Unknown prefix version: %s', prefix)
|
||||||
|
|
||||||
for _ in DB.exec_each_asn(add_ranges, None):
|
for _ in DB.exec_each_asn(add_ranges):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
DB.save()
|
DB.save()
|
||||||
|
|
Loading…
Reference in a new issue