Browse Source

Reworked rule export

newworkflow
Geoffrey Frogeye 2 years ago
parent
commit
7851b038f5
Signed by: geoffrey GPG Key ID: D8A7ECA00A8CD3DD
  1. 68
      database.py
  2. 13
      export.py
  3. 22
      export_lists.sh
  4. 16
      feed_asn.py

68
database.py

@ -92,7 +92,9 @@ class Match():
class AsnNode(Match):
    """Match record attached to an autonomous system number.

    Extends ``Match`` with a ``name`` attribute, which starts out as an
    empty string.
    """

    def __init__(self) -> None:
        # Initialise the inherited Match state before adding our own field.
        super().__init__()
        # Display name of the AS; empty until something assigns it.
        self.name: str = ''
class DomainTreeNode():
@ -111,8 +113,7 @@ class IpTreeNode(Match):
Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode]
MatchCallable = typing.Callable[[Path,
Match,
typing.Optional[typing.Any]],
Match],
typing.Any]
@ -284,7 +285,6 @@ class Database(Profiler):
def exec_each_asn(self,
callback: MatchCallable,
arg: typing.Any = None,
) -> typing.Any:
for asn in self.asns:
match = self.asns[asn]
@ -292,7 +292,6 @@ class Database(Profiler):
c = callback(
AsnPath(asn),
match,
arg
)
try:
yield from c
@ -301,7 +300,6 @@ class Database(Profiler):
def exec_each_domain(self,
callback: MatchCallable,
arg: typing.Any = None,
_dic: DomainTreeNode = None,
_par: DomainPath = None,
) -> typing.Any:
@ -311,7 +309,6 @@ class Database(Profiler):
c = callback(
HostnamePath(_par.parts),
_dic.match_hostname,
arg
)
try:
yield from c
@ -321,7 +318,6 @@ class Database(Profiler):
c = callback(
ZonePath(_par.parts),
_dic.match_zone,
arg
)
try:
yield from c
@ -331,14 +327,12 @@ class Database(Profiler):
dic = _dic.children[part]
yield from self.exec_each_domain(
callback,
arg,
_dic=dic,
_par=DomainPath(_par.parts + [part])
)
def exec_each_ip4(self,
callback: MatchCallable,
arg: typing.Any = None,
_dic: IpTreeNode = None,
_par: Ip4Path = None,
) -> typing.Any:
@ -348,7 +342,6 @@ class Database(Profiler):
c = callback(
_par,
_dic,
arg
)
try:
yield from c
@ -363,7 +356,6 @@ class Database(Profiler):
assert addr0 == _par.value
yield from self.exec_each_ip4(
callback,
arg,
_dic=dic,
_par=Ip4Path(addr0, pref)
)
@ -373,14 +365,12 @@ class Database(Profiler):
addr1 = _par.value | (1 << (32-pref))
yield from self.exec_each_ip4(
callback,
arg,
_dic=dic,
_par=Ip4Path(addr1, pref)
)
def exec_each(self,
callback: MatchCallable,
arg: typing.Any = None,
) -> typing.Any:
yield from self.exec_each_domain(callback)
yield from self.exec_each_ip4(callback)
@ -390,19 +380,19 @@ class Database(Profiler):
# Should be correctly calculated normally,
# keeping this just in case
def reset_references_cb(path: Path,
match: Match, _: typing.Any
match: Match
) -> None:
match.references = 0
for _ in self.exec_each(reset_references_cb, None):
for _ in self.exec_each(reset_references_cb):
pass
def increment_references_cb(path: Path,
match: Match, _: typing.Any
match: Match
) -> None:
if match.source:
source = self.get_match(match.source)
source.references += 1
for _ in self.exec_each(increment_references_cb, None):
for _ in self.exec_each(increment_references_cb):
pass
def prune(self, before: int, base_only: bool = False) -> None:
@ -410,7 +400,10 @@ class Database(Profiler):
def explain(self, path: Path) -> str:
    """Describe ``path`` and the chain of sources that led to it.

    The description is ``"<path> #<references>"``; for an ``AsnNode`` the
    node's name is inserted in parentheses after the path. When the match
    has a ``source``, the description of that source is appended after
    an arrow, recursively.

    :param path: Path whose match should be described.
    :return: The human-readable description string.
    """
    node = self.get_match(path)
    description = (
        f'{path} ({node.name}) #{node.references}'
        if isinstance(node, AsnNode)
        else f'{path} #{node.references}'
    )
    if node.source:
        # Walk up the chain: each source is explained the same way.
        description += f' ← {self.explain(node.source)}'
    return description
@ -421,7 +414,7 @@ class Database(Profiler):
explain: bool = False,
) -> typing.Iterable[str]:
def export_cb(path: Path, match: Match, _: typing.Any
def export_cb(path: Path, match: Match
) -> typing.Iterable[str]:
assert isinstance(path, DomainPath)
if not isinstance(path, HostnamePath):
@ -435,27 +428,49 @@ class Database(Profiler):
else:
yield self.unpack_domain(path)
yield from self.exec_each_domain(export_cb, None)
yield from self.exec_each_domain(export_cb)
def list_rules(self,
               first_party_only: bool = False,
               ) -> typing.Iterable[str]:
    """Yield an explanation line for every rule-like entry.

    An entry is listed when its path is a ``ZonePath``, or an ``Ip4Path``
    whose prefix length is below 32. Each listed entry is rendered with
    ``self.explain``.

    :param first_party_only: When true, skip matches whose
        ``first_party`` attribute is falsy.
    :return: An iterable of explanation strings.
    """

    def list_rules_cb(path: Path, match: Match
                      ) -> typing.Iterable[str]:
        # Visitor handed to exec_each; it is a generator so that the
        # traversal can forward whatever it yields.
        keep = not first_party_only or match.first_party
        if not keep:
            return
        # NOTE: the original author remarks that testing
        # ``match.level == 1`` would be the more correct condition,
        # but that this path-type test is more useful in practice.
        looks_like_rule = isinstance(path, ZonePath) or (
            isinstance(path, Ip4Path) and path.prefixlen < 32
        )
        if looks_like_rule:
            yield self.explain(path)

    yield from self.exec_each(list_rules_cb)
def count_records(self,
                  first_party_only: bool = False,
                  rules_only: bool = False,
                  ) -> str:
    """Summarise how many records of each path type are stored.

    Every entry visited by ``self.exec_each`` is counted under the class
    name of its path. The result lists the counts sorted by class name,
    formatted as ``"<name>: <count>"`` and joined with ``", "``; the last
    four characters of each class name are dropped from the label.

    :param first_party_only: When true, ignore matches whose
        ``first_party`` attribute is falsy.
    :param rules_only: When true, ignore matches whose ``level`` is
        greater than 1.
    :return: The comma-separated summary (empty string if nothing was
        counted).
    """
    tally: typing.Dict[str, int] = {}

    def count_records_cb(path: Path, match: Match) -> None:
        # Same filter order as before: first-party check first, so
        # ``match.level`` is only read when that check lets the entry by.
        ignored = (first_party_only and not match.first_party) \
            or (rules_only and match.level > 1)
        if ignored:
            return
        kind = path.__class__.__name__
        tally[kind] = tally.get(kind, 0) + 1

    # The traversal is a generator; drain it so the callback runs on
    # every entry.
    for _ in self.exec_each(count_records_cb):
        pass

    # Keys are unique, so sorting the keys gives the same order as
    # sorting the items by key.
    return ', '.join(
        f'{kind[:-4]}: {tally[kind]}' for kind in sorted(tally)
    )
def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:
self.enter_step('get_domain_pack')
@ -486,7 +501,6 @@ class Database(Profiler):
dic = self.ip4tree
for i in range(31, 31-ip4.prefixlen, -1):
bit = (ip4.value >> i) & 0b1
# TODO PERF copy value and slide once every loop
if dic.active():
self.enter_step('get_ip4_yield')
yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i)

13
export.py

@ -32,15 +32,14 @@ if __name__ == '__main__':
DB = database.Database()
if args.rules:
if args.count:
print(DB.count_rules(first_party_only=args.first_party))
else:
if args.count:
print(DB.count_records(
first_party_only=args.first_party,
rules_only=args.rules))
else:
if args.rules:
for line in DB.list_rules():
print(line)
else:
if args.count:
raise NotImplementedError
for domain in DB.export(
first_party_only=args.first_party,
end_chain_only=args.end_chain,

22
filter_subdomains.sh → export_lists.sh

@ -4,21 +4,25 @@ function log() {
echo -e "\033[33m$@\033[0m"
}
log "Pruning old data…"
./database.py --prune
log "Recounting references…"
./database.py --references
log "Exporting lists…"
./export.py --first-party --output dist/firstparty-trackers.txt
./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt
./export.py --output dist/multiparty-trackers.txt
./export.py --end-chain --output dist/multiparty-only-trackers.txt
log "Generating hosts lists…"
log "Generating statistics…"
./export.py --count --first-party > temp/count_recs_firstparty.txt
./export.py --count > temp/count_recs_multiparty.txt
./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
./export.py --rules --count > temp/count_rules_multiparty.txt
log "Sorting lists…"
sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt
sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt
sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt
sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt
log "Generating hosts lists…"
function generate_hosts {
basename="$1"
description="$2"
@ -46,13 +50,15 @@ function generate_hosts {
echo "# Generation software: eulaurarien $(git describe --tags)"
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of source DNS records: ~2M + $(wc -l temp/all_resolved.json | cut -d' ' -f1)"
echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO
echo "#"
echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)"
echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
echo "#"
echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)"
echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
echo

16
feed_asn.py

@ -21,6 +21,15 @@ def get_ranges(asn: str) -> typing.Iterable[str]:
yield pref['prefix']
def get_name(asn: str) -> str:
    """Return the holder name RIPEstat reports for an autonomous system.

    Sends a GET request to the RIPEstat ``as-overview`` data call with
    ``asn`` as the ``resource`` parameter and returns the ``holder``
    field found under ``data`` in the JSON response.

    :param asn: Identifier of the autonomous system, passed through
        unchanged as the ``resource`` query parameter (exact accepted
        formats are defined by the RIPEstat API — TODO confirm).
    :return: The value of ``data.holder`` from the response.
    :raises requests.RequestException: On connection problems, on
        timeout, or when the server answers with an HTTP error status.
    :raises KeyError: When the response JSON lacks ``data`` or
        ``holder``.
    """
    req = requests.get(
        'https://stat.ripe.net/data/as-overview/data.json',
        params={'resource': asn},
        # requests never times out by default; without this a single
        # stalled connection would hang the caller indefinitely.
        timeout=30,
    )
    # Surface HTTP errors here rather than as a confusing JSON/KeyError
    # further down.
    req.raise_for_status()
    data = req.json()
    return data['data']['holder']
if __name__ == '__main__':
log = logging.getLogger('feed_asn')
@ -34,9 +43,12 @@ if __name__ == '__main__':
def add_ranges(path: database.Path,
match: database.Match,
_: typing.Any) -> None:
) -> None:
assert isinstance(path, database.AsnPath)
assert isinstance(match, database.AsnNode)
asn_str = database.Database.unpack_asn(path)
DB.enter_step('asn_get_name')
match.name = get_name(asn_str)
DB.enter_step('asn_get_ranges')
for prefix in get_ranges(asn_str):
parsed_prefix: IPNetwork = ipaddress.ip_network(prefix)
@ -52,7 +64,7 @@ if __name__ == '__main__':
else:
log.error('Unknown prefix version: %s', prefix)
for _ in DB.exec_each_asn(add_ranges, None):
for _ in DB.exec_each_asn(add_ranges):
pass
DB.save()
Loading…
Cancel
Save