diff --git a/database.py b/database.py index 3fc93c5..bff6638 100644 --- a/database.py +++ b/database.py @@ -92,7 +92,9 @@ class Match(): class AsnNode(Match): - pass + def __init__(self) -> None: + Match.__init__(self) + self.name = '' class DomainTreeNode(): @@ -111,8 +113,7 @@ class IpTreeNode(Match): Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode] MatchCallable = typing.Callable[[Path, - Match, - typing.Optional[typing.Any]], + Match], typing.Any] @@ -284,7 +285,6 @@ class Database(Profiler): def exec_each_asn(self, callback: MatchCallable, - arg: typing.Any = None, ) -> typing.Any: for asn in self.asns: match = self.asns[asn] @@ -292,7 +292,6 @@ class Database(Profiler): c = callback( AsnPath(asn), match, - arg ) try: yield from c @@ -301,7 +300,6 @@ class Database(Profiler): def exec_each_domain(self, callback: MatchCallable, - arg: typing.Any = None, _dic: DomainTreeNode = None, _par: DomainPath = None, ) -> typing.Any: @@ -311,7 +309,6 @@ class Database(Profiler): c = callback( HostnamePath(_par.parts), _dic.match_hostname, - arg ) try: yield from c @@ -321,7 +318,6 @@ class Database(Profiler): c = callback( ZonePath(_par.parts), _dic.match_zone, - arg ) try: yield from c @@ -331,14 +327,12 @@ class Database(Profiler): dic = _dic.children[part] yield from self.exec_each_domain( callback, - arg, _dic=dic, _par=DomainPath(_par.parts + [part]) ) def exec_each_ip4(self, callback: MatchCallable, - arg: typing.Any = None, _dic: IpTreeNode = None, _par: Ip4Path = None, ) -> typing.Any: @@ -348,7 +342,6 @@ class Database(Profiler): c = callback( _par, _dic, - arg ) try: yield from c @@ -363,7 +356,6 @@ class Database(Profiler): assert addr0 == _par.value yield from self.exec_each_ip4( callback, - arg, _dic=dic, _par=Ip4Path(addr0, pref) ) @@ -373,14 +365,12 @@ class Database(Profiler): addr1 = _par.value | (1 << (32-pref)) yield from self.exec_each_ip4( callback, - arg, _dic=dic, _par=Ip4Path(addr1, pref) ) def exec_each(self, callback: MatchCallable, - arg: typing.Any = None, ) -> typing.Any: yield from self.exec_each_domain(callback) yield from self.exec_each_ip4(callback) @@ -390,19 +380,19 @@ class Database(Profiler): # Should be correctly calculated normally, # keeping this just in case def reset_references_cb(path: Path, - match: Match, _: typing.Any + match: Match ) -> None: match.references = 0 - for _ in self.exec_each(reset_references_cb, None): + for _ in self.exec_each(reset_references_cb): pass def increment_references_cb(path: Path, - match: Match, _: typing.Any + match: Match ) -> None: if match.source: source = self.get_match(match.source) source.references += 1 - for _ in self.exec_each(increment_references_cb, None): + for _ in self.exec_each(increment_references_cb): pass def prune(self, before: int, base_only: bool = False) -> None: @@ -410,7 +400,10 @@ class Database(Profiler): def explain(self, path: Path) -> str: match = self.get_match(path) - string = f'{path} #{match.references}' + if isinstance(match, AsnNode): + string = f'{path} ({match.name}) #{match.references}' + else: + string = f'{path} #{match.references}' if match.source: string += f' ← {self.explain(match.source)}' return string @@ -421,7 +414,7 @@ class Database(Profiler): explain: bool = False, ) -> typing.Iterable[str]: - def export_cb(path: Path, match: Match, _: typing.Any + def export_cb(path: Path, match: Match ) -> typing.Iterable[str]: assert isinstance(path, DomainPath) if not isinstance(path, HostnamePath): @@ -435,27 +428,49 @@ class Database(Profiler): else: yield self.unpack_domain(path) - yield from self.exec_each_domain(export_cb, None) + yield from self.exec_each_domain(export_cb) def list_rules(self, first_party_only: bool = False, ) -> typing.Iterable[str]: - def list_rules_cb(path: Path, match: Match, _: typing.Any + def list_rules_cb(path: Path, match: Match ) -> typing.Iterable[str]: if first_party_only and not match.first_party: return if isinstance(path, ZonePath) \ or (isinstance(path, Ip4Path) and path.prefixlen < 32): - # if match.level == 0: + # if match.level == 1: + # It should be the latter condition but it is more + # useful when using the former yield self.explain(path) - yield from self.exec_each(list_rules_cb, None) + yield from self.exec_each(list_rules_cb) - def count_rules(self, + def count_records(self, first_party_only: bool = False, + rules_only: bool = False, ) -> str: - raise NotImplementedError + memo: typing.Dict[str, int] = dict() + + def count_records_cb(path: Path, match: Match) -> None: + if first_party_only and not match.first_party: + return + # if isinstance(path, ZonePath) \ + # or (isinstance(path, Ip4Path) and path.prefixlen < 32): + if rules_only and match.level > 1: + return + try: + memo[path.__class__.__name__] += 1 + except KeyError: + memo[path.__class__.__name__] = 1 + + for _ in self.exec_each(count_records_cb): + pass + split: typing.List[str] = list() + for key, value in sorted(memo.items(), key=lambda s: s[0]): + split.append(f'{key[:-4]}: {value}') + return ', '.join(split) def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]: self.enter_step('get_domain_pack') @@ -486,7 +501,6 @@ class Database(Profiler): dic = self.ip4tree for i in range(31, 31-ip4.prefixlen, -1): bit = (ip4.value >> i) & 0b1 - # TODO PERF copy value and slide once every loop if dic.active(): self.enter_step('get_ip4_yield') yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i) diff --git a/export.py b/export.py index 0df4229..91f7193 100755 --- a/export.py +++ b/export.py @@ -32,15 +32,14 @@ if __name__ == '__main__': DB = database.Database() - if args.rules: - if args.count: - print(DB.count_rules(first_party_only=args.first_party)) - else: + if args.count: + print(DB.count_records( + first_party_only=args.first_party, + rules_only=args.rules)) + else: + if args.rules: for line in DB.list_rules(): print(line) - else: - if args.count: - raise NotImplementedError for domain in DB.export( first_party_only=args.first_party, end_chain_only=args.end_chain, diff --git a/filter_subdomains.sh b/export_lists.sh similarity index 81% rename from filter_subdomains.sh rename to export_lists.sh index d4b90ae..20a34cb 100755 --- a/filter_subdomains.sh +++ b/export_lists.sh @@ -4,21 +4,25 @@ function log() { echo -e "\033[33m$@\033[0m" } -log "Pruning old data…" -./database.py --prune - -log "Recounting references…" -./database.py --references - log "Exporting lists…" ./export.py --first-party --output dist/firstparty-trackers.txt ./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt ./export.py --output dist/multiparty-trackers.txt ./export.py --end-chain --output dist/multiparty-only-trackers.txt -log "Generating hosts lists…" +log "Generating statistics…" +./export.py --count --first-party > temp/count_recs_firstparty.txt +./export.py --count > temp/count_recs_multiparty.txt ./export.py --rules --count --first-party > temp/count_rules_firstparty.txt ./export.py --rules --count > temp/count_rules_multiparty.txt + +log "Sorting lists…" +sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt +sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt +sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt +sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt + +log "Generating hosts lists…" function generate_hosts { basename="$1" description="$2" @@ -46,13 +50,15 @@ function generate_hosts { echo "# Generation software: eulaurarien $(git describe --tags)" echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" - echo "# Number of source DNS records: ~2M + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" + echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO echo "#" echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)" + echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)" echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" echo "#" echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)" + echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)" echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" echo diff --git a/feed_asn.py b/feed_asn.py index fbdefcd..aa28dfe 100755 --- a/feed_asn.py +++ b/feed_asn.py @@ -21,6 +21,15 @@ def get_ranges(asn: str) -> typing.Iterable[str]: yield pref['prefix'] +def get_name(asn: str) -> str: + req = requests.get( + 'https://stat.ripe.net/data/as-overview/data.json', + params={'resource': asn} + ) + data = req.json() + return data['data']['holder'] + + if __name__ == '__main__': log = logging.getLogger('feed_asn') @@ -34,9 +43,12 @@ if __name__ == '__main__': def add_ranges(path: database.Path, match: database.Match, - _: typing.Any) -> None: + ) -> None: assert isinstance(path, database.AsnPath) + assert isinstance(match, database.AsnNode) asn_str = database.Database.unpack_asn(path) + DB.enter_step('asn_get_name') + match.name = get_name(asn_str) DB.enter_step('asn_get_ranges') for prefix in get_ranges(asn_str): parsed_prefix: IPNetwork = ipaddress.ip_network(prefix) @@ -52,7 +64,7 @@ if __name__ == '__main__': else: log.error('Unknown prefix version: %s', prefix) - for _ in DB.exec_each_asn(add_ranges, None): + for _ in DB.exec_each_asn(add_ranges): pass DB.save()