diff --git a/database.py b/database.py
index 3327438..1e8c4da 100755
--- a/database.py
+++ b/database.py
@@ -23,7 +23,7 @@ DbValue = typing.Union[None, int, float, str, bytes]
 
 
 class Database():
-    VERSION = 4
+    VERSION = 5
     PATH = "blocking.db"
 
     def open(self) -> None:
@@ -250,6 +250,33 @@ class Database():
             else:
                 yield val
 
+    def count_rules(self,
+                    first_party_only: bool = False,
+                    ) -> str:
+        """
+        Summarize how many level-0 rules each rule table holds.
+
+        Tables without a matching rule are left out; first_party_only
+        restricts the count to rules flagged first_party.
+        Returns a string such as 'zone: 12, hostname: 3'.
+        """
+        counts: typing.List[str] = []
+        cursor = self.conn.cursor()
+        # Table names come from this fixed list, never from user input,
+        # so interpolating them into the statement is safe.
+        for table in ['asn', 'ip4network', 'ip4address', 'zone', 'hostname']:
+            command = 'SELECT count(*) FROM rules ' \
+                f'INNER JOIN {table} ON rules.id = {table}.entry ' \
+                'WHERE rules.level = 0'
+            if first_party_only:
+                command += ' AND first_party=1'
+            cursor.execute(command)
+            count, = cursor.fetchone()
+            if count > 0:
+                counts.append(f'{table}: {count}')
+
+        return ', '.join(counts)
+
     def get_domain(self, domain: str) -> typing.Iterable[int]:
         self.enter_step('get_domain_prepare')
         domain_prep = self.pack_hostname(domain)
diff --git a/database_schema.sql b/database_schema.sql
index a61f7f2..3116a09 100644
--- a/database_schema.sql
+++ b/database_schema.sql
@@ -11,6 +11,8 @@ CREATE TABLE rules (
     FOREIGN KEY (source) REFERENCES rules(id) ON DELETE CASCADE
 );
 CREATE INDEX rules_source ON rules (source); -- for references recounting
+CREATE INDEX rules_updated ON rules (updated); -- for pruning
+CREATE INDEX rules_level_firstparty ON rules (level, first_party); -- for counting rules
 
 CREATE TABLE asn (
     val INTEGER PRIMARY KEY,
diff --git a/export.py b/export.py
index 49051c9..886582c 100755
--- a/export.py
+++ b/export.py
@@ -22,15 +22,30 @@ if __name__ == '__main__':
     parser.add_argument(
         '-x', '--explain', action='store_true',
         help="TODO")
+    parser.add_argument(
+        '-r', '--rules', action='store_true',
+        help="work on rules instead of hostnames (requires --count)")
+    parser.add_argument(
+        '-c', '--count', action='store_true',
+        help="print per-table counts instead of a list (requires --rules)")
     args = parser.parse_args()
 
     DB = database.Database()
 
-    for domain in DB.export(
-            first_party_only=args.first_party,
-            end_chain_only=args.end_chain,
-            explain=args.explain,
-    ):
-        print(domain, file=args.output)
+    if args.rules:
+        if not args.count:
+            raise NotImplementedError
+        # Honour --output here as well, like the export branch below.
+        print(DB.count_rules(first_party_only=args.first_party),
+              file=args.output)
+    else:
+        if args.count:
+            raise NotImplementedError
+        for domain in DB.export(
+                first_party_only=args.first_party,
+                end_chain_only=args.end_chain,
+                explain=args.explain,
+        ):
+            print(domain, file=args.output)
 
     DB.close()
diff --git a/filter_subdomains.sh b/filter_subdomains.sh
index 516efae..d4b90ae 100755
--- a/filter_subdomains.sh
+++ b/filter_subdomains.sh
@@ -4,6 +4,9 @@ function log() {
     echo -e "\033[33m$@\033[0m"
 }
 
+log "Pruning old data…"
+./database.py --prune
+
 log "Recounting references…"
 ./database.py --references
 
@@ -14,6 +17,8 @@ log "Exporting lists…"
 ./export.py --end-chain --output dist/multiparty-only-trackers.txt
 
 log "Generating hosts lists…"
+./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
+./export.py --rules --count > temp/count_rules_multiparty.txt
 function generate_hosts {
     basename="$1"
     description="$2"
@@ -39,15 +44,16 @@ function generate_hosts {
     echo "#"
     echo "# Generation date: $(date -Isec)"
     echo "# Generation software: eulaurarien $(git describe --tags)"
-    echo "# Number of source websites: TODO"
-    echo "# Number of source subdomains: TODO"
+    echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
+    echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
+    echo "# Number of source DNS records: ~2M + $(wc -l temp/all_resolved.json | cut -d' ' -f1)"
     echo "#"
-    echo "# Number of known first-party trackers: TODO"
-    echo "# Number of first-party subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
+    echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
+    echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
     echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
     echo "#"
-    echo "# Number of known multi-party trackers: TODO"
-    echo "# Number of multi-party subdomains: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
+    echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
+    echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
     echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
     echo
     sed 's|^|0.0.0.0 |' "dist/$basename.txt"
diff --git a/new_workflow.sh b/new_workflow.sh
index bc2a78b..c98cd46 100755
--- a/new_workflow.sh
+++ b/new_workflow.sh
@@ -4,9 +4,7 @@ function log() {
     echo -e "\033[33m$@\033[0m"
 }
 
-log "Preparing database…"
-./database.py --expire
-
+./fetch_resources.sh
 ./import_rules.sh
 
 # TODO Fetch 'em