diff --git a/.gitignore b/.gitignore
index e38bcd9..188051c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 *.log
+*.db
+*.db-journal
 nameservers
 nameservers.head
diff --git a/database.py b/database.py
new file mode 100755
index 0000000..370d25b
--- /dev/null
+++ b/database.py
@@ -0,0 +1,260 @@
+#!/usr/bin/env python3
+
+import sqlite3
+import os
+import argparse
+import typing
+import ipaddress
+import enum
+import time
+import pprint
+
+"""
+Utility functions to interact with the database.
+"""
+
+VERSION = 1
+PATH = f"blocking.db"
+CONN = None
+C = None  # Cursor
+TIME_DICT: typing.Dict[str, float] = dict()
+TIME_LAST = time.perf_counter()
+TIME_STEP = 'start'
+
+
+def time_step(step: str) -> None:
+    global TIME_LAST
+    global TIME_STEP
+    now = time.perf_counter()
+    TIME_DICT.setdefault(TIME_STEP, 0.0)
+    TIME_DICT[TIME_STEP] += now - TIME_LAST
+    TIME_STEP = step
+    TIME_LAST = time.perf_counter()
+
+
+def time_print() -> None:
+    time_step('postprint')
+    total = sum(TIME_DICT.values())
+    for key, secs in sorted(TIME_DICT.items(), key=lambda t: t[1]):
+        print(f"{key:<20}: {secs/total:7.2%} = {secs:.6f} s")
+    print(f"{'total':<20}: {1:7.2%} = {total:.6f} s")
+
+
+class RowType(enum.Enum):
+    AS = 1
+    DomainTree = 2
+    Domain = 3
+    IPv4Network = 4
+    IPv6Network = 6
+
+
+def open_db() -> None:
+    time_step('open_db')
+    global CONN
+    global C
+    CONN = sqlite3.connect(PATH)
+    C = CONN.cursor()
+    # C.execute("PRAGMA foreign_keys = ON");
+    initialized = False
+    try:
+        C.execute("SELECT value FROM meta WHERE key='version'")
+        version_ex = C.fetchone()
+        if version_ex:
+            if version_ex[0] == VERSION:
+                initialized = True
+            else:
+                print(f"Database version {version_ex[0]} found,"
+                      "it will be deleted.")
+    except sqlite3.OperationalError:
+        pass
+    if not initialized:
+        time_step('init_db')
+        print(f"Creating database version {VERSION}.")
+        CONN.close()
+        os.unlink(PATH)
+        CONN = sqlite3.connect(PATH)
+        C = CONN.cursor()
+        with open("database_schema.sql", 'r') as db_schema:
+            C.executescript(db_schema.read())
+        C.execute("INSERT INTO meta VALUES ('version', ?)", (VERSION,))
+        CONN.commit()
+    time_step('other')
+
+
+def close_db() -> None:
+    assert CONN
+    time_step('close_db_commit')
+    CONN.commit()
+    time_step('close_db')
+    CONN.close()
+    time_step('other')
+    time_print()
+
+
+def refresh() -> None:
+    assert C
+    C.execute('UPDATE blocking SET updated = 0')
+    # TODO PERF Use a meta value instead
+
+
+RULE_SUBDOMAIN_COMMAND = \
+    'INSERT INTO blocking (key, type, updated, firstparty) ' \
+    f'VALUES (?, {RowType.DomainTree.value}, 1, ?) ' \
+    'ON CONFLICT(key)' \
+    f'DO UPDATE SET source=null, type={RowType.DomainTree.value}, ' \
+    'updated=1, firstparty=?'
+
+
+def feed_rule_subdomains(subdomain: str, first_party: bool = False) -> None:
+    assert C
+    subdomain = subdomain[::-1]
+    C.execute(RULE_SUBDOMAIN_COMMAND,
+              (subdomain, int(first_party), int(first_party)))
+    # Since regex type takes precedence over domain type,
+    # and firstparty takes precedence over multiparty,
+    # we can afford to replace the whole row without checking
+    # the row without checking previous values and making sure
+    # firstparty subdomains are updated last
+
+
+def ip_get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]:
+    for char in address.packed:
+        for i in range(7, -1, -1):
+            yield (char >> i) & 0b1
+
+
+def ip_flat(address: ipaddress.IPv4Address) -> str:
+    return ''.join(map(str, ip_get_bits(address)))
+
+
+def ip4_flat(address: str) -> str:
+    return '{:08b}{:08b}{:08b}{:08b}'.format(
+        *[int(c) for c in address.split('.')])
+
+
+RULE_IP4NETWORK_COMMAND = \
+    'INSERT INTO blocking (key, type, updated, firstparty) ' \
+    f'VALUES (?, {RowType.IPv4Network.value}, 1, ?) ' \
+    'ON CONFLICT(key)' \
+    f'DO UPDATE SET source=null, type={RowType.IPv4Network.value}, ' \
+    'updated=1, firstparty=?'
+
+
+def feed_rule_ip4network(network: ipaddress.IPv4Network,
+                         first_party: bool = False) -> None:
+    assert C
+    flat = ip_flat(network.network_address)[:network.prefixlen]
+    C.execute(RULE_IP4NETWORK_COMMAND,
+              (flat, int(first_party), int(first_party)))
+
+
+FEED_A_COMMAND_FETCH = \
+    'SELECT key, firstparty FROM blocking ' \
+    'WHERE key<=? ' \
+    'AND updated=1 ' \
+    f'AND type={RowType.IPv4Network.value} ' \
+    'ORDER BY key DESC ' \
+    'LIMIT 1'
+
+FEED_A_COMMAND_UPSERT = \
+    'INSERT INTO blocking (key, source, type, updated, firstparty) ' \
+    f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \
+    'ON CONFLICT(key)' \
+    f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \
+    'updated=1, firstparty=? ' \
+    'WHERE updated=0 OR firstparty<?'
+
+
+def feed_a(name: str, value_ip: str) -> None:
+    assert C
+    assert CONN
+    time_step('a_flat')
+    try:
+        value = ip4_flat(value_ip)
+    except (ValueError, IndexError):
+        # Malformed IPs
+        return
+    time_step('a_fetch')
+    C.execute(FEED_A_COMMAND_FETCH, (value,))
+    base = C.fetchone()
+    time_step('a_fetch_confirm')
+    if not base:
+        return
+    b_key, b_firstparty = base
+    if not value.startswith(b_key):
+        return
+    name = name[::-1]
+    time_step('a_upsert')
+    C.execute(FEED_A_COMMAND_UPSERT,
+              (name, b_key, b_firstparty,  # Insert
+               b_key, b_firstparty, b_firstparty)  # Update
+              )
+    time_step('other')
+
+
+FEED_CNAME_COMMAND_FETCH = \
+    'SELECT key, type, firstparty FROM blocking ' \
+    'WHERE key<=? ' \
+    f'AND (type={RowType.DomainTree.value} OR type={RowType.Domain.value}) ' \
+    'AND updated=1 ' \
+    'ORDER BY key DESC ' \
+    'LIMIT 1'
+# f'WHERE ((type={RowType.DomainTree.value} AND key<=?) OR ' \
+# f'(type={RowType.Domain.value} AND key=?)) ' \
+# This optimisation is counter productive
+
+FEED_CNAME_COMMAND_UPSERT = \
+    'INSERT INTO blocking (key, source, type, updated, firstparty) ' \
+    f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \
+    'ON CONFLICT(key)' \
+    f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \
+    'updated=1, firstparty=? ' \
+    'WHERE updated=0 OR firstparty<?'
+
+
+def feed_cname(name: str, value: str) -> None:
+    assert C
+    assert CONN
+    value = value[::-1]
+    time_step('cname_fetch')
+    C.execute(FEED_CNAME_COMMAND_FETCH, (value,))
+    base = C.fetchone()
+    time_step('cname_fetch_confirm')
+    if not base:
+        # Should only happen at an extremum of the database
+        return
+    b_key, b_type, b_firstparty = base
+    matching = b_key == value[:len(b_key)] and (
+        len(value) == len(b_key)
+        or (
+            b_type == RowType.DomainTree.value
+            and value[len(b_key)] == '.'
+        )
+    )
+    if not matching:
+        return
+    name = name[::-1]
+    time_step('cname_upsert')
+    C.execute(FEED_CNAME_COMMAND_UPSERT,
+              (name, b_key, b_firstparty,  # Insert
+               b_key, b_firstparty, b_firstparty)  # Update
+              )
+    time_step('other')
+
+
+if __name__ == '__main__':
+
+    # Parsing arguments
+    parser = argparse.ArgumentParser(
+        description="Database operations")
+    parser.add_argument(
+        '-r', '--refresh', action='store_true',
+        help="Set the whole database as an old source")
+    args = parser.parse_args()
+
+    open_db()
+
+    if args.refresh:
+        refresh()
+
+    close_db()
diff --git a/database_schema.sql b/database_schema.sql
new file mode 100644
index 0000000..5e9618b
--- /dev/null
+++ b/database_schema.sql
@@ -0,0 +1,22 @@
+-- Remember to increment DB_VERSION
+-- in database.py on changes to this file
+
+CREATE TABLE blocking (
+    key text PRIMARY KEY, -- Contains the reversed domain name or IP in binary form
+    source TEXT, -- The rule this one is based on
+    type INTEGER, -- Type of the field: 1: AS, 2: domain tree, 3: domain, 4: IPv4 network, 6: IPv6 network
+    updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes)
+    firstparty INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party)
+    FOREIGN KEY (source) REFERENCES blocking(key) ON DELETE CASCADE
+);
+CREATE INDEX "blocking_type_updated_key" ON "blocking" (
+    "type",
+    "updated",
+    "key" DESC
+);
+
+-- Store various things
+CREATE TABLE meta (
+    key text PRIMARY KEY,
+    value integer
+);
diff --git a/feed_dns.py b/feed_dns.py
new file mode 100755
index 0000000..47ea5d8
--- /dev/null
+++ b/feed_dns.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+import database
+import argparse
+import sys
+
+FUNCTION_MAP = {
+    'a': database.feed_a,
+    'cname': database.feed_cname,
+}
+
+if __name__ == '__main__':
+
+    # Parsing arguments
+    parser = argparse.ArgumentParser(
+        description="TODO")
+    parser.add_argument(
+        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
+        help="TODO")
+    args = parser.parse_args()
+
+    database.open_db()
+
+    try:
+        database.time_step('iowait')
+        for line in args.input:
+            database.time_step('feed_json_parse')
+            split = line.split('"')
+            name = split[7]
+            dtype = split[11]
+            value = split[15]
+            # data = json.loads(line)
+            # assert dtype == data['type']
+            # assert name == data['name']
+            # assert value == data['value']
+            database.time_step('feed_switch')
+            FUNCTION_MAP[dtype](name, value)
+            database.time_step('iowait')
+    except KeyboardInterrupt:
+        print("Interupted.")
+        pass
+
+    database.close_db()
diff --git a/feed_rules.py b/feed_rules.py
new file mode 100755
index 0000000..d32b360
--- /dev/null
+++ b/feed_rules.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+import database
+import argparse
+import sys
+import ipaddress
+
+
+if __name__ == '__main__':
+
+    # Parsing arguments
+    parser = argparse.ArgumentParser(
+        description="TODO")
+    parser.add_argument(
+        'type',
+        choices={'subdomains', 'ip4network'},
+        help="Type of rule inputed")
+    parser.add_argument(
+        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
+        help="List of domains domains to block (with their subdomains)")
+    parser.add_argument(
+        '-f', '--first-party', action='store_true',
+        help="The input only comes from verified first-party sources")
+    args = parser.parse_args()
+
+    database.open_db()
+
+    if args.type == 'subdomains':
+        for rule in args.input:
+            database.feed_rule_subdomains(
+                rule.strip(), first_party=args.first_party)
+    elif args.type == 'ip4network':
+        for rule in args.input:
+            network = ipaddress.ip_network(rule.strip())
+            database.feed_rule_ip4network(
+                network, first_party=args.first_party)
+    else:
+        assert False
+
+    database.close_db()
diff --git a/new_workflow.sh b/new_workflow.sh
new file mode 100755
index 0000000..23ae589
--- /dev/null
+++ b/new_workflow.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+function log() {
+    echo -e "\033[33m$@\033[0m"
+}
+
+log "Preparing database…"
+./database.py --refresh
+
+log "Compiling rules…"
+cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py subdomains
+cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py subdomains
+cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains
+cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
+# NOTE: Ensure first-party sources are last
+cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains --first-party
+cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party
+
+# log "Reading A records…"
+# pv a.json.gz | gunzip | ./feed_dns.py
+# log "Reading CNAME records…"
+# pv cname.json.gz | gunzip | ./feed_dns.py