From 57416b6e2c31e39d7a4e0f4de8d043449889f371 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 13 Dec 2019 00:11:21 +0100 Subject: [PATCH] Workflow: POO and individual tables per types Mostly for performances reasons. First one to implement threading later. Second one to speed up the dichotomy, but it doesn't seem that much better so far. --- Makefile | 5 - accel.c | 37 --- database.py | 598 +++++++++++++++++++++++++++---------------- database_schema.sql | 50 +++- export.py | 30 +++ feed_dns.py | 52 ++-- feed_rules.py | 26 +- filter_subdomains.sh | 45 +--- import_rules.sh | 14 + new_workflow.sh | 28 +- 10 files changed, 525 insertions(+), 360 deletions(-) delete mode 100644 Makefile delete mode 100644 accel.c create mode 100755 export.py create mode 100755 import_rules.sh diff --git a/Makefile b/Makefile deleted file mode 100644 index fb06f61..0000000 --- a/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -libaccel.so: accel.o - clang -shared -Wl,-soname,libaccel.so -o libaccel.so accel.o - -accel.o: accel.c - clang -c -fPIC -O3 accel.c -o accel.o diff --git a/accel.c b/accel.c deleted file mode 100644 index bda0072..0000000 --- a/accel.c +++ /dev/null @@ -1,37 +0,0 @@ -#include - -int ip4_flat(char* value, wchar_t* flat) -{ - unsigned char value_index = 0; - unsigned char octet_index = 0; - unsigned char octet_value = 0; - char flat_index; - unsigned char value_chara; - do { - value_chara = value[value_index]; - if (value_chara >= '0' && value_chara <= '9') { - octet_value *= 10; - octet_value += value_chara - '0'; - } else if (value_chara == '.') { - for (flat_index = (octet_index+1)*8-1; flat_index >= octet_index*8; flat_index--) { - flat[flat_index] = '0' + (octet_value & 1); - octet_value >>= 1; - } - octet_index++; - octet_value = 0; - } else if (value_chara == '\0') { - if (octet_index != 3) { - return 1; - } - for (flat_index = 31; flat_index >= 24; flat_index--) { - flat[flat_index] = '0' + (octet_value & 1); - octet_value >>= 1; - } - return 0; - } else { - return 1; - } - value_index++; - } while (1); // This ugly thing save one comparison - return 1; -} diff --git a/database.py b/database.py index 4fb5463..4daf0ec 100755 --- a/database.py +++ b/database.py @@ -1,256 +1,385 @@ #!/usr/bin/env python3 -import sqlite3 -import os -import argparse -import typing -import ipaddress -import enum -import time -import ctypes - """ Utility functions to interact with the database. """ -# TODO Rule level and source priority +import sqlite3 +import typing +import time +import os +import logging +import argparse +import coloredlogs +import ipaddress +import ctypes -VERSION = 2 -PATH = f"blocking.db" -CONN = None -C = None # Cursor -TIME_DICT: typing.Dict[str, float] = dict() -TIME_LAST = time.perf_counter() -TIME_STEP = 'start' -ACCEL = ctypes.cdll.LoadLibrary('./libaccel.so') -ACCEL_IP4_BUF = ctypes.create_unicode_buffer('Z'*32, 32) +coloredlogs.install( + level='DEBUG', + fmt='%(asctime)s %(name)s %(levelname)s %(message)s' +) + +DbValue = typing.Union[None, int, float, str, bytes] -def time_step(step: str) -> None: - global TIME_LAST - global TIME_STEP - now = time.perf_counter() - TIME_DICT.setdefault(TIME_STEP, 0.0) - TIME_DICT[TIME_STEP] += now - TIME_LAST - TIME_STEP = step - TIME_LAST = time.perf_counter() +class Database(): + VERSION = 3 + PATH = "blocking.db" + def open(self) -> None: + self.conn = sqlite3.connect(self.PATH) + self.cursor = self.conn.cursor() + self.execute("PRAGMA foreign_keys = ON") + # self.conn.create_function("prepare_ip4address", 1, + # Database.prepare_ip4address, + # deterministic=True) -def time_print() -> None: - time_step('postprint') - total = sum(TIME_DICT.values()) - for key, secs in sorted(TIME_DICT.items(), key=lambda t: t[1]): - print(f"{key:<20}: {secs/total:7.2%} = {secs:.6f} s") - print(f"{'total':<20}: {1:7.2%} = {total:.6f} s") + def execute(self, cmd: str, args: typing.Union[ + typing.Tuple[DbValue, ...], + typing.Dict[str, DbValue]] = None) -> None: + self.cursor.execute(cmd, args or tuple()) - -class RowType(enum.Enum): - AS = 1 - DomainTree = 2 - Domain = 3 - IPv4Network = 4 - IPv6Network = 6 - - -def open_db() -> None: - time_step('open_db') - global CONN - global C - CONN = sqlite3.connect(PATH) - C = CONN.cursor() - # C.execute("PRAGMA foreign_keys = ON"); - initialized = False - try: - C.execute("SELECT value FROM meta WHERE key='version'") - version_ex = C.fetchone() - if version_ex: - if version_ex[0] == VERSION: - initialized = True - else: - print(f"Database version {version_ex[0]} found," - "it will be deleted.") - except sqlite3.OperationalError: - pass - if not initialized: - time_step('init_db') - print(f"Creating database version {VERSION}.") - CONN.close() - os.unlink(PATH) - CONN = sqlite3.connect(PATH) - C = CONN.cursor() - with open("database_schema.sql", 'r') as db_schema: - C.executescript(db_schema.read()) - C.execute("INSERT INTO meta VALUES ('version', ?)", (VERSION,)) - CONN.commit() - time_step('other') - - -def close_db() -> None: - assert CONN - time_step('close_db_commit') - CONN.commit() - time_step('close_db') - CONN.close() - time_step('other') - time_print() - - -def refresh() -> None: - assert C - C.execute('UPDATE blocking SET updated = 0') - # TODO PERF Use a meta value instead - - -RULE_SUBDOMAIN_COMMAND = \ - 'INSERT INTO blocking (key, type, updated, firstpart, level) ' \ - f'VALUES (?, {RowType.DomainTree.value}, 1, ?, 0) ' \ - 'ON CONFLICT(key)' \ - f'DO UPDATE SET source=null, type={RowType.DomainTree.value}, ' \ - 'updated=1, firstparty=?, level=0' - - -def feed_rule_subdomains(subdomain: str, first_party: bool = False) -> None: - assert C - subdomain = subdomain[::-1] - C.execute(RULE_SUBDOMAIN_COMMAND, - (subdomain, int(first_party), int(first_party))) - # Since regex type takes precedence over domain type, - # and firstparty takes precedence over multiparty, - # we can afford to replace the whole row without checking - # the row without checking previous values and making sure - # firstparty subdomains are updated last - - -def ip_get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]: - for char in address.packed: - for i in range(7, -1, -1): - yield (char >> i) & 0b1 - - -def ip_flat(address: ipaddress.IPv4Address) -> str: - return ''.join(map(str, ip_get_bits(address))) - - -def ip4_flat(address: bytes) -> typing.Optional[str]: - carg = ctypes.c_char_p(address) - ret = ACCEL.ip4_flat(carg, ACCEL_IP4_BUF) - if ret != 0: + def get_meta(self, key: str) -> typing.Optional[int]: + try: + self.execute("SELECT value FROM meta WHERE key=?", (key,)) + except sqlite3.OperationalError: + return None + for ver, in self.cursor: + return ver return None - return ACCEL_IP4_BUF.value + def set_meta(self, key: str, val: int) -> None: + self.execute("INSERT INTO meta VALUES (?, ?) " + "ON CONFLICT (key) DO " + "UPDATE set value=?", + (key, val, val)) -RULE_IP4NETWORK_COMMAND = \ - 'INSERT INTO blocking (key, type, updated, firstparty, level) ' \ - f'VALUES (?, {RowType.IPv4Network.value}, 1, ?, 0) ' \ - 'ON CONFLICT(key)' \ - f'DO UPDATE SET source=null, type={RowType.IPv4Network.value}, ' \ - 'updated=1, firstparty=?, level=0' + def close(self) -> None: + self.enter_step('close_commit') + self.conn.commit() + self.enter_step('close') + self.conn.close() + self.profile() + def initialize(self) -> None: + self.enter_step('initialize') + self.close() + os.unlink(self.PATH) + self.open() + self.log.info("Creating database version %d.", self.VERSION) + with open("database_schema.sql", 'r') as db_schema: + self.cursor.executescript(db_schema.read()) + self.set_meta('version', self.VERSION) + self.conn.commit() -def feed_rule_ip4network(network: ipaddress.IPv4Network, - first_party: bool = False) -> None: - assert C - flat = ip_flat(network.network_address)[:network.prefixlen] - C.execute(RULE_IP4NETWORK_COMMAND, - (flat, int(first_party), int(first_party))) + def __init__(self) -> None: + self.log = logging.getLogger('db') + self.time_last = time.perf_counter() + self.time_step = 'init' + self.time_dict: typing.Dict[str, float] = dict() + self.step_dict: typing.Dict[str, int] = dict() + self.accel_ip4_buf = ctypes.create_unicode_buffer('Z'*32, 32) + self.open() + version = self.get_meta('version') + if version != self.VERSION: + if version is not None: + self.log.warning( + "Outdated database version: %d found, will be rebuilt.", + version) + self.initialize() -FEED_A_COMMAND_FETCH = \ - 'SELECT key, firstparty FROM blocking ' \ - 'WHERE key<=? ' \ - 'AND instr(?, key) > 0 ' \ - f'AND type={RowType.IPv4Network.value} ' \ - 'ORDER BY key DESC ' + updated = self.get_meta('updated') + if updated is None: + self.execute('SELECT max(updated) FROM rules') + data = self.cursor.fetchone() + updated, = data + self.updated = updated or 1 -# UPSERT are not issued often relative to FETCH, -# merging the both might be counterproductive + def enter_step(self, name: str) -> None: + now = time.perf_counter() + try: + self.time_dict[self.time_step] += now - self.time_last + self.step_dict[self.time_step] += 1 + except KeyError: + self.time_dict[self.time_step] = now - self.time_last + self.step_dict[self.time_step] = 1 + self.time_step = name + self.time_last = time.perf_counter() -FEED_A_COMMAND_UPSERT = \ - 'INSERT INTO blocking (key, source, type, updated, firstparty) ' \ - f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \ - 'ON CONFLICT(key)' \ - f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \ - 'updated=1, firstparty=? ' \ - 'WHERE updated=0 OR firstparty None: + self.enter_step('profile') + total = sum(self.time_dict.values()) + for key, secs in sorted(self.time_dict.items(), key=lambda t: t[1]): + times = self.step_dict[key] + self.log.debug(f"{key:<20}: {times:9d} × {secs/times:5.3e} " + f"= {secs:9.2f} s ({secs/total:7.2%}) ") + self.log.debug(f"{'total':<20}: " + f"{total:9.2f} s ({1:7.2%})") + def prepare_hostname(self, hostname: str) -> str: + return hostname[::-1] + '.' -def feed_a(name: bytes, value_ip: bytes) -> None: - assert C - assert CONN - time_step('a_flat') - value_dec = ip4_flat(value_ip) - if value_dec is None: - # Malformed IPs - time_step('a_malformed') - return - time_step('a_fetch') - C.execute(FEED_A_COMMAND_FETCH, (value_dec, value_dec)) - base = C.fetchone() - time_step('a_fetch_confirm') - name = name[::-1] - for b_key, b_firstparty in C: - time_step('a_upsert') - C.execute(FEED_A_COMMAND_UPSERT, - (name, b_key, b_firstparty, # Insert - b_key, b_firstparty, b_firstparty) # Update - ) - time_step('a_fetch_confirm') - time_step('a_end') + def prepare_zone(self, zone: str) -> str: + return self.prepare_hostname(zone) + @staticmethod + def prepare_ip4address(address: str) -> int: + total = 0 + for i, octet in enumerate(address.split('.')): + total += int(octet) << (3-i)*8 + return total + # return '{:02x}{:02x}{:02x}{:02x}'.format( + # *[int(c) for c in address.split('.')]) + # return base64.b16encode(packed).decode() + # return '{:08b}{:08b}{:08b}{:08b}'.format( + # *[int(c) for c in address.split('.')]) + # carg = ctypes.c_wchar_p(address) + # ret = ACCEL.ip4_flat(carg, self.accel_ip4_buf) + # if ret != 0: + # raise ValueError + # return self.accel_ip4_buf.value + # packed = ipaddress.ip_address(address).packed + # return packed -FEED_CNAME_COMMAND_FETCH = \ - 'SELECT key, type, firstparty FROM blocking ' \ - 'WHERE key<=? ' \ - f'AND (type={RowType.DomainTree.value} OR type={RowType.Domain.value}) ' \ - 'ORDER BY key DESC ' \ - 'LIMIT 1' -# Optimisations that renders the index unused -# (and thus counterproductive until fixed): + def prepare_ip4network(self, network: str) -> typing.Tuple[int, int]: + # def prepare_ip4network(network: str) -> str: + net = ipaddress.ip_network(network) + mini = self.prepare_ip4address(net.network_address.exploded) + maxi = self.prepare_ip4address(net.broadcast_address.exploded) + # mini = net.network_address.packed + # maxi = net.broadcast_address.packed + return mini, maxi + # return Database.prepare_ip4address(net.network_address.exploded)[:net.prefixlen] -# 'AND instr(?, key) > 0 ' \ + def expire(self) -> None: + self.enter_step('expire') + self.updated += 1 + self.set_meta('updated', self.updated) -# f'WHERE ((type={RowType.DomainTree.value} AND key<=?) OR ' \ -# f'(type={RowType.Domain.value} AND key=?)) ' \ + def update_references(self) -> None: + self.enter_step('update_refs') + self.execute('UPDATE rules AS r SET refs=' + '(SELECT count(*) FROM rules ' + 'WHERE source=r.id)') -# Might be fixable by using multiple SELECT and a JOIN -# In the meantime the confirm is very light so it's ok + def prune(self) -> None: + self.enter_step('prune') + self.execute('DELETE FROM rules WHERE updated typing.Iterable[str]: + command = 'SELECT val FROM rules ' \ + 'INNER JOIN hostname ON rules.id = hostname.entry' + restrictions: typing.List[str] = list() + if first_party_only: + restrictions.append('rules.first_party = 1') + if end_chain_only: + restrictions.append('rules.refs = 0') + if restrictions: + command += ' WHERE ' + ' AND '.join(restrictions) + self.execute(command) + for val, in self.cursor: + yield val[:-1][::-1] - -def feed_cname(name: bytes, value: bytes) -> None: - assert C - assert CONN - time_step('cname_decode') - value = value[::-1] - value_dec = value.decode() - time_step('cname_fetch') - C.execute(FEED_CNAME_COMMAND_FETCH, (value_dec,)) - time_step('cname_fetch_confirm') - for b_key, b_type, b_firstparty in C: - matching = b_key == value_dec[:len(b_key)] and ( - len(value_dec) == len(b_key) - or ( - b_type == RowType.DomainTree.value - and value_dec[len(b_key)] == '.' - ) + def get_domain(self, domain: str) -> typing.Iterable[int]: + self.enter_step('get_domain_prepare') + domain_prep = self.prepare_hostname(domain) + self.enter_step('get_domain_select') + self.execute( + 'SELECT null, entry FROM hostname ' + 'WHERE val=:d ' + 'UNION ' + 'SELECT * FROM (' + 'SELECT val, entry FROM zone ' + 'WHERE val<=:d ' + 'ORDER BY val DESC LIMIT 1' + ')', + {'d': domain_prep} + ) + for val, entry in self.cursor: + self.enter_step('get_domain_confirm') + if not (val is None or domain_prep.startswith(val)): + continue + self.enter_step('get_domain_yield') + yield entry + + def get_ip4(self, address: str) -> typing.Iterable[int]: + self.enter_step('get_ip4_prepare') + try: + address_prep = self.prepare_ip4address(address) + except (ValueError, IndexError): + self.log.error("Invalid ip4address: %s", address) + return + self.enter_step('get_ip4_select') + self.execute( + 'SELECT entry FROM ip4address ' + # 'SELECT null, entry FROM ip4address ' + 'WHERE val=:a ' + 'UNION ' + # 'SELECT * FROM (' + # 'SELECT val, entry FROM ip4network ' + # 'WHERE val<=:a ' + # 'AND instr(:a, val) > 0 ' + # 'ORDER BY val DESC' + # ')' + 'SELECT entry FROM ip4network ' + 'WHERE :a BETWEEN mini AND maxi ', + {'a': address_prep} + ) + for val, entry in self.cursor: + # self.enter_step('get_ip4_confirm') + # if not (val is None or val.startswith(address_prep)): + # # PERF startswith but from the end + # continue + self.enter_step('get_ip4_yield') + yield entry + + def _set_generic(self, + table: str, + select_query: str, + insert_query: str, + prep: typing.Dict[str, DbValue], + is_first_party: bool = False, + source: int = None, + ) -> None: + # Since this isn't the bulk of the processing, + # here abstraction > performaces + + # Fields based on the source + if source is None: + first_party = int(is_first_party) + level = 0 + else: + self.enter_step(f'set_{table}_source') + self.execute( + 'SELECT first_party, level FROM rules ' + 'WHERE id=?', + (source,) + ) + first_party, level = self.cursor.fetchone() + level += 1 + + self.enter_step(f'set_{table}_select') + self.execute(select_query, prep) + + rules_prep = { + "source": source, + "updated": self.updated, + "first_party": first_party, + "level": level, + } + + # If the entry already exists + for entry, in self.cursor: # only one + self.enter_step(f'set_{table}_update') + rules_prep['entry'] = entry + self.execute( + 'UPDATE rules SET ' + 'source=:source, updated=:updated, ' + 'first_party=:first_party, level=:level ' + 'WHERE id=:entry AND (updated<:updated OR ' + 'first_party<:first_party OR level<:level)', + rules_prep + ) + # Only update if any of the following: + # - the entry is outdataed + # - the entry was not a first_party but this is + # - this is closer to the original rule + return + + # If it does not exist + + if source is not None: + self.enter_step(f'set_{table}_incsrc') + self.execute('UPDATE rules SET refs = refs + 1 WHERE id=?', + (source,)) + + self.enter_step(f'set_{table}_insert') + self.execute( + 'INSERT INTO rules ' + '(source, updated, first_party, refs, level) ' + 'VALUES (:source, :updated, :first_party, 0, :level) ', + rules_prep + ) + self.execute('SELECT id FROM rules WHERE rowid=?', + (self.cursor.lastrowid,)) + for entry, in self.cursor: # only one + prep['entry'] = entry + self.execute(insert_query, prep) + return + assert False + + def set_hostname(self, hostname: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_hostname_prepare') + prep: typing.Dict[str, DbValue] = { + 'val': self.prepare_hostname(hostname), + } + self._set_generic( + 'hostname', + 'SELECT entry FROM hostname WHERE val=:val', + 'INSERT INTO hostname (val, entry) ' + 'VALUES (:val, :entry)', + prep, + *args, **kwargs + ) + + def set_ip4address(self, ip4address: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_ip4add_prepare') + try: + ip4address_prep = self.prepare_ip4address(ip4address) + except (ValueError, IndexError): + self.log.error("Invalid ip4address: %s", ip4address) + return + prep: typing.Dict[str, DbValue] = { + 'val': ip4address_prep, + } + self._set_generic( + 'ip4add', + 'SELECT entry FROM ip4address WHERE val=:val', + 'INSERT INTO ip4address (val, entry) ' + 'VALUES (:val, :entry)', + prep, + *args, **kwargs + ) + + def set_zone(self, zone: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_zone_prepare') + prep: typing.Dict[str, DbValue] = { + 'val': self.prepare_zone(zone), + } + self._set_generic( + 'zone', + 'SELECT entry FROM zone WHERE val=:val', + 'INSERT INTO zone (val, entry) ' + 'VALUES (:val, :entry)', + prep, + *args, **kwargs + ) + + def set_ip4network(self, ip4network: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_ip4net_prepare') + try: + ip4network_prep = self.prepare_ip4network(ip4network) + except (ValueError, IndexError): + self.log.error("Invalid ip4network: %s", ip4network) + return + prep: typing.Dict[str, DbValue] = { + 'mini': ip4network_prep[0], + 'maxi': ip4network_prep[1], + } + self._set_generic( + 'ip4net', + 'SELECT entry FROM ip4network WHERE mini=:mini AND maxi=:maxi', + 'INSERT INTO ip4network (mini, maxi, entry) ' + 'VALUES (:mini, :maxi, :entry)', + prep, + *args, **kwargs ) - if not matching: - continue - name = name[::-1] - time_step('cname_upsert') - C.execute(FEED_CNAME_COMMAND_UPSERT, - (name, b_key, b_firstparty, # Insert - b_key, b_firstparty, b_firstparty) # Update - ) - time_step('cname_fetch_confirm') - time_step('cname_end') if __name__ == '__main__': @@ -259,13 +388,28 @@ if __name__ == '__main__': parser = argparse.ArgumentParser( description="Database operations") parser.add_argument( - '-r', '--refresh', action='store_true', + '-i', '--initialize', action='store_true', + help="Reconstruct the whole database") + parser.add_argument( + '-p', '--prune', action='store_true', + help="Remove old entries from database") + parser.add_argument( + '-e', '--expire', action='store_true', help="Set the whole database as an old source") + parser.add_argument( + '-r', '--references', action='store_true', + help="Update the reference count") args = parser.parse_args() - open_db() + DB = Database() - if args.refresh: - refresh() + if args.initialize: + DB.initialize() + if args.prune: + DB.prune() + if args.expire: + DB.expire() + if args.references and not args.prune: + DB.update_references() - close_db() + DB.close() diff --git a/database_schema.sql b/database_schema.sql index 833338d..9be81b0 100644 --- a/database_schema.sql +++ b/database_schema.sql @@ -1,21 +1,49 @@ -- Remember to increment DB_VERSION -- in database.py on changes to this file -CREATE TABLE blocking ( - key TEXT PRIMARY KEY, -- Contains the reversed domain name or IP in binary form - source TEXT, -- The rule this one is based on - type INTEGER, -- Type of the field: 1: AS, 2: domain tree, 3: domain, 4: IPv4 network, 6: IPv6 network +CREATE TABLE rules ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source INTEGER, -- The rule this one is based on updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes) - firstparty INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) - refs INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) (used for -only lists) - level INTEGER, -- Level of recursion to the original rule (used for source priority) - FOREIGN KEY (source) REFERENCES blocking(key) ON DELETE CASCADE + first_party INTEGER, -- 1: this blocks a first party for sure, 0: maybe + refs INTEGER, -- Number of entries issued from this one + level INTEGER, -- Level of recursion to the root source rule (used for source priority) + FOREIGN KEY (source) REFERENCES rules(id) ON DELETE CASCADE ); -CREATE INDEX "blocking_type_key" ON "blocking" ( - "type", - "key" DESC + +CREATE TABLE asn ( + val INTEGER PRIMARY KEY, + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE ); +CREATE TABLE hostname ( + val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for consistency with zone) + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE +); + +CREATE TABLE zone ( + val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for easier matching) + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE +); + +CREATE TABLE ip4address ( + val INTEGER PRIMARY KEY, + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE +); + +CREATE TABLE ip4network ( + -- val TEXT PRIMARY KEY, + mini INTEGER, + maxi INTEGER, + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE +); +CREATE INDEX ip4network_minmax ON ip4network (mini, maxi); + -- Store various things CREATE TABLE meta ( key TEXT PRIMARY KEY, diff --git a/export.py b/export.py new file mode 100755 index 0000000..58b276b --- /dev/null +++ b/export.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import database +import argparse +import sys + + +if __name__ == '__main__': + + # Parsing arguments + parser = argparse.ArgumentParser( + description="TODO") + parser.add_argument( + '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, + help="TODO") + parser.add_argument( + '-f', '--first-party', action='store_true', + help="TODO") + parser.add_argument( + '-e', '--end-chain', action='store_true', + help="TODO") + args = parser.parse_args() + + DB = database.Database() + + for domain in DB.export(first_party_only=args.first_party, + end_chain_only=args.end_chain): + print(domain, file=args.output) + + DB.close() diff --git a/feed_dns.py b/feed_dns.py index 1cc3247..f46d97b 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -3,42 +3,56 @@ import database import argparse import sys - -FUNCTION_MAP = { - b'a': database.feed_a, - b'cname': database.feed_cname, -} +import logging if __name__ == '__main__': # Parsing arguments + log = logging.getLogger('feed_dns') parser = argparse.ArgumentParser( description="TODO") parser.add_argument( - '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, + # '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, help="TODO") args = parser.parse_args() - database.open_db() + DB = database.Database() try: - database.time_step('iowait') - line: bytes + DB.enter_step('iowait') + # line: bytes + line: str for line in args.input: - database.time_step('feed_json_parse') - split = line.split(b'"') - name = split[7] - dtype = split[11] - value = split[15] + DB.enter_step('feed_json_parse') + # split = line.split(b'"') + split = line.split('"') + try: + name = split[7] + dtype = split[11] + value = split[15] + except IndexError: + log.error("Invalid JSON: %s", line) + continue + # DB.enter_step('feed_json_assert') # data = json.loads(line) # assert dtype == data['type'] # assert name == data['name'] # assert value == data['value'] - database.time_step('feed_switch') - FUNCTION_MAP[dtype](name, value) - database.time_step('iowait') + + DB.enter_step('feed_switch') + if dtype == 'a': + for rule in DB.get_ip4(value): + DB.set_hostname(name, source=rule) + elif dtype == 'cname': + for rule in DB.get_domain(value): + DB.set_hostname(name, source=rule) + elif dtype == 'ptr': + for rule in DB.get_domain(value): + DB.set_ip4address(name, source=rule) + DB.enter_step('iowait') except KeyboardInterrupt: - print("Interupted.") + log.warning("Interupted.") pass - database.close_db() + DB.close() diff --git a/feed_rules.py b/feed_rules.py index d32b360..7a19614 100755 --- a/feed_rules.py +++ b/feed_rules.py @@ -13,7 +13,7 @@ if __name__ == '__main__': description="TODO") parser.add_argument( 'type', - choices={'subdomains', 'ip4network'}, + choices={'zone', 'ip4network'}, help="Type of rule inputed") parser.add_argument( '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, @@ -23,18 +23,16 @@ if __name__ == '__main__': help="The input only comes from verified first-party sources") args = parser.parse_args() - database.open_db() + DB = database.Database() - if args.type == 'subdomains': - for rule in args.input: - database.feed_rule_subdomains( - rule.strip(), first_party=args.first_party) - elif args.type == 'ip4network': - for rule in args.input: - network = ipaddress.ip_network(rule.strip()) - database.feed_rule_ip4network( - network, first_party=args.first_party) - else: - assert False + FUNCTION_MAP = { + 'zone': DB.set_zone, + 'ip4network': DB.set_ip4network, + } - database.close_db() + fun = FUNCTION_MAP[args.type] + + for rule in args.input: + fun(rule.strip(), is_first_party=args.first_party) + + DB.close() diff --git a/filter_subdomains.sh b/filter_subdomains.sh index 9a09b9a..98638a9 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -4,37 +4,14 @@ function log() { echo -e "\033[33m$@\033[0m" } -if [ ! -f temp/all_resolved.csv ] -then - echo "Run ./resolve_subdomains.sh first!" - exit 1 -fi +log "Updating references…" +./database.py --references -# Gather all the rules for filtering -log "Compiling rules…" -cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt -./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list -cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list -cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_multi.list -cat rules/first-party.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_first.list -cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_multi.txt -cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_first.txt - -log "Filtering first-party tracking domains…" -./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list -sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt - -log "Filtering first-party curated tracking domains…" -./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list -sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt - -log "Filtering multi-party tracking domains…" -./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list -sort -u temp/multiparty-trackers.list > dist/multiparty-trackers.txt - -log "Filtering multi-party curated tracking domains…" -./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list -sort -u temp/multiparty-only-trackers.list > dist/multiparty-only-trackers.txt +log "Exporting lists…" +./export.py --first-party | sort -u > dist/firstparty-trackers.txt +./export.py --first-party --end-chain | sort -u > dist/firstparty-only-trackers.txt +./export.py | sort -u > dist/multiparty-trackers.txt +./export.py --end-chain | sort -u > dist/multiparty-only-trackers.txt # Format the blocklist so it can be used as a hostlist function generate_hosts { @@ -61,14 +38,14 @@ function generate_hosts { echo "#" echo "# Generation date: $(date -Isec)" echo "# Generation software: eulaurarien $(git describe --tags)" - echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" - echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" + echo "# Number of source websites: TODO" + echo "# Number of source subdomains: TODO" echo "#" - echo "# Number of known first-party trackers: $(wc -l temp/all_rules_first.list | cut -d' ' -f1)" + echo "# Number of known first-party trackers: TODO" echo "# Number of first-party subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" echo "#" - echo "# Number of known multi-party trackers: $(wc -l temp/all_rules_multi.list | cut -d' ' -f1)" + echo "# Number of known multi-party trackers: TODO" echo "# Number of multi-party subdomains: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" echo diff --git a/import_rules.sh b/import_rules.sh new file mode 100755 index 0000000..d4d4719 --- /dev/null +++ b/import_rules.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +function log() { + echo -e "\033[33m$@\033[0m" +} + +log "Importing rules…" +cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone +cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone +cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone +cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network +cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party +cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party + diff --git a/new_workflow.sh b/new_workflow.sh index 23ae589..bc2a78b 100755 --- a/new_workflow.sh +++ b/new_workflow.sh @@ -5,18 +5,20 @@ function log() { } log "Preparing database…" -./database.py --refresh +./database.py --expire -log "Compiling rules…" -cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py subdomains -cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py subdomains -cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains -cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network -# NOTE: Ensure first-party sources are last -cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains --first-party -cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party +./import_rules.sh + +# TODO Fetch 'em +log "Reading PTR records…" +pv ptr.json.gz | gunzip | ./feed_dns.py +log "Reading A records…" +pv a.json.gz | gunzip | ./feed_dns.py +log "Reading CNAME records…" +pv cname.json.gz | gunzip | ./feed_dns.py + +log "Pruning old data…" +./database.py --prune + +./filter_subdomains.sh -# log "Reading A records…" -# pv a.json.gz | gunzip | ./feed_dns.py -# log "Reading CNAME records…" -# pv cname.json.gz | gunzip | ./feed_dns.py