diff --git a/Makefile b/Makefile deleted file mode 100644 index fb06f61..0000000 --- a/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -libaccel.so: accel.o - clang -shared -Wl,-soname,libaccel.so -o libaccel.so accel.o - -accel.o: accel.c - clang -c -fPIC -O3 accel.c -o accel.o diff --git a/accel.c b/accel.c deleted file mode 100644 index bda0072..0000000 --- a/accel.c +++ /dev/null @@ -1,37 +0,0 @@ -#include - -int ip4_flat(char* value, wchar_t* flat) -{ - unsigned char value_index = 0; - unsigned char octet_index = 0; - unsigned char octet_value = 0; - char flat_index; - unsigned char value_chara; - do { - value_chara = value[value_index]; - if (value_chara >= '0' && value_chara <= '9') { - octet_value *= 10; - octet_value += value_chara - '0'; - } else if (value_chara == '.') { - for (flat_index = (octet_index+1)*8-1; flat_index >= octet_index*8; flat_index--) { - flat[flat_index] = '0' + (octet_value & 1); - octet_value >>= 1; - } - octet_index++; - octet_value = 0; - } else if (value_chara == '\0') { - if (octet_index != 3) { - return 1; - } - for (flat_index = 31; flat_index >= 24; flat_index--) { - flat[flat_index] = '0' + (octet_value & 1); - octet_value >>= 1; - } - return 0; - } else { - return 1; - } - value_index++; - } while (1); // This ugly thing save one comparison - return 1; -} diff --git a/database.py b/database.py index 4fb5463..4daf0ec 100755 --- a/database.py +++ b/database.py @@ -1,256 +1,385 @@ #!/usr/bin/env python3 -import sqlite3 -import os -import argparse -import typing -import ipaddress -import enum -import time -import ctypes - """ Utility functions to interact with the database. """ -# TODO Rule level and source priority +import sqlite3 +import typing +import time +import os +import logging +import argparse +import coloredlogs +import ipaddress +import ctypes -VERSION = 2 -PATH = f"blocking.db" -CONN = None -C = None # Cursor -TIME_DICT: typing.Dict[str, float] = dict() -TIME_LAST = time.perf_counter() -TIME_STEP = 'start' -ACCEL = ctypes.cdll.LoadLibrary('./libaccel.so') -ACCEL_IP4_BUF = ctypes.create_unicode_buffer('Z'*32, 32) +coloredlogs.install( + level='DEBUG', + fmt='%(asctime)s %(name)s %(levelname)s %(message)s' +) + +DbValue = typing.Union[None, int, float, str, bytes] -def time_step(step: str) -> None: - global TIME_LAST - global TIME_STEP - now = time.perf_counter() - TIME_DICT.setdefault(TIME_STEP, 0.0) - TIME_DICT[TIME_STEP] += now - TIME_LAST - TIME_STEP = step - TIME_LAST = time.perf_counter() +class Database(): + VERSION = 3 + PATH = "blocking.db" + def open(self) -> None: + self.conn = sqlite3.connect(self.PATH) + self.cursor = self.conn.cursor() + self.execute("PRAGMA foreign_keys = ON") + # self.conn.create_function("prepare_ip4address", 1, + # Database.prepare_ip4address, + # deterministic=True) -def time_print() -> None: - time_step('postprint') - total = sum(TIME_DICT.values()) - for key, secs in sorted(TIME_DICT.items(), key=lambda t: t[1]): - print(f"{key:<20}: {secs/total:7.2%} = {secs:.6f} s") - print(f"{'total':<20}: {1:7.2%} = {total:.6f} s") + def execute(self, cmd: str, args: typing.Union[ + typing.Tuple[DbValue, ...], + typing.Dict[str, DbValue]] = None) -> None: + self.cursor.execute(cmd, args or tuple()) - -class RowType(enum.Enum): - AS = 1 - DomainTree = 2 - Domain = 3 - IPv4Network = 4 - IPv6Network = 6 - - -def open_db() -> None: - time_step('open_db') - global CONN - global C - CONN = sqlite3.connect(PATH) - C = CONN.cursor() - # C.execute("PRAGMA foreign_keys = ON"); - initialized = False - try: - C.execute("SELECT value FROM meta WHERE key='version'") - version_ex = C.fetchone() - if version_ex: - if version_ex[0] == VERSION: - initialized = True - else: - print(f"Database version {version_ex[0]} found," - "it will be deleted.") - except sqlite3.OperationalError: - pass - if not initialized: - time_step('init_db') - print(f"Creating database version {VERSION}.") - CONN.close() - os.unlink(PATH) - CONN = sqlite3.connect(PATH) - C = CONN.cursor() - with open("database_schema.sql", 'r') as db_schema: - C.executescript(db_schema.read()) - C.execute("INSERT INTO meta VALUES ('version', ?)", (VERSION,)) - CONN.commit() - time_step('other') - - -def close_db() -> None: - assert CONN - time_step('close_db_commit') - CONN.commit() - time_step('close_db') - CONN.close() - time_step('other') - time_print() - - -def refresh() -> None: - assert C - C.execute('UPDATE blocking SET updated = 0') - # TODO PERF Use a meta value instead - - -RULE_SUBDOMAIN_COMMAND = \ - 'INSERT INTO blocking (key, type, updated, firstpart, level) ' \ - f'VALUES (?, {RowType.DomainTree.value}, 1, ?, 0) ' \ - 'ON CONFLICT(key)' \ - f'DO UPDATE SET source=null, type={RowType.DomainTree.value}, ' \ - 'updated=1, firstparty=?, level=0' - - -def feed_rule_subdomains(subdomain: str, first_party: bool = False) -> None: - assert C - subdomain = subdomain[::-1] - C.execute(RULE_SUBDOMAIN_COMMAND, - (subdomain, int(first_party), int(first_party))) - # Since regex type takes precedence over domain type, - # and firstparty takes precedence over multiparty, - # we can afford to replace the whole row without checking - # the row without checking previous values and making sure - # firstparty subdomains are updated last - - -def ip_get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]: - for char in address.packed: - for i in range(7, -1, -1): - yield (char >> i) & 0b1 - - -def ip_flat(address: ipaddress.IPv4Address) -> str: - return ''.join(map(str, ip_get_bits(address))) - - -def ip4_flat(address: bytes) -> typing.Optional[str]: - carg = ctypes.c_char_p(address) - ret = ACCEL.ip4_flat(carg, ACCEL_IP4_BUF) - if ret != 0: + def get_meta(self, key: str) -> typing.Optional[int]: + try: + self.execute("SELECT value FROM meta WHERE key=?", (key,)) + except sqlite3.OperationalError: + return None + for ver, in self.cursor: + return ver return None - return ACCEL_IP4_BUF.value + def set_meta(self, key: str, val: int) -> None: + self.execute("INSERT INTO meta VALUES (?, ?) " + "ON CONFLICT (key) DO " + "UPDATE set value=?", + (key, val, val)) -RULE_IP4NETWORK_COMMAND = \ - 'INSERT INTO blocking (key, type, updated, firstparty, level) ' \ - f'VALUES (?, {RowType.IPv4Network.value}, 1, ?, 0) ' \ - 'ON CONFLICT(key)' \ - f'DO UPDATE SET source=null, type={RowType.IPv4Network.value}, ' \ - 'updated=1, firstparty=?, level=0' + def close(self) -> None: + self.enter_step('close_commit') + self.conn.commit() + self.enter_step('close') + self.conn.close() + self.profile() + def initialize(self) -> None: + self.enter_step('initialize') + self.close() + os.unlink(self.PATH) + self.open() + self.log.info("Creating database version %d.", self.VERSION) + with open("database_schema.sql", 'r') as db_schema: + self.cursor.executescript(db_schema.read()) + self.set_meta('version', self.VERSION) + self.conn.commit() -def feed_rule_ip4network(network: ipaddress.IPv4Network, - first_party: bool = False) -> None: - assert C - flat = ip_flat(network.network_address)[:network.prefixlen] - C.execute(RULE_IP4NETWORK_COMMAND, - (flat, int(first_party), int(first_party))) + def __init__(self) -> None: + self.log = logging.getLogger('db') + self.time_last = time.perf_counter() + self.time_step = 'init' + self.time_dict: typing.Dict[str, float] = dict() + self.step_dict: typing.Dict[str, int] = dict() + self.accel_ip4_buf = ctypes.create_unicode_buffer('Z'*32, 32) + self.open() + version = self.get_meta('version') + if version != self.VERSION: + if version is not None: + self.log.warning( + "Outdated database version: %d found, will be rebuilt.", + version) + self.initialize() -FEED_A_COMMAND_FETCH = \ - 'SELECT key, firstparty FROM blocking ' \ - 'WHERE key<=? ' \ - 'AND instr(?, key) > 0 ' \ - f'AND type={RowType.IPv4Network.value} ' \ - 'ORDER BY key DESC ' + updated = self.get_meta('updated') + if updated is None: + self.execute('SELECT max(updated) FROM rules') + data = self.cursor.fetchone() + updated, = data + self.updated = updated or 1 -# UPSERT are not issued often relative to FETCH, -# merging the both might be counterproductive + def enter_step(self, name: str) -> None: + now = time.perf_counter() + try: + self.time_dict[self.time_step] += now - self.time_last + self.step_dict[self.time_step] += 1 + except KeyError: + self.time_dict[self.time_step] = now - self.time_last + self.step_dict[self.time_step] = 1 + self.time_step = name + self.time_last = time.perf_counter() -FEED_A_COMMAND_UPSERT = \ - 'INSERT INTO blocking (key, source, type, updated, firstparty) ' \ - f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \ - 'ON CONFLICT(key)' \ - f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \ - 'updated=1, firstparty=? ' \ - 'WHERE updated=0 OR firstparty None: + self.enter_step('profile') + total = sum(self.time_dict.values()) + for key, secs in sorted(self.time_dict.items(), key=lambda t: t[1]): + times = self.step_dict[key] + self.log.debug(f"{key:<20}: {times:9d} × {secs/times:5.3e} " + f"= {secs:9.2f} s ({secs/total:7.2%}) ") + self.log.debug(f"{'total':<20}: " + f"{total:9.2f} s ({1:7.2%})") + def prepare_hostname(self, hostname: str) -> str: + return hostname[::-1] + '.' -def feed_a(name: bytes, value_ip: bytes) -> None: - assert C - assert CONN - time_step('a_flat') - value_dec = ip4_flat(value_ip) - if value_dec is None: - # Malformed IPs - time_step('a_malformed') - return - time_step('a_fetch') - C.execute(FEED_A_COMMAND_FETCH, (value_dec, value_dec)) - base = C.fetchone() - time_step('a_fetch_confirm') - name = name[::-1] - for b_key, b_firstparty in C: - time_step('a_upsert') - C.execute(FEED_A_COMMAND_UPSERT, - (name, b_key, b_firstparty, # Insert - b_key, b_firstparty, b_firstparty) # Update - ) - time_step('a_fetch_confirm') - time_step('a_end') + def prepare_zone(self, zone: str) -> str: + return self.prepare_hostname(zone) + @staticmethod + def prepare_ip4address(address: str) -> int: + total = 0 + for i, octet in enumerate(address.split('.')): + total += int(octet) << (3-i)*8 + return total + # return '{:02x}{:02x}{:02x}{:02x}'.format( + # *[int(c) for c in address.split('.')]) + # return base64.b16encode(packed).decode() + # return '{:08b}{:08b}{:08b}{:08b}'.format( + # *[int(c) for c in address.split('.')]) + # carg = ctypes.c_wchar_p(address) + # ret = ACCEL.ip4_flat(carg, self.accel_ip4_buf) + # if ret != 0: + # raise ValueError + # return self.accel_ip4_buf.value + # packed = ipaddress.ip_address(address).packed + # return packed -FEED_CNAME_COMMAND_FETCH = \ - 'SELECT key, type, firstparty FROM blocking ' \ - 'WHERE key<=? ' \ - f'AND (type={RowType.DomainTree.value} OR type={RowType.Domain.value}) ' \ - 'ORDER BY key DESC ' \ - 'LIMIT 1' -# Optimisations that renders the index unused -# (and thus counterproductive until fixed): + def prepare_ip4network(self, network: str) -> typing.Tuple[int, int]: + # def prepare_ip4network(network: str) -> str: + net = ipaddress.ip_network(network) + mini = self.prepare_ip4address(net.network_address.exploded) + maxi = self.prepare_ip4address(net.broadcast_address.exploded) + # mini = net.network_address.packed + # maxi = net.broadcast_address.packed + return mini, maxi + # return Database.prepare_ip4address(net.network_address.exploded)[:net.prefixlen] -# 'AND instr(?, key) > 0 ' \ + def expire(self) -> None: + self.enter_step('expire') + self.updated += 1 + self.set_meta('updated', self.updated) -# f'WHERE ((type={RowType.DomainTree.value} AND key<=?) OR ' \ -# f'(type={RowType.Domain.value} AND key=?)) ' \ + def update_references(self) -> None: + self.enter_step('update_refs') + self.execute('UPDATE rules AS r SET refs=' + '(SELECT count(*) FROM rules ' + 'WHERE source=r.id)') -# Might be fixable by using multiple SELECT and a JOIN -# In the meantime the confirm is very light so it's ok + def prune(self) -> None: + self.enter_step('prune') + self.execute('DELETE FROM rules WHERE updated typing.Iterable[str]: + command = 'SELECT val FROM rules ' \ + 'INNER JOIN hostname ON rules.id = hostname.entry' + restrictions: typing.List[str] = list() + if first_party_only: + restrictions.append('rules.first_party = 1') + if end_chain_only: + restrictions.append('rules.refs = 0') + if restrictions: + command += ' WHERE ' + ' AND '.join(restrictions) + self.execute(command) + for val, in self.cursor: + yield val[:-1][::-1] - -def feed_cname(name: bytes, value: bytes) -> None: - assert C - assert CONN - time_step('cname_decode') - value = value[::-1] - value_dec = value.decode() - time_step('cname_fetch') - C.execute(FEED_CNAME_COMMAND_FETCH, (value_dec,)) - time_step('cname_fetch_confirm') - for b_key, b_type, b_firstparty in C: - matching = b_key == value_dec[:len(b_key)] and ( - len(value_dec) == len(b_key) - or ( - b_type == RowType.DomainTree.value - and value_dec[len(b_key)] == '.' - ) + def get_domain(self, domain: str) -> typing.Iterable[int]: + self.enter_step('get_domain_prepare') + domain_prep = self.prepare_hostname(domain) + self.enter_step('get_domain_select') + self.execute( + 'SELECT null, entry FROM hostname ' + 'WHERE val=:d ' + 'UNION ' + 'SELECT * FROM (' + 'SELECT val, entry FROM zone ' + 'WHERE val<=:d ' + 'ORDER BY val DESC LIMIT 1' + ')', + {'d': domain_prep} + ) + for val, entry in self.cursor: + self.enter_step('get_domain_confirm') + if not (val is None or domain_prep.startswith(val)): + continue + self.enter_step('get_domain_yield') + yield entry + + def get_ip4(self, address: str) -> typing.Iterable[int]: + self.enter_step('get_ip4_prepare') + try: + address_prep = self.prepare_ip4address(address) + except (ValueError, IndexError): + self.log.error("Invalid ip4address: %s", address) + return + self.enter_step('get_ip4_select') + self.execute( + 'SELECT entry FROM ip4address ' + # 'SELECT null, entry FROM ip4address ' + 'WHERE val=:a ' + 'UNION ' + # 'SELECT * FROM (' + # 'SELECT val, entry FROM ip4network ' + # 'WHERE val<=:a ' + # 'AND instr(:a, val) > 0 ' + # 'ORDER BY val DESC' + # ')' + 'SELECT entry FROM ip4network ' + 'WHERE :a BETWEEN mini AND maxi ', + {'a': address_prep} + ) + for val, entry in self.cursor: + # self.enter_step('get_ip4_confirm') + # if not (val is None or val.startswith(address_prep)): + # # PERF startswith but from the end + # continue + self.enter_step('get_ip4_yield') + yield entry + + def _set_generic(self, + table: str, + select_query: str, + insert_query: str, + prep: typing.Dict[str, DbValue], + is_first_party: bool = False, + source: int = None, + ) -> None: + # Since this isn't the bulk of the processing, + # here abstraction > performaces + + # Fields based on the source + if source is None: + first_party = int(is_first_party) + level = 0 + else: + self.enter_step(f'set_{table}_source') + self.execute( + 'SELECT first_party, level FROM rules ' + 'WHERE id=?', + (source,) + ) + first_party, level = self.cursor.fetchone() + level += 1 + + self.enter_step(f'set_{table}_select') + self.execute(select_query, prep) + + rules_prep = { + "source": source, + "updated": self.updated, + "first_party": first_party, + "level": level, + } + + # If the entry already exists + for entry, in self.cursor: # only one + self.enter_step(f'set_{table}_update') + rules_prep['entry'] = entry + self.execute( + 'UPDATE rules SET ' + 'source=:source, updated=:updated, ' + 'first_party=:first_party, level=:level ' + 'WHERE id=:entry AND (updated<:updated OR ' + 'first_party<:first_party OR level<:level)', + rules_prep + ) + # Only update if any of the following: + # - the entry is outdataed + # - the entry was not a first_party but this is + # - this is closer to the original rule + return + + # If it does not exist + + if source is not None: + self.enter_step(f'set_{table}_incsrc') + self.execute('UPDATE rules SET refs = refs + 1 WHERE id=?', + (source,)) + + self.enter_step(f'set_{table}_insert') + self.execute( + 'INSERT INTO rules ' + '(source, updated, first_party, refs, level) ' + 'VALUES (:source, :updated, :first_party, 0, :level) ', + rules_prep + ) + self.execute('SELECT id FROM rules WHERE rowid=?', + (self.cursor.lastrowid,)) + for entry, in self.cursor: # only one + prep['entry'] = entry + self.execute(insert_query, prep) + return + assert False + + def set_hostname(self, hostname: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_hostname_prepare') + prep: typing.Dict[str, DbValue] = { + 'val': self.prepare_hostname(hostname), + } + self._set_generic( + 'hostname', + 'SELECT entry FROM hostname WHERE val=:val', + 'INSERT INTO hostname (val, entry) ' + 'VALUES (:val, :entry)', + prep, + *args, **kwargs + ) + + def set_ip4address(self, ip4address: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_ip4add_prepare') + try: + ip4address_prep = self.prepare_ip4address(ip4address) + except (ValueError, IndexError): + self.log.error("Invalid ip4address: %s", ip4address) + return + prep: typing.Dict[str, DbValue] = { + 'val': ip4address_prep, + } + self._set_generic( + 'ip4add', + 'SELECT entry FROM ip4address WHERE val=:val', + 'INSERT INTO ip4address (val, entry) ' + 'VALUES (:val, :entry)', + prep, + *args, **kwargs + ) + + def set_zone(self, zone: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_zone_prepare') + prep: typing.Dict[str, DbValue] = { + 'val': self.prepare_zone(zone), + } + self._set_generic( + 'zone', + 'SELECT entry FROM zone WHERE val=:val', + 'INSERT INTO zone (val, entry) ' + 'VALUES (:val, :entry)', + prep, + *args, **kwargs + ) + + def set_ip4network(self, ip4network: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_ip4net_prepare') + try: + ip4network_prep = self.prepare_ip4network(ip4network) + except (ValueError, IndexError): + self.log.error("Invalid ip4network: %s", ip4network) + return + prep: typing.Dict[str, DbValue] = { + 'mini': ip4network_prep[0], + 'maxi': ip4network_prep[1], + } + self._set_generic( + 'ip4net', + 'SELECT entry FROM ip4network WHERE mini=:mini AND maxi=:maxi', + 'INSERT INTO ip4network (mini, maxi, entry) ' + 'VALUES (:mini, :maxi, :entry)', + prep, + *args, **kwargs ) - if not matching: - continue - name = name[::-1] - time_step('cname_upsert') - C.execute(FEED_CNAME_COMMAND_UPSERT, - (name, b_key, b_firstparty, # Insert - b_key, b_firstparty, b_firstparty) # Update - ) - time_step('cname_fetch_confirm') - time_step('cname_end') if __name__ == '__main__': @@ -259,13 +388,28 @@ if __name__ == '__main__': parser = argparse.ArgumentParser( description="Database operations") parser.add_argument( - '-r', '--refresh', action='store_true', + '-i', '--initialize', action='store_true', + help="Reconstruct the whole database") + parser.add_argument( + '-p', '--prune', action='store_true', + help="Remove old entries from database") + parser.add_argument( + '-e', '--expire', action='store_true', help="Set the whole database as an old source") + parser.add_argument( + '-r', '--references', action='store_true', + help="Update the reference count") args = parser.parse_args() - open_db() + DB = Database() - if args.refresh: - refresh() + if args.initialize: + DB.initialize() + if args.prune: + DB.prune() + if args.expire: + DB.expire() + if args.references and not args.prune: + DB.update_references() - close_db() + DB.close() diff --git a/database_schema.sql b/database_schema.sql index 833338d..9be81b0 100644 --- a/database_schema.sql +++ b/database_schema.sql @@ -1,21 +1,49 @@ -- Remember to increment DB_VERSION -- in database.py on changes to this file -CREATE TABLE blocking ( - key TEXT PRIMARY KEY, -- Contains the reversed domain name or IP in binary form - source TEXT, -- The rule this one is based on - type INTEGER, -- Type of the field: 1: AS, 2: domain tree, 3: domain, 4: IPv4 network, 6: IPv6 network +CREATE TABLE rules ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source INTEGER, -- The rule this one is based on updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes) - firstparty INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) - refs INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) (used for -only lists) - level INTEGER, -- Level of recursion to the original rule (used for source priority) - FOREIGN KEY (source) REFERENCES blocking(key) ON DELETE CASCADE + first_party INTEGER, -- 1: this blocks a first party for sure, 0: maybe + refs INTEGER, -- Number of entries issued from this one + level INTEGER, -- Level of recursion to the root source rule (used for source priority) + FOREIGN KEY (source) REFERENCES rules(id) ON DELETE CASCADE ); -CREATE INDEX "blocking_type_key" ON "blocking" ( - "type", - "key" DESC + +CREATE TABLE asn ( + val INTEGER PRIMARY KEY, + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE ); +CREATE TABLE hostname ( + val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for consistency with zone) + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE +); + +CREATE TABLE zone ( + val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for easier matching) + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE +); + +CREATE TABLE ip4address ( + val INTEGER PRIMARY KEY, + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE +); + +CREATE TABLE ip4network ( + -- val TEXT PRIMARY KEY, + mini INTEGER, + maxi INTEGER, + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE +); +CREATE INDEX ip4network_minmax ON ip4network (mini, maxi); + -- Store various things CREATE TABLE meta ( key TEXT PRIMARY KEY, diff --git a/export.py b/export.py new file mode 100755 index 0000000..58b276b --- /dev/null +++ b/export.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import database +import argparse +import sys + + +if __name__ == '__main__': + + # Parsing arguments + parser = argparse.ArgumentParser( + description="TODO") + parser.add_argument( + '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, + help="TODO") + parser.add_argument( + '-f', '--first-party', action='store_true', + help="TODO") + parser.add_argument( + '-e', '--end-chain', action='store_true', + help="TODO") + args = parser.parse_args() + + DB = database.Database() + + for domain in DB.export(first_party_only=args.first_party, + end_chain_only=args.end_chain): + print(domain, file=args.output) + + DB.close() diff --git a/feed_dns.py b/feed_dns.py index 1cc3247..f46d97b 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -3,42 +3,56 @@ import database import argparse import sys - -FUNCTION_MAP = { - b'a': database.feed_a, - b'cname': database.feed_cname, -} +import logging if __name__ == '__main__': # Parsing arguments + log = logging.getLogger('feed_dns') parser = argparse.ArgumentParser( description="TODO") parser.add_argument( - '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, + # '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, help="TODO") args = parser.parse_args() - database.open_db() + DB = database.Database() try: - database.time_step('iowait') - line: bytes + DB.enter_step('iowait') + # line: bytes + line: str for line in args.input: - database.time_step('feed_json_parse') - split = line.split(b'"') - name = split[7] - dtype = split[11] - value = split[15] + DB.enter_step('feed_json_parse') + # split = line.split(b'"') + split = line.split('"') + try: + name = split[7] + dtype = split[11] + value = split[15] + except IndexError: + log.error("Invalid JSON: %s", line) + continue + # DB.enter_step('feed_json_assert') # data = json.loads(line) # assert dtype == data['type'] # assert name == data['name'] # assert value == data['value'] - database.time_step('feed_switch') - FUNCTION_MAP[dtype](name, value) - database.time_step('iowait') + + DB.enter_step('feed_switch') + if dtype == 'a': + for rule in DB.get_ip4(value): + DB.set_hostname(name, source=rule) + elif dtype == 'cname': + for rule in DB.get_domain(value): + DB.set_hostname(name, source=rule) + elif dtype == 'ptr': + for rule in DB.get_domain(value): + DB.set_ip4address(name, source=rule) + DB.enter_step('iowait') except KeyboardInterrupt: - print("Interupted.") + log.warning("Interupted.") pass - database.close_db() + DB.close() diff --git a/feed_rules.py b/feed_rules.py index d32b360..7a19614 100755 --- a/feed_rules.py +++ b/feed_rules.py @@ -13,7 +13,7 @@ if __name__ == '__main__': description="TODO") parser.add_argument( 'type', - choices={'subdomains', 'ip4network'}, + choices={'zone', 'ip4network'}, help="Type of rule inputed") parser.add_argument( '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, @@ -23,18 +23,16 @@ if __name__ == '__main__': help="The input only comes from verified first-party sources") args = parser.parse_args() - database.open_db() + DB = database.Database() - if args.type == 'subdomains': - for rule in args.input: - database.feed_rule_subdomains( - rule.strip(), first_party=args.first_party) - elif args.type == 'ip4network': - for rule in args.input: - network = ipaddress.ip_network(rule.strip()) - database.feed_rule_ip4network( - network, first_party=args.first_party) - else: - assert False + FUNCTION_MAP = { + 'zone': DB.set_zone, + 'ip4network': DB.set_ip4network, + } - database.close_db() + fun = FUNCTION_MAP[args.type] + + for rule in args.input: + fun(rule.strip(), is_first_party=args.first_party) + + DB.close() diff --git a/filter_subdomains.sh b/filter_subdomains.sh index 9a09b9a..98638a9 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -4,37 +4,14 @@ function log() { echo -e "\033[33m$@\033[0m" } -if [ ! -f temp/all_resolved.csv ] -then - echo "Run ./resolve_subdomains.sh first!" - exit 1 -fi +log "Updating references…" +./database.py --references -# Gather all the rules for filtering -log "Compiling rules…" -cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt -./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list -cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list -cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_multi.list -cat rules/first-party.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_first.list -cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_multi.txt -cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_first.txt - -log "Filtering first-party tracking domains…" -./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list -sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt - -log "Filtering first-party curated tracking domains…" -./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list -sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt - -log "Filtering multi-party tracking domains…" -./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list -sort -u temp/multiparty-trackers.list > dist/multiparty-trackers.txt - -log "Filtering multi-party curated tracking domains…" -./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list -sort -u temp/multiparty-only-trackers.list > dist/multiparty-only-trackers.txt +log "Exporting lists…" +./export.py --first-party | sort -u > dist/firstparty-trackers.txt +./export.py --first-party --end-chain | sort -u > dist/firstparty-only-trackers.txt +./export.py | sort -u > dist/multiparty-trackers.txt +./export.py --end-chain | sort -u > dist/multiparty-only-trackers.txt # Format the blocklist so it can be used as a hostlist function generate_hosts { @@ -61,14 +38,14 @@ function generate_hosts { echo "#" echo "# Generation date: $(date -Isec)" echo "# Generation software: eulaurarien $(git describe --tags)" - echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" - echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" + echo "# Number of source websites: TODO" + echo "# Number of source subdomains: TODO" echo "#" - echo "# Number of known first-party trackers: $(wc -l temp/all_rules_first.list | cut -d' ' -f1)" + echo "# Number of known first-party trackers: TODO" echo "# Number of first-party subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" echo "#" - echo "# Number of known multi-party trackers: $(wc -l temp/all_rules_multi.list | cut -d' ' -f1)" + echo "# Number of known multi-party trackers: TODO" echo "# Number of multi-party subdomains: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" echo diff --git a/import_rules.sh b/import_rules.sh new file mode 100755 index 0000000..d4d4719 --- /dev/null +++ b/import_rules.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +function log() { + echo -e "\033[33m$@\033[0m" +} + +log "Importing rules…" +cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone +cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone +cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone +cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network +cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party +cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party + diff --git a/new_workflow.sh b/new_workflow.sh index 23ae589..bc2a78b 100755 --- a/new_workflow.sh +++ b/new_workflow.sh @@ -5,18 +5,20 @@ function log() { } log "Preparing database…" -./database.py --refresh +./database.py --expire -log "Compiling rules…" -cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py subdomains -cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py subdomains -cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains -cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network -# NOTE: Ensure first-party sources are last -cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains --first-party -cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party +./import_rules.sh + +# TODO Fetch 'em +log "Reading PTR records…" +pv ptr.json.gz | gunzip | ./feed_dns.py +log "Reading A records…" +pv a.json.gz | gunzip | ./feed_dns.py +log "Reading CNAME records…" +pv cname.json.gz | gunzip | ./feed_dns.py + +log "Pruning old data…" +./database.py --prune + +./filter_subdomains.sh -# log "Reading A records…" -# pv a.json.gz | gunzip | ./feed_dns.py -# log "Reading CNAME records…" -# pv cname.json.gz | gunzip | ./feed_dns.py