From 79374968825b56be755881313db29b0ba040836b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Mon, 9 Dec 2019 08:12:48 +0100 Subject: [PATCH 01/40] Workflow: Base for new one While I'm automating this you'll need to download the A set from https://opendata.rapid7.com/sonar.fdns_v2/ to the file a.json.gz. --- .gitignore | 2 + database.py | 260 ++++++++++++++++++++++++++++++++++++++++++++ database_schema.sql | 22 ++++ feed_dns.py | 43 ++++++++ feed_rules.py | 40 +++++++ new_workflow.sh | 22 ++++ 6 files changed, 389 insertions(+) create mode 100755 database.py create mode 100644 database_schema.sql create mode 100755 feed_dns.py create mode 100755 feed_rules.py create mode 100755 new_workflow.sh diff --git a/.gitignore b/.gitignore index e38bcd9..188051c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ *.log +*.db +*.db-journal nameservers nameservers.head diff --git a/database.py b/database.py new file mode 100755 index 0000000..370d25b --- /dev/null +++ b/database.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 + +import sqlite3 +import os +import argparse +import typing +import ipaddress +import enum +import time +import pprint + +""" +Utility functions to interact with the database. +""" + +VERSION = 1 +PATH = f"blocking.db" +CONN = None +C = None # Cursor +TIME_DICT: typing.Dict[str, float] = dict() +TIME_LAST = time.perf_counter() +TIME_STEP = 'start' + + +def time_step(step: str) -> None: + global TIME_LAST + global TIME_STEP + now = time.perf_counter() + TIME_DICT.setdefault(TIME_STEP, 0.0) + TIME_DICT[TIME_STEP] += now - TIME_LAST + TIME_STEP = step + TIME_LAST = time.perf_counter() + + +def time_print() -> None: + time_step('postprint') + total = sum(TIME_DICT.values()) + for key, secs in sorted(TIME_DICT.items(), key=lambda t: t[1]): + print(f"{key:<20}: {secs/total:7.2%} = {secs:.6f} s") + print(f"{'total':<20}: {1:7.2%} = {total:.6f} s") + + +class RowType(enum.Enum): + AS = 1 + DomainTree = 2 + Domain = 3 + IPv4Network = 4 + IPv6Network = 6 + + +def open_db() -> None: + time_step('open_db') + global CONN + global C + CONN = sqlite3.connect(PATH) + C = CONN.cursor() + # C.execute("PRAGMA foreign_keys = ON"); + initialized = False + try: + C.execute("SELECT value FROM meta WHERE key='version'") + version_ex = C.fetchone() + if version_ex: + if version_ex[0] == VERSION: + initialized = True + else: + print(f"Database version {version_ex[0]} found," + "it will be deleted.") + except sqlite3.OperationalError: + pass + if not initialized: + time_step('init_db') + print(f"Creating database version {VERSION}.") + CONN.close() + os.unlink(PATH) + CONN = sqlite3.connect(PATH) + C = CONN.cursor() + with open("database_schema.sql", 'r') as db_schema: + C.executescript(db_schema.read()) + C.execute("INSERT INTO meta VALUES ('version', ?)", (VERSION,)) + CONN.commit() + time_step('other') + + +def close_db() -> None: + assert CONN + time_step('close_db_commit') + CONN.commit() + time_step('close_db') + CONN.close() + time_step('other') + time_print() + + +def refresh() -> None: + assert C + C.execute('UPDATE blocking SET updated = 0') + # TODO PERF Use a meta value instead + + +RULE_SUBDOMAIN_COMMAND = \ + 'INSERT INTO blocking (key, type, updated, firstparty) ' \ + f'VALUES (?, {RowType.DomainTree.value}, 1, ?) ' \ + 'ON CONFLICT(key)' \ + f'DO UPDATE SET source=null, type={RowType.DomainTree.value}, ' \ + 'updated=1, firstparty=?' 
+ + +def feed_rule_subdomains(subdomain: str, first_party: bool = False) -> None: + assert C + subdomain = subdomain[::-1] + C.execute(RULE_SUBDOMAIN_COMMAND, + (subdomain, int(first_party), int(first_party))) + # Since regex type takes precedence over domain type, + # and firstparty takes precedence over multiparty, + # we can afford to replace the whole row without checking + # the row without checking previous values and making sure + # firstparty subdomains are updated last + + +def ip_get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]: + for char in address.packed: + for i in range(7, -1, -1): + yield (char >> i) & 0b1 + + +def ip_flat(address: ipaddress.IPv4Address) -> str: + return ''.join(map(str, ip_get_bits(address))) + + +def ip4_flat(address: str) -> str: + return '{:08b}{:08b}{:08b}{:08b}'.format( + *[int(c) for c in address.split('.')]) + + +RULE_IP4NETWORK_COMMAND = \ + 'INSERT INTO blocking (key, type, updated, firstparty) ' \ + f'VALUES (?, {RowType.IPv4Network.value}, 1, ?) ' \ + 'ON CONFLICT(key)' \ + f'DO UPDATE SET source=null, type={RowType.IPv4Network.value}, ' \ + 'updated=1, firstparty=?' + + +def feed_rule_ip4network(network: ipaddress.IPv4Network, + first_party: bool = False) -> None: + assert C + flat = ip_flat(network.network_address)[:network.prefixlen] + C.execute(RULE_IP4NETWORK_COMMAND, + (flat, int(first_party), int(first_party))) + + +FEED_A_COMMAND_FETCH = \ + 'SELECT key, firstparty FROM blocking ' \ + 'WHERE key<=? ' \ + 'AND updated=1 ' \ + f'AND type={RowType.IPv4Network.value} ' \ + 'ORDER BY key DESC ' \ + 'LIMIT 1' + +FEED_A_COMMAND_UPSERT = \ + 'INSERT INTO blocking (key, source, type, updated, firstparty) ' \ + f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \ + 'ON CONFLICT(key)' \ + f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \ + 'updated=1, firstparty=? ' \ + 'WHERE updated=0 OR firstparty None: + assert C + assert CONN + time_step('a_flat') + try: + value = ip4_flat(value_ip) + except (ValueError, IndexError): + # Malformed IPs + return + time_step('a_fetch') + C.execute(FEED_A_COMMAND_FETCH, (value,)) + base = C.fetchone() + time_step('a_fetch_confirm') + if not base: + return + b_key, b_firstparty = base + if not value.startswith(b_key): + return + name = name[::-1] + time_step('a_upsert') + C.execute(FEED_A_COMMAND_UPSERT, + (name, b_key, b_firstparty, # Insert + b_key, b_firstparty, b_firstparty) # Update + ) + time_step('other') + + +FEED_CNAME_COMMAND_FETCH = \ + 'SELECT key, type, firstparty FROM blocking ' \ + 'WHERE key<=? ' \ + f'AND (type={RowType.DomainTree.value} OR type={RowType.Domain.value}) ' \ + 'AND updated=1 ' \ + 'ORDER BY key DESC ' \ + 'LIMIT 1' +# f'WHERE ((type={RowType.DomainTree.value} AND key<=?) OR ' \ +# f'(type={RowType.Domain.value} AND key=?)) ' \ +# This optimisation is counter productive + +FEED_CNAME_COMMAND_UPSERT = \ + 'INSERT INTO blocking (key, source, type, updated, firstparty) ' \ + f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \ + 'ON CONFLICT(key)' \ + f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \ + 'updated=1, firstparty=? 
' \ + 'WHERE updated=0 OR firstparty None: + assert C + assert CONN + value = value[::-1] + time_step('cname_fetch') + C.execute(FEED_CNAME_COMMAND_FETCH, (value,)) + base = C.fetchone() + time_step('cname_fetch_confirm') + if not base: + # Should only happen at an extremum of the database + return + b_key, b_type, b_firstparty = base + matching = b_key == value[:len(b_key)] and ( + len(value) == len(b_key) + or ( + b_type == RowType.DomainTree.value + and value[len(b_key)] == '.' + ) + ) + if not matching: + return + name = name[::-1] + time_step('cname_upsert') + C.execute(FEED_CNAME_COMMAND_UPSERT, + (name, b_key, b_firstparty, # Insert + b_key, b_firstparty, b_firstparty) # Update + ) + time_step('other') + + +if __name__ == '__main__': + + # Parsing arguments + parser = argparse.ArgumentParser( + description="Database operations") + parser.add_argument( + '-r', '--refresh', action='store_true', + help="Set the whole database as an old source") + args = parser.parse_args() + + open_db() + + if args.refresh: + refresh() + + close_db() diff --git a/database_schema.sql b/database_schema.sql new file mode 100644 index 0000000..5e9618b --- /dev/null +++ b/database_schema.sql @@ -0,0 +1,22 @@ +-- Remember to increment DB_VERSION +-- in database.py on changes to this file + +CREATE TABLE blocking ( + key text PRIMARY KEY, -- Contains the reversed domain name or IP in binary form + source TEXT, -- The rule this one is based on + type INTEGER, -- Type of the field: 1: AS, 2: domain tree, 3: domain, 4: IPv4 network, 6: IPv6 network + updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes) + firstparty INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) + FOREIGN KEY (source) REFERENCES blocking(key) ON DELETE CASCADE +); +CREATE INDEX "blocking_type_updated_key" ON "blocking" ( + "type", + "updated", + "key" DESC +); + +-- Store various things +CREATE TABLE meta ( + key text PRIMARY KEY, + value integer +); diff --git a/feed_dns.py b/feed_dns.py new file mode 100755 index 0000000..47ea5d8 --- /dev/null +++ b/feed_dns.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 + +import database +import argparse +import sys + +FUNCTION_MAP = { + 'a': database.feed_a, + 'cname': database.feed_cname, +} + +if __name__ == '__main__': + + # Parsing arguments + parser = argparse.ArgumentParser( + description="TODO") + parser.add_argument( + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, + help="TODO") + args = parser.parse_args() + + database.open_db() + + try: + database.time_step('iowait') + for line in args.input: + database.time_step('feed_json_parse') + split = line.split('"') + name = split[7] + dtype = split[11] + value = split[15] + # data = json.loads(line) + # assert dtype == data['type'] + # assert name == data['name'] + # assert value == data['value'] + database.time_step('feed_switch') + FUNCTION_MAP[dtype](name, value) + database.time_step('iowait') + except KeyboardInterrupt: + print("Interupted.") + pass + + database.close_db() diff --git a/feed_rules.py b/feed_rules.py new file mode 100755 index 0000000..d32b360 --- /dev/null +++ b/feed_rules.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +import database +import argparse +import sys +import ipaddress + + +if __name__ == '__main__': + + # Parsing arguments + parser = argparse.ArgumentParser( + description="TODO") + parser.add_argument( + 'type', + choices={'subdomains', 'ip4network'}, + help="Type of rule inputed") + parser.add_argument( + '-i', '--input', 
type=argparse.FileType('r'), default=sys.stdin, + help="List of domains domains to block (with their subdomains)") + parser.add_argument( + '-f', '--first-party', action='store_true', + help="The input only comes from verified first-party sources") + args = parser.parse_args() + + database.open_db() + + if args.type == 'subdomains': + for rule in args.input: + database.feed_rule_subdomains( + rule.strip(), first_party=args.first_party) + elif args.type == 'ip4network': + for rule in args.input: + network = ipaddress.ip_network(rule.strip()) + database.feed_rule_ip4network( + network, first_party=args.first_party) + else: + assert False + + database.close_db() diff --git a/new_workflow.sh b/new_workflow.sh new file mode 100755 index 0000000..23ae589 --- /dev/null +++ b/new_workflow.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +function log() { + echo -e "\033[33m$@\033[0m" +} + +log "Preparing database…" +./database.py --refresh + +log "Compiling rules…" +cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py subdomains +cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py subdomains +cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains +cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network +# NOTE: Ensure first-party sources are last +cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains --first-party +cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party + +# log "Reading A records…" +# pv a.json.gz | gunzip | ./feed_dns.py +# log "Reading CNAME records…" +# pv cname.json.gz | gunzip | ./feed_dns.py From 55877be8912dc04db0ad35bb24f6c2a9579188ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Mon, 9 Dec 2019 08:55:34 +0100 Subject: [PATCH 02/40] IP parsing C accelerated, use bytes everywhere --- .gitignore | 2 ++ Makefile | 5 +++++ accel.c | 37 +++++++++++++++++++++++++++++++++++++ database.py | 36 +++++++++++++++++++++--------------- database_schema.sql | 5 +++-- feed_dns.py | 9 +++++---- 6 files changed, 73 insertions(+), 21 deletions(-) create mode 100644 Makefile create mode 100644 accel.c diff --git a/.gitignore b/.gitignore index 188051c..aa3f3eb 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ *.db-journal nameservers nameservers.head +*.o +*.so diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..fb06f61 --- /dev/null +++ b/Makefile @@ -0,0 +1,5 @@ +libaccel.so: accel.o + clang -shared -Wl,-soname,libaccel.so -o libaccel.so accel.o + +accel.o: accel.c + clang -c -fPIC -O3 accel.c -o accel.o diff --git a/accel.c b/accel.c new file mode 100644 index 0000000..bda0072 --- /dev/null +++ b/accel.c @@ -0,0 +1,37 @@ +#include + +int ip4_flat(char* value, wchar_t* flat) +{ + unsigned char value_index = 0; + unsigned char octet_index = 0; + unsigned char octet_value = 0; + char flat_index; + unsigned char value_chara; + do { + value_chara = value[value_index]; + if (value_chara >= '0' && value_chara <= '9') { + octet_value *= 10; + octet_value += value_chara - '0'; + } else if (value_chara == '.') { + for (flat_index = (octet_index+1)*8-1; flat_index >= octet_index*8; flat_index--) { + flat[flat_index] = '0' + (octet_value & 1); + octet_value >>= 1; + } + octet_index++; + octet_value = 0; + } else if (value_chara == '\0') { + if (octet_index != 3) { + return 1; + } + for (flat_index = 31; flat_index >= 
24; flat_index--) { + flat[flat_index] = '0' + (octet_value & 1); + octet_value >>= 1; + } + return 0; + } else { + return 1; + } + value_index++; + } while (1); // This ugly thing save one comparison + return 1; +} diff --git a/database.py b/database.py index 370d25b..bdb92b0 100755 --- a/database.py +++ b/database.py @@ -7,7 +7,7 @@ import typing import ipaddress import enum import time -import pprint +import ctypes """ Utility functions to interact with the database. @@ -20,6 +20,8 @@ C = None # Cursor TIME_DICT: typing.Dict[str, float] = dict() TIME_LAST = time.perf_counter() TIME_STEP = 'start' +ACCEL = ctypes.cdll.LoadLibrary('./libaccel.so') +ACCEL_IP4_BUF = ctypes.create_unicode_buffer('Z'*32, 32) def time_step(step: str) -> None: @@ -127,9 +129,12 @@ def ip_flat(address: ipaddress.IPv4Address) -> str: return ''.join(map(str, ip_get_bits(address))) -def ip4_flat(address: str) -> str: - return '{:08b}{:08b}{:08b}{:08b}'.format( - *[int(c) for c in address.split('.')]) +def ip4_flat(address: bytes) -> typing.Optional[str]: + carg = ctypes.c_char_p(address) + ret = ACCEL.ip4_flat(carg, ACCEL_IP4_BUF) + if ret != 0: + return None + return ACCEL_IP4_BUF.value RULE_IP4NETWORK_COMMAND = \ @@ -165,23 +170,22 @@ FEED_A_COMMAND_UPSERT = \ 'WHERE updated=0 OR firstparty None: +def feed_a(name: bytes, value_ip: bytes) -> None: assert C assert CONN time_step('a_flat') - try: - value = ip4_flat(value_ip) - except (ValueError, IndexError): + value_dec = ip4_flat(value_ip) + if value_dec is None: # Malformed IPs return time_step('a_fetch') - C.execute(FEED_A_COMMAND_FETCH, (value,)) + C.execute(FEED_A_COMMAND_FETCH, (value_dec,)) base = C.fetchone() time_step('a_fetch_confirm') if not base: return b_key, b_firstparty = base - if not value.startswith(b_key): + if not value_dec.startswith(b_key): return name = name[::-1] time_step('a_upsert') @@ -212,23 +216,25 @@ FEED_CNAME_COMMAND_UPSERT = \ 'WHERE updated=0 OR firstparty None: +def feed_cname(name: bytes, value: bytes) -> None: assert C assert CONN + time_step('cname_decode') value = value[::-1] + value_dec = value.decode() time_step('cname_fetch') - C.execute(FEED_CNAME_COMMAND_FETCH, (value,)) + C.execute(FEED_CNAME_COMMAND_FETCH, (value_dec,)) base = C.fetchone() time_step('cname_fetch_confirm') if not base: # Should only happen at an extremum of the database return b_key, b_type, b_firstparty = base - matching = b_key == value[:len(b_key)] and ( - len(value) == len(b_key) + matching = b_key == value_dec[:len(b_key)] and ( + len(value_dec) == len(b_key) or ( b_type == RowType.DomainTree.value - and value[len(b_key)] == '.' + and value_dec[len(b_key)] == '.' 
) ) if not matching: diff --git a/database_schema.sql b/database_schema.sql index 5e9618b..1985281 100644 --- a/database_schema.sql +++ b/database_schema.sql @@ -2,11 +2,12 @@ -- in database.py on changes to this file CREATE TABLE blocking ( - key text PRIMARY KEY, -- Contains the reversed domain name or IP in binary form + key TEXT PRIMARY KEY, -- Contains the reversed domain name or IP in binary form source TEXT, -- The rule this one is based on type INTEGER, -- Type of the field: 1: AS, 2: domain tree, 3: domain, 4: IPv4 network, 6: IPv6 network updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes) firstparty INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) + -- refs INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) FOREIGN KEY (source) REFERENCES blocking(key) ON DELETE CASCADE ); CREATE INDEX "blocking_type_updated_key" ON "blocking" ( @@ -17,6 +18,6 @@ CREATE INDEX "blocking_type_updated_key" ON "blocking" ( -- Store various things CREATE TABLE meta ( - key text PRIMARY KEY, + key TEXT PRIMARY KEY, value integer ); diff --git a/feed_dns.py b/feed_dns.py index 47ea5d8..1cc3247 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -5,8 +5,8 @@ import argparse import sys FUNCTION_MAP = { - 'a': database.feed_a, - 'cname': database.feed_cname, + b'a': database.feed_a, + b'cname': database.feed_cname, } if __name__ == '__main__': @@ -15,7 +15,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser( description="TODO") parser.add_argument( - '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, + '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, help="TODO") args = parser.parse_args() @@ -23,9 +23,10 @@ if __name__ == '__main__': try: database.time_step('iowait') + line: bytes for line in args.input: database.time_step('feed_json_parse') - split = line.split('"') + split = line.split(b'"') name = split[7] dtype = split[11] value = split[15] From 1484733a90b1ba645c603405a75fbd8edbd1692c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Mon, 9 Dec 2019 18:21:08 +0100 Subject: [PATCH 03/40] Workflow: Small tweaks --- database.py | 93 ++++++++++++++++++++++++--------------------- database_schema.sql | 6 +-- 2 files changed, 52 insertions(+), 47 deletions(-) diff --git a/database.py b/database.py index bdb92b0..4fb5463 100755 --- a/database.py +++ b/database.py @@ -13,7 +13,9 @@ import ctypes Utility functions to interact with the database. """ -VERSION = 1 +# TODO Rule level and source priority + +VERSION = 2 PATH = f"blocking.db" CONN = None C = None # Cursor @@ -100,11 +102,11 @@ def refresh() -> None: RULE_SUBDOMAIN_COMMAND = \ - 'INSERT INTO blocking (key, type, updated, firstparty) ' \ - f'VALUES (?, {RowType.DomainTree.value}, 1, ?) ' \ + 'INSERT INTO blocking (key, type, updated, firstpart, level) ' \ + f'VALUES (?, {RowType.DomainTree.value}, 1, ?, 0) ' \ 'ON CONFLICT(key)' \ f'DO UPDATE SET source=null, type={RowType.DomainTree.value}, ' \ - 'updated=1, firstparty=?' + 'updated=1, firstparty=?, level=0' def feed_rule_subdomains(subdomain: str, first_party: bool = False) -> None: @@ -138,11 +140,11 @@ def ip4_flat(address: bytes) -> typing.Optional[str]: RULE_IP4NETWORK_COMMAND = \ - 'INSERT INTO blocking (key, type, updated, firstparty) ' \ - f'VALUES (?, {RowType.IPv4Network.value}, 1, ?) 
' \ + 'INSERT INTO blocking (key, type, updated, firstparty, level) ' \ + f'VALUES (?, {RowType.IPv4Network.value}, 1, ?, 0) ' \ 'ON CONFLICT(key)' \ f'DO UPDATE SET source=null, type={RowType.IPv4Network.value}, ' \ - 'updated=1, firstparty=?' + 'updated=1, firstparty=?, level=0' def feed_rule_ip4network(network: ipaddress.IPv4Network, @@ -156,10 +158,12 @@ def feed_rule_ip4network(network: ipaddress.IPv4Network, FEED_A_COMMAND_FETCH = \ 'SELECT key, firstparty FROM blocking ' \ 'WHERE key<=? ' \ - 'AND updated=1 ' \ + 'AND instr(?, key) > 0 ' \ f'AND type={RowType.IPv4Network.value} ' \ - 'ORDER BY key DESC ' \ - 'LIMIT 1' + 'ORDER BY key DESC ' + +# UPSERT are not issued often relative to FETCH, +# merging the both might be counterproductive FEED_A_COMMAND_UPSERT = \ 'INSERT INTO blocking (key, source, type, updated, firstparty) ' \ @@ -177,35 +181,39 @@ def feed_a(name: bytes, value_ip: bytes) -> None: value_dec = ip4_flat(value_ip) if value_dec is None: # Malformed IPs + time_step('a_malformed') return time_step('a_fetch') - C.execute(FEED_A_COMMAND_FETCH, (value_dec,)) + C.execute(FEED_A_COMMAND_FETCH, (value_dec, value_dec)) base = C.fetchone() time_step('a_fetch_confirm') - if not base: - return - b_key, b_firstparty = base - if not value_dec.startswith(b_key): - return name = name[::-1] - time_step('a_upsert') - C.execute(FEED_A_COMMAND_UPSERT, - (name, b_key, b_firstparty, # Insert - b_key, b_firstparty, b_firstparty) # Update - ) - time_step('other') + for b_key, b_firstparty in C: + time_step('a_upsert') + C.execute(FEED_A_COMMAND_UPSERT, + (name, b_key, b_firstparty, # Insert + b_key, b_firstparty, b_firstparty) # Update + ) + time_step('a_fetch_confirm') + time_step('a_end') FEED_CNAME_COMMAND_FETCH = \ 'SELECT key, type, firstparty FROM blocking ' \ 'WHERE key<=? ' \ f'AND (type={RowType.DomainTree.value} OR type={RowType.Domain.value}) ' \ - 'AND updated=1 ' \ 'ORDER BY key DESC ' \ 'LIMIT 1' +# Optimisations that renders the index unused +# (and thus counterproductive until fixed): + +# 'AND instr(?, key) > 0 ' \ + # f'WHERE ((type={RowType.DomainTree.value} AND key<=?) OR ' \ # f'(type={RowType.Domain.value} AND key=?)) ' \ -# This optimisation is counter productive + +# Might be fixable by using multiple SELECT and a JOIN +# In the meantime the confirm is very light so it's ok FEED_CNAME_COMMAND_UPSERT = \ 'INSERT INTO blocking (key, source, type, updated, firstparty) ' \ @@ -224,28 +232,25 @@ def feed_cname(name: bytes, value: bytes) -> None: value_dec = value.decode() time_step('cname_fetch') C.execute(FEED_CNAME_COMMAND_FETCH, (value_dec,)) - base = C.fetchone() time_step('cname_fetch_confirm') - if not base: - # Should only happen at an extremum of the database - return - b_key, b_type, b_firstparty = base - matching = b_key == value_dec[:len(b_key)] and ( - len(value_dec) == len(b_key) - or ( - b_type == RowType.DomainTree.value - and value_dec[len(b_key)] == '.' + for b_key, b_type, b_firstparty in C: + matching = b_key == value_dec[:len(b_key)] and ( + len(value_dec) == len(b_key) + or ( + b_type == RowType.DomainTree.value + and value_dec[len(b_key)] == '.' 
+ ) ) - ) - if not matching: - return - name = name[::-1] - time_step('cname_upsert') - C.execute(FEED_CNAME_COMMAND_UPSERT, - (name, b_key, b_firstparty, # Insert - b_key, b_firstparty, b_firstparty) # Update - ) - time_step('other') + if not matching: + continue + name = name[::-1] + time_step('cname_upsert') + C.execute(FEED_CNAME_COMMAND_UPSERT, + (name, b_key, b_firstparty, # Insert + b_key, b_firstparty, b_firstparty) # Update + ) + time_step('cname_fetch_confirm') + time_step('cname_end') if __name__ == '__main__': diff --git a/database_schema.sql b/database_schema.sql index 1985281..833338d 100644 --- a/database_schema.sql +++ b/database_schema.sql @@ -7,12 +7,12 @@ CREATE TABLE blocking ( type INTEGER, -- Type of the field: 1: AS, 2: domain tree, 3: domain, 4: IPv4 network, 6: IPv6 network updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes) firstparty INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) - -- refs INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) + refs INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) (used for -only lists) + level INTEGER, -- Level of recursion to the original rule (used for source priority) FOREIGN KEY (source) REFERENCES blocking(key) ON DELETE CASCADE ); -CREATE INDEX "blocking_type_updated_key" ON "blocking" ( +CREATE INDEX "blocking_type_key" ON "blocking" ( "type", - "updated", "key" DESC ); From 57416b6e2c31e39d7a4e0f4de8d043449889f371 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 13 Dec 2019 00:11:21 +0100 Subject: [PATCH 04/40] Workflow: POO and individual tables per types Mostly for performances reasons. First one to implement threading later. Second one to speed up the dichotomy, but it doesn't seem that much better so far. 
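
For illustration only (not part of this patch's code): the dichotomy mentioned
above relies on hostnames being stored reversed with a trailing dot, so that a
zone and every hostname under it share a common string prefix, which an ordered
index can answer with a single descending lookup. A minimal sketch of that idea
follows; prepare_hostname mirrors the method added in this patch, while
zone_matches is a hypothetical stand-in for the actual SQL lookup:

    def prepare_hostname(hostname: str) -> str:
        # "ads.example.com" -> "moc.elpmaxe.sda."
        return hostname[::-1] + '.'

    def zone_matches(zone: str, hostname: str) -> bool:
        # Hypothetical helper: once both strings are reversed,
        # "example.com" ("moc.elpmaxe.") is a prefix of every
        # hostname below it, and of nothing else.
        return prepare_hostname(hostname).startswith(prepare_hostname(zone))

    assert zone_matches('example.com', 'ads.example.com')
    assert not zone_matches('example.com', 'badexample.com')
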
--- Makefile | 5 - accel.c | 37 --- database.py | 598 +++++++++++++++++++++++++++---------------- database_schema.sql | 50 +++- export.py | 30 +++ feed_dns.py | 52 ++-- feed_rules.py | 26 +- filter_subdomains.sh | 45 +--- import_rules.sh | 14 + new_workflow.sh | 28 +- 10 files changed, 525 insertions(+), 360 deletions(-) delete mode 100644 Makefile delete mode 100644 accel.c create mode 100755 export.py create mode 100755 import_rules.sh diff --git a/Makefile b/Makefile deleted file mode 100644 index fb06f61..0000000 --- a/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -libaccel.so: accel.o - clang -shared -Wl,-soname,libaccel.so -o libaccel.so accel.o - -accel.o: accel.c - clang -c -fPIC -O3 accel.c -o accel.o diff --git a/accel.c b/accel.c deleted file mode 100644 index bda0072..0000000 --- a/accel.c +++ /dev/null @@ -1,37 +0,0 @@ -#include - -int ip4_flat(char* value, wchar_t* flat) -{ - unsigned char value_index = 0; - unsigned char octet_index = 0; - unsigned char octet_value = 0; - char flat_index; - unsigned char value_chara; - do { - value_chara = value[value_index]; - if (value_chara >= '0' && value_chara <= '9') { - octet_value *= 10; - octet_value += value_chara - '0'; - } else if (value_chara == '.') { - for (flat_index = (octet_index+1)*8-1; flat_index >= octet_index*8; flat_index--) { - flat[flat_index] = '0' + (octet_value & 1); - octet_value >>= 1; - } - octet_index++; - octet_value = 0; - } else if (value_chara == '\0') { - if (octet_index != 3) { - return 1; - } - for (flat_index = 31; flat_index >= 24; flat_index--) { - flat[flat_index] = '0' + (octet_value & 1); - octet_value >>= 1; - } - return 0; - } else { - return 1; - } - value_index++; - } while (1); // This ugly thing save one comparison - return 1; -} diff --git a/database.py b/database.py index 4fb5463..4daf0ec 100755 --- a/database.py +++ b/database.py @@ -1,256 +1,385 @@ #!/usr/bin/env python3 -import sqlite3 -import os -import argparse -import typing -import ipaddress -import enum -import time -import ctypes - """ Utility functions to interact with the database. 
""" -# TODO Rule level and source priority +import sqlite3 +import typing +import time +import os +import logging +import argparse +import coloredlogs +import ipaddress +import ctypes -VERSION = 2 -PATH = f"blocking.db" -CONN = None -C = None # Cursor -TIME_DICT: typing.Dict[str, float] = dict() -TIME_LAST = time.perf_counter() -TIME_STEP = 'start' -ACCEL = ctypes.cdll.LoadLibrary('./libaccel.so') -ACCEL_IP4_BUF = ctypes.create_unicode_buffer('Z'*32, 32) +coloredlogs.install( + level='DEBUG', + fmt='%(asctime)s %(name)s %(levelname)s %(message)s' +) + +DbValue = typing.Union[None, int, float, str, bytes] -def time_step(step: str) -> None: - global TIME_LAST - global TIME_STEP - now = time.perf_counter() - TIME_DICT.setdefault(TIME_STEP, 0.0) - TIME_DICT[TIME_STEP] += now - TIME_LAST - TIME_STEP = step - TIME_LAST = time.perf_counter() +class Database(): + VERSION = 3 + PATH = "blocking.db" + def open(self) -> None: + self.conn = sqlite3.connect(self.PATH) + self.cursor = self.conn.cursor() + self.execute("PRAGMA foreign_keys = ON") + # self.conn.create_function("prepare_ip4address", 1, + # Database.prepare_ip4address, + # deterministic=True) -def time_print() -> None: - time_step('postprint') - total = sum(TIME_DICT.values()) - for key, secs in sorted(TIME_DICT.items(), key=lambda t: t[1]): - print(f"{key:<20}: {secs/total:7.2%} = {secs:.6f} s") - print(f"{'total':<20}: {1:7.2%} = {total:.6f} s") + def execute(self, cmd: str, args: typing.Union[ + typing.Tuple[DbValue, ...], + typing.Dict[str, DbValue]] = None) -> None: + self.cursor.execute(cmd, args or tuple()) - -class RowType(enum.Enum): - AS = 1 - DomainTree = 2 - Domain = 3 - IPv4Network = 4 - IPv6Network = 6 - - -def open_db() -> None: - time_step('open_db') - global CONN - global C - CONN = sqlite3.connect(PATH) - C = CONN.cursor() - # C.execute("PRAGMA foreign_keys = ON"); - initialized = False - try: - C.execute("SELECT value FROM meta WHERE key='version'") - version_ex = C.fetchone() - if version_ex: - if version_ex[0] == VERSION: - initialized = True - else: - print(f"Database version {version_ex[0]} found," - "it will be deleted.") - except sqlite3.OperationalError: - pass - if not initialized: - time_step('init_db') - print(f"Creating database version {VERSION}.") - CONN.close() - os.unlink(PATH) - CONN = sqlite3.connect(PATH) - C = CONN.cursor() - with open("database_schema.sql", 'r') as db_schema: - C.executescript(db_schema.read()) - C.execute("INSERT INTO meta VALUES ('version', ?)", (VERSION,)) - CONN.commit() - time_step('other') - - -def close_db() -> None: - assert CONN - time_step('close_db_commit') - CONN.commit() - time_step('close_db') - CONN.close() - time_step('other') - time_print() - - -def refresh() -> None: - assert C - C.execute('UPDATE blocking SET updated = 0') - # TODO PERF Use a meta value instead - - -RULE_SUBDOMAIN_COMMAND = \ - 'INSERT INTO blocking (key, type, updated, firstpart, level) ' \ - f'VALUES (?, {RowType.DomainTree.value}, 1, ?, 0) ' \ - 'ON CONFLICT(key)' \ - f'DO UPDATE SET source=null, type={RowType.DomainTree.value}, ' \ - 'updated=1, firstparty=?, level=0' - - -def feed_rule_subdomains(subdomain: str, first_party: bool = False) -> None: - assert C - subdomain = subdomain[::-1] - C.execute(RULE_SUBDOMAIN_COMMAND, - (subdomain, int(first_party), int(first_party))) - # Since regex type takes precedence over domain type, - # and firstparty takes precedence over multiparty, - # we can afford to replace the whole row without checking - # the row without checking previous values and making 
sure - # firstparty subdomains are updated last - - -def ip_get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]: - for char in address.packed: - for i in range(7, -1, -1): - yield (char >> i) & 0b1 - - -def ip_flat(address: ipaddress.IPv4Address) -> str: - return ''.join(map(str, ip_get_bits(address))) - - -def ip4_flat(address: bytes) -> typing.Optional[str]: - carg = ctypes.c_char_p(address) - ret = ACCEL.ip4_flat(carg, ACCEL_IP4_BUF) - if ret != 0: + def get_meta(self, key: str) -> typing.Optional[int]: + try: + self.execute("SELECT value FROM meta WHERE key=?", (key,)) + except sqlite3.OperationalError: + return None + for ver, in self.cursor: + return ver return None - return ACCEL_IP4_BUF.value + def set_meta(self, key: str, val: int) -> None: + self.execute("INSERT INTO meta VALUES (?, ?) " + "ON CONFLICT (key) DO " + "UPDATE set value=?", + (key, val, val)) -RULE_IP4NETWORK_COMMAND = \ - 'INSERT INTO blocking (key, type, updated, firstparty, level) ' \ - f'VALUES (?, {RowType.IPv4Network.value}, 1, ?, 0) ' \ - 'ON CONFLICT(key)' \ - f'DO UPDATE SET source=null, type={RowType.IPv4Network.value}, ' \ - 'updated=1, firstparty=?, level=0' + def close(self) -> None: + self.enter_step('close_commit') + self.conn.commit() + self.enter_step('close') + self.conn.close() + self.profile() + def initialize(self) -> None: + self.enter_step('initialize') + self.close() + os.unlink(self.PATH) + self.open() + self.log.info("Creating database version %d.", self.VERSION) + with open("database_schema.sql", 'r') as db_schema: + self.cursor.executescript(db_schema.read()) + self.set_meta('version', self.VERSION) + self.conn.commit() -def feed_rule_ip4network(network: ipaddress.IPv4Network, - first_party: bool = False) -> None: - assert C - flat = ip_flat(network.network_address)[:network.prefixlen] - C.execute(RULE_IP4NETWORK_COMMAND, - (flat, int(first_party), int(first_party))) + def __init__(self) -> None: + self.log = logging.getLogger('db') + self.time_last = time.perf_counter() + self.time_step = 'init' + self.time_dict: typing.Dict[str, float] = dict() + self.step_dict: typing.Dict[str, int] = dict() + self.accel_ip4_buf = ctypes.create_unicode_buffer('Z'*32, 32) + self.open() + version = self.get_meta('version') + if version != self.VERSION: + if version is not None: + self.log.warning( + "Outdated database version: %d found, will be rebuilt.", + version) + self.initialize() -FEED_A_COMMAND_FETCH = \ - 'SELECT key, firstparty FROM blocking ' \ - 'WHERE key<=? ' \ - 'AND instr(?, key) > 0 ' \ - f'AND type={RowType.IPv4Network.value} ' \ - 'ORDER BY key DESC ' + updated = self.get_meta('updated') + if updated is None: + self.execute('SELECT max(updated) FROM rules') + data = self.cursor.fetchone() + updated, = data + self.updated = updated or 1 -# UPSERT are not issued often relative to FETCH, -# merging the both might be counterproductive + def enter_step(self, name: str) -> None: + now = time.perf_counter() + try: + self.time_dict[self.time_step] += now - self.time_last + self.step_dict[self.time_step] += 1 + except KeyError: + self.time_dict[self.time_step] = now - self.time_last + self.step_dict[self.time_step] = 1 + self.time_step = name + self.time_last = time.perf_counter() -FEED_A_COMMAND_UPSERT = \ - 'INSERT INTO blocking (key, source, type, updated, firstparty) ' \ - f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \ - 'ON CONFLICT(key)' \ - f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \ - 'updated=1, firstparty=? 
' \ - 'WHERE updated=0 OR firstparty None: + self.enter_step('profile') + total = sum(self.time_dict.values()) + for key, secs in sorted(self.time_dict.items(), key=lambda t: t[1]): + times = self.step_dict[key] + self.log.debug(f"{key:<20}: {times:9d} × {secs/times:5.3e} " + f"= {secs:9.2f} s ({secs/total:7.2%}) ") + self.log.debug(f"{'total':<20}: " + f"{total:9.2f} s ({1:7.2%})") + def prepare_hostname(self, hostname: str) -> str: + return hostname[::-1] + '.' -def feed_a(name: bytes, value_ip: bytes) -> None: - assert C - assert CONN - time_step('a_flat') - value_dec = ip4_flat(value_ip) - if value_dec is None: - # Malformed IPs - time_step('a_malformed') - return - time_step('a_fetch') - C.execute(FEED_A_COMMAND_FETCH, (value_dec, value_dec)) - base = C.fetchone() - time_step('a_fetch_confirm') - name = name[::-1] - for b_key, b_firstparty in C: - time_step('a_upsert') - C.execute(FEED_A_COMMAND_UPSERT, - (name, b_key, b_firstparty, # Insert - b_key, b_firstparty, b_firstparty) # Update - ) - time_step('a_fetch_confirm') - time_step('a_end') + def prepare_zone(self, zone: str) -> str: + return self.prepare_hostname(zone) + @staticmethod + def prepare_ip4address(address: str) -> int: + total = 0 + for i, octet in enumerate(address.split('.')): + total += int(octet) << (3-i)*8 + return total + # return '{:02x}{:02x}{:02x}{:02x}'.format( + # *[int(c) for c in address.split('.')]) + # return base64.b16encode(packed).decode() + # return '{:08b}{:08b}{:08b}{:08b}'.format( + # *[int(c) for c in address.split('.')]) + # carg = ctypes.c_wchar_p(address) + # ret = ACCEL.ip4_flat(carg, self.accel_ip4_buf) + # if ret != 0: + # raise ValueError + # return self.accel_ip4_buf.value + # packed = ipaddress.ip_address(address).packed + # return packed -FEED_CNAME_COMMAND_FETCH = \ - 'SELECT key, type, firstparty FROM blocking ' \ - 'WHERE key<=? ' \ - f'AND (type={RowType.DomainTree.value} OR type={RowType.Domain.value}) ' \ - 'ORDER BY key DESC ' \ - 'LIMIT 1' -# Optimisations that renders the index unused -# (and thus counterproductive until fixed): + def prepare_ip4network(self, network: str) -> typing.Tuple[int, int]: + # def prepare_ip4network(network: str) -> str: + net = ipaddress.ip_network(network) + mini = self.prepare_ip4address(net.network_address.exploded) + maxi = self.prepare_ip4address(net.broadcast_address.exploded) + # mini = net.network_address.packed + # maxi = net.broadcast_address.packed + return mini, maxi + # return Database.prepare_ip4address(net.network_address.exploded)[:net.prefixlen] -# 'AND instr(?, key) > 0 ' \ + def expire(self) -> None: + self.enter_step('expire') + self.updated += 1 + self.set_meta('updated', self.updated) -# f'WHERE ((type={RowType.DomainTree.value} AND key<=?) 
OR ' \ -# f'(type={RowType.Domain.value} AND key=?)) ' \ + def update_references(self) -> None: + self.enter_step('update_refs') + self.execute('UPDATE rules AS r SET refs=' + '(SELECT count(*) FROM rules ' + 'WHERE source=r.id)') -# Might be fixable by using multiple SELECT and a JOIN -# In the meantime the confirm is very light so it's ok + def prune(self) -> None: + self.enter_step('prune') + self.execute('DELETE FROM rules WHERE updated typing.Iterable[str]: + command = 'SELECT val FROM rules ' \ + 'INNER JOIN hostname ON rules.id = hostname.entry' + restrictions: typing.List[str] = list() + if first_party_only: + restrictions.append('rules.first_party = 1') + if end_chain_only: + restrictions.append('rules.refs = 0') + if restrictions: + command += ' WHERE ' + ' AND '.join(restrictions) + self.execute(command) + for val, in self.cursor: + yield val[:-1][::-1] - -def feed_cname(name: bytes, value: bytes) -> None: - assert C - assert CONN - time_step('cname_decode') - value = value[::-1] - value_dec = value.decode() - time_step('cname_fetch') - C.execute(FEED_CNAME_COMMAND_FETCH, (value_dec,)) - time_step('cname_fetch_confirm') - for b_key, b_type, b_firstparty in C: - matching = b_key == value_dec[:len(b_key)] and ( - len(value_dec) == len(b_key) - or ( - b_type == RowType.DomainTree.value - and value_dec[len(b_key)] == '.' - ) + def get_domain(self, domain: str) -> typing.Iterable[int]: + self.enter_step('get_domain_prepare') + domain_prep = self.prepare_hostname(domain) + self.enter_step('get_domain_select') + self.execute( + 'SELECT null, entry FROM hostname ' + 'WHERE val=:d ' + 'UNION ' + 'SELECT * FROM (' + 'SELECT val, entry FROM zone ' + 'WHERE val<=:d ' + 'ORDER BY val DESC LIMIT 1' + ')', + {'d': domain_prep} + ) + for val, entry in self.cursor: + self.enter_step('get_domain_confirm') + if not (val is None or domain_prep.startswith(val)): + continue + self.enter_step('get_domain_yield') + yield entry + + def get_ip4(self, address: str) -> typing.Iterable[int]: + self.enter_step('get_ip4_prepare') + try: + address_prep = self.prepare_ip4address(address) + except (ValueError, IndexError): + self.log.error("Invalid ip4address: %s", address) + return + self.enter_step('get_ip4_select') + self.execute( + 'SELECT entry FROM ip4address ' + # 'SELECT null, entry FROM ip4address ' + 'WHERE val=:a ' + 'UNION ' + # 'SELECT * FROM (' + # 'SELECT val, entry FROM ip4network ' + # 'WHERE val<=:a ' + # 'AND instr(:a, val) > 0 ' + # 'ORDER BY val DESC' + # ')' + 'SELECT entry FROM ip4network ' + 'WHERE :a BETWEEN mini AND maxi ', + {'a': address_prep} + ) + for val, entry in self.cursor: + # self.enter_step('get_ip4_confirm') + # if not (val is None or val.startswith(address_prep)): + # # PERF startswith but from the end + # continue + self.enter_step('get_ip4_yield') + yield entry + + def _set_generic(self, + table: str, + select_query: str, + insert_query: str, + prep: typing.Dict[str, DbValue], + is_first_party: bool = False, + source: int = None, + ) -> None: + # Since this isn't the bulk of the processing, + # here abstraction > performaces + + # Fields based on the source + if source is None: + first_party = int(is_first_party) + level = 0 + else: + self.enter_step(f'set_{table}_source') + self.execute( + 'SELECT first_party, level FROM rules ' + 'WHERE id=?', + (source,) + ) + first_party, level = self.cursor.fetchone() + level += 1 + + self.enter_step(f'set_{table}_select') + self.execute(select_query, prep) + + rules_prep = { + "source": source, + "updated": self.updated, + 
"first_party": first_party, + "level": level, + } + + # If the entry already exists + for entry, in self.cursor: # only one + self.enter_step(f'set_{table}_update') + rules_prep['entry'] = entry + self.execute( + 'UPDATE rules SET ' + 'source=:source, updated=:updated, ' + 'first_party=:first_party, level=:level ' + 'WHERE id=:entry AND (updated<:updated OR ' + 'first_party<:first_party OR level<:level)', + rules_prep + ) + # Only update if any of the following: + # - the entry is outdataed + # - the entry was not a first_party but this is + # - this is closer to the original rule + return + + # If it does not exist + + if source is not None: + self.enter_step(f'set_{table}_incsrc') + self.execute('UPDATE rules SET refs = refs + 1 WHERE id=?', + (source,)) + + self.enter_step(f'set_{table}_insert') + self.execute( + 'INSERT INTO rules ' + '(source, updated, first_party, refs, level) ' + 'VALUES (:source, :updated, :first_party, 0, :level) ', + rules_prep + ) + self.execute('SELECT id FROM rules WHERE rowid=?', + (self.cursor.lastrowid,)) + for entry, in self.cursor: # only one + prep['entry'] = entry + self.execute(insert_query, prep) + return + assert False + + def set_hostname(self, hostname: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_hostname_prepare') + prep: typing.Dict[str, DbValue] = { + 'val': self.prepare_hostname(hostname), + } + self._set_generic( + 'hostname', + 'SELECT entry FROM hostname WHERE val=:val', + 'INSERT INTO hostname (val, entry) ' + 'VALUES (:val, :entry)', + prep, + *args, **kwargs + ) + + def set_ip4address(self, ip4address: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_ip4add_prepare') + try: + ip4address_prep = self.prepare_ip4address(ip4address) + except (ValueError, IndexError): + self.log.error("Invalid ip4address: %s", ip4address) + return + prep: typing.Dict[str, DbValue] = { + 'val': ip4address_prep, + } + self._set_generic( + 'ip4add', + 'SELECT entry FROM ip4address WHERE val=:val', + 'INSERT INTO ip4address (val, entry) ' + 'VALUES (:val, :entry)', + prep, + *args, **kwargs + ) + + def set_zone(self, zone: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_zone_prepare') + prep: typing.Dict[str, DbValue] = { + 'val': self.prepare_zone(zone), + } + self._set_generic( + 'zone', + 'SELECT entry FROM zone WHERE val=:val', + 'INSERT INTO zone (val, entry) ' + 'VALUES (:val, :entry)', + prep, + *args, **kwargs + ) + + def set_ip4network(self, ip4network: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_ip4net_prepare') + try: + ip4network_prep = self.prepare_ip4network(ip4network) + except (ValueError, IndexError): + self.log.error("Invalid ip4network: %s", ip4network) + return + prep: typing.Dict[str, DbValue] = { + 'mini': ip4network_prep[0], + 'maxi': ip4network_prep[1], + } + self._set_generic( + 'ip4net', + 'SELECT entry FROM ip4network WHERE mini=:mini AND maxi=:maxi', + 'INSERT INTO ip4network (mini, maxi, entry) ' + 'VALUES (:mini, :maxi, :entry)', + prep, + *args, **kwargs ) - if not matching: - continue - name = name[::-1] - time_step('cname_upsert') - C.execute(FEED_CNAME_COMMAND_UPSERT, - (name, b_key, b_firstparty, # Insert - b_key, b_firstparty, b_firstparty) # Update - ) - time_step('cname_fetch_confirm') - time_step('cname_end') if __name__ == '__main__': @@ -259,13 +388,28 @@ if __name__ == '__main__': parser = argparse.ArgumentParser( description="Database operations") parser.add_argument( - '-r', '--refresh', 
action='store_true', + '-i', '--initialize', action='store_true', + help="Reconstruct the whole database") + parser.add_argument( + '-p', '--prune', action='store_true', + help="Remove old entries from database") + parser.add_argument( + '-e', '--expire', action='store_true', help="Set the whole database as an old source") + parser.add_argument( + '-r', '--references', action='store_true', + help="Update the reference count") args = parser.parse_args() - open_db() + DB = Database() - if args.refresh: - refresh() + if args.initialize: + DB.initialize() + if args.prune: + DB.prune() + if args.expire: + DB.expire() + if args.references and not args.prune: + DB.update_references() - close_db() + DB.close() diff --git a/database_schema.sql b/database_schema.sql index 833338d..9be81b0 100644 --- a/database_schema.sql +++ b/database_schema.sql @@ -1,21 +1,49 @@ -- Remember to increment DB_VERSION -- in database.py on changes to this file -CREATE TABLE blocking ( - key TEXT PRIMARY KEY, -- Contains the reversed domain name or IP in binary form - source TEXT, -- The rule this one is based on - type INTEGER, -- Type of the field: 1: AS, 2: domain tree, 3: domain, 4: IPv4 network, 6: IPv6 network +CREATE TABLE rules ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source INTEGER, -- The rule this one is based on updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes) - firstparty INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) - refs INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) (used for -only lists) - level INTEGER, -- Level of recursion to the original rule (used for source priority) - FOREIGN KEY (source) REFERENCES blocking(key) ON DELETE CASCADE + first_party INTEGER, -- 1: this blocks a first party for sure, 0: maybe + refs INTEGER, -- Number of entries issued from this one + level INTEGER, -- Level of recursion to the root source rule (used for source priority) + FOREIGN KEY (source) REFERENCES rules(id) ON DELETE CASCADE ); -CREATE INDEX "blocking_type_key" ON "blocking" ( - "type", - "key" DESC + +CREATE TABLE asn ( + val INTEGER PRIMARY KEY, + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE ); +CREATE TABLE hostname ( + val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for consistency with zone) + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE +); + +CREATE TABLE zone ( + val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for easier matching) + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE +); + +CREATE TABLE ip4address ( + val INTEGER PRIMARY KEY, + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE +); + +CREATE TABLE ip4network ( + -- val TEXT PRIMARY KEY, + mini INTEGER, + maxi INTEGER, + entry INTEGER, + FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE +); +CREATE INDEX ip4network_minmax ON ip4network (mini, maxi); + -- Store various things CREATE TABLE meta ( key TEXT PRIMARY KEY, diff --git a/export.py b/export.py new file mode 100755 index 0000000..58b276b --- /dev/null +++ b/export.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import database +import argparse +import sys + + +if __name__ == '__main__': + + # Parsing arguments + parser = argparse.ArgumentParser( + description="TODO") + parser.add_argument( + '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, + help="TODO") + parser.add_argument( + '-f', '--first-party', action='store_true', + 
help="TODO") + parser.add_argument( + '-e', '--end-chain', action='store_true', + help="TODO") + args = parser.parse_args() + + DB = database.Database() + + for domain in DB.export(first_party_only=args.first_party, + end_chain_only=args.end_chain): + print(domain, file=args.output) + + DB.close() diff --git a/feed_dns.py b/feed_dns.py index 1cc3247..f46d97b 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -3,42 +3,56 @@ import database import argparse import sys - -FUNCTION_MAP = { - b'a': database.feed_a, - b'cname': database.feed_cname, -} +import logging if __name__ == '__main__': # Parsing arguments + log = logging.getLogger('feed_dns') parser = argparse.ArgumentParser( description="TODO") parser.add_argument( - '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, + # '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, help="TODO") args = parser.parse_args() - database.open_db() + DB = database.Database() try: - database.time_step('iowait') - line: bytes + DB.enter_step('iowait') + # line: bytes + line: str for line in args.input: - database.time_step('feed_json_parse') - split = line.split(b'"') - name = split[7] - dtype = split[11] - value = split[15] + DB.enter_step('feed_json_parse') + # split = line.split(b'"') + split = line.split('"') + try: + name = split[7] + dtype = split[11] + value = split[15] + except IndexError: + log.error("Invalid JSON: %s", line) + continue + # DB.enter_step('feed_json_assert') # data = json.loads(line) # assert dtype == data['type'] # assert name == data['name'] # assert value == data['value'] - database.time_step('feed_switch') - FUNCTION_MAP[dtype](name, value) - database.time_step('iowait') + + DB.enter_step('feed_switch') + if dtype == 'a': + for rule in DB.get_ip4(value): + DB.set_hostname(name, source=rule) + elif dtype == 'cname': + for rule in DB.get_domain(value): + DB.set_hostname(name, source=rule) + elif dtype == 'ptr': + for rule in DB.get_domain(value): + DB.set_ip4address(name, source=rule) + DB.enter_step('iowait') except KeyboardInterrupt: - print("Interupted.") + log.warning("Interupted.") pass - database.close_db() + DB.close() diff --git a/feed_rules.py b/feed_rules.py index d32b360..7a19614 100755 --- a/feed_rules.py +++ b/feed_rules.py @@ -13,7 +13,7 @@ if __name__ == '__main__': description="TODO") parser.add_argument( 'type', - choices={'subdomains', 'ip4network'}, + choices={'zone', 'ip4network'}, help="Type of rule inputed") parser.add_argument( '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, @@ -23,18 +23,16 @@ if __name__ == '__main__': help="The input only comes from verified first-party sources") args = parser.parse_args() - database.open_db() + DB = database.Database() - if args.type == 'subdomains': - for rule in args.input: - database.feed_rule_subdomains( - rule.strip(), first_party=args.first_party) - elif args.type == 'ip4network': - for rule in args.input: - network = ipaddress.ip_network(rule.strip()) - database.feed_rule_ip4network( - network, first_party=args.first_party) - else: - assert False + FUNCTION_MAP = { + 'zone': DB.set_zone, + 'ip4network': DB.set_ip4network, + } - database.close_db() + fun = FUNCTION_MAP[args.type] + + for rule in args.input: + fun(rule.strip(), is_first_party=args.first_party) + + DB.close() diff --git a/filter_subdomains.sh b/filter_subdomains.sh index 9a09b9a..98638a9 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -4,37 +4,14 @@ function log() 
{ echo -e "\033[33m$@\033[0m" } -if [ ! -f temp/all_resolved.csv ] -then - echo "Run ./resolve_subdomains.sh first!" - exit 1 -fi +log "Updating references…" +./database.py --references -# Gather all the rules for filtering -log "Compiling rules…" -cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt -./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list -cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list -cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_multi.list -cat rules/first-party.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_first.list -cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_multi.txt -cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_first.txt - -log "Filtering first-party tracking domains…" -./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list -sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt - -log "Filtering first-party curated tracking domains…" -./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list -sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt - -log "Filtering multi-party tracking domains…" -./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list -sort -u temp/multiparty-trackers.list > dist/multiparty-trackers.txt - -log "Filtering multi-party curated tracking domains…" -./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list -sort -u temp/multiparty-only-trackers.list > dist/multiparty-only-trackers.txt +log "Exporting lists…" +./export.py --first-party | sort -u > dist/firstparty-trackers.txt +./export.py --first-party --end-chain | sort -u > dist/firstparty-only-trackers.txt +./export.py | sort -u > dist/multiparty-trackers.txt +./export.py --end-chain | sort -u > dist/multiparty-only-trackers.txt # Format the blocklist so it can be used as a hostlist function generate_hosts { @@ -61,14 +38,14 @@ function generate_hosts { echo "#" echo "# Generation date: $(date -Isec)" echo "# Generation software: eulaurarien $(git describe --tags)" - echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" - echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" + echo "# Number of source websites: TODO" + echo "# Number of source subdomains: TODO" echo "#" - echo "# Number of known first-party trackers: $(wc -l temp/all_rules_first.list | cut -d' ' -f1)" + echo "# Number of known first-party trackers: TODO" echo "# Number of first-party subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" echo "#" - echo "# Number of known multi-party trackers: $(wc -l temp/all_rules_multi.list | cut -d' ' -f1)" + echo "# Number of known multi-party trackers: TODO" echo "# Number of multi-party subdomains: $(wc -l 
dist/multiparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" echo diff --git a/import_rules.sh b/import_rules.sh new file mode 100755 index 0000000..d4d4719 --- /dev/null +++ b/import_rules.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +function log() { + echo -e "\033[33m$@\033[0m" +} + +log "Importing rules…" +cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone +cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone +cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone +cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network +cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party +cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party + diff --git a/new_workflow.sh b/new_workflow.sh index 23ae589..bc2a78b 100755 --- a/new_workflow.sh +++ b/new_workflow.sh @@ -5,18 +5,20 @@ function log() { } log "Preparing database…" -./database.py --refresh +./database.py --expire -log "Compiling rules…" -cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py subdomains -cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py subdomains -cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains -cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network -# NOTE: Ensure first-party sources are last -cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains --first-party -cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party +./import_rules.sh + +# TODO Fetch 'em +log "Reading PTR records…" +pv ptr.json.gz | gunzip | ./feed_dns.py +log "Reading A records…" +pv a.json.gz | gunzip | ./feed_dns.py +log "Reading CNAME records…" +pv cname.json.gz | gunzip | ./feed_dns.py + +log "Pruning old data…" +./database.py --prune + +./filter_subdomains.sh -# log "Reading A records…" -# pv a.json.gz | gunzip | ./feed_dns.py -# log "Reading CNAME records…" -# pv cname.json.gz | gunzip | ./feed_dns.py From e19f6663312abd1337c722e5274d3409f617793c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 13 Dec 2019 08:23:38 +0100 Subject: [PATCH 05/40] Workflow: Automatically import IP ranges from ASN Closes #9 --- database.py | 43 +++++++++++++++++++++++++++++++-- feed_rules.py | 15 ++++++------ filter_subdomains.sh | 19 ++++++--------- import_rules.sh | 5 ++++ rules_asn/.gitignore | 2 ++ rules_asn/first-party.txt | 10 ++++++++ rules_ip/first-party.txt | 51 --------------------------------------- 7 files changed, 72 insertions(+), 73 deletions(-) create mode 100644 rules_asn/.gitignore create mode 100644 rules_asn/first-party.txt diff --git a/database.py b/database.py index 4daf0ec..aa38604 100755 --- a/database.py +++ b/database.py @@ -33,6 +33,9 @@ class Database(): # self.conn.create_function("prepare_ip4address", 1, # Database.prepare_ip4address, # deterministic=True) + self.conn.create_function("unpack_domain", 1, + lambda s: s[:-1][::-1], + deterministic=True) def execute(self, cmd: str, args: typing.Union[ typing.Tuple[DbValue, ...], @@ -123,6 +126,13 @@ class Database(): def prepare_zone(self, zone: str) -> str: return self.prepare_hostname(zone) + @staticmethod + def 
prepare_asn(asn: str) -> int: + asn = asn.upper() + if asn.startswith('AS'): + asn = asn[2:] + return int(asn) + @staticmethod def prepare_ip4address(address: str) -> int: total = 0 @@ -169,7 +179,7 @@ class Database(): def export(self, first_party_only: bool = False, end_chain_only: bool = False) -> typing.Iterable[str]: - command = 'SELECT val FROM rules ' \ + command = 'SELECT unpack_domain(val) FROM rules ' \ 'INNER JOIN hostname ON rules.id = hostname.entry' restrictions: typing.List[str] = list() if first_party_only: @@ -178,9 +188,10 @@ class Database(): restrictions.append('rules.refs = 0') if restrictions: command += ' WHERE ' + ' AND '.join(restrictions) + command += ' ORDER BY unpack_domain(val) ASC' self.execute(command) for val, in self.cursor: - yield val[:-1][::-1] + yield val def get_domain(self, domain: str) -> typing.Iterable[int]: self.enter_step('get_domain_prepare') @@ -235,6 +246,13 @@ class Database(): self.enter_step('get_ip4_yield') yield entry + def list_asn(self) -> typing.Iterable[typing.Tuple[str, int]]: + self.enter_step('list_asn_select') + self.enter_step('get_domain_select') + self.execute('SELECT val, entry FROM asn') + for val, entry in self.cursor: + yield f'AS{val}', entry + def _set_generic(self, table: str, select_query: str, @@ -325,8 +343,29 @@ class Database(): *args, **kwargs ) + def set_asn(self, asn: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_asn_prepare') + try: + asn_prep = self.prepare_asn(asn) + except ValueError: + self.log.error("Invalid asn: %s", asn) + return + prep: typing.Dict[str, DbValue] = { + 'val': asn_prep, + } + self._set_generic( + 'asn', + 'SELECT entry FROM asn WHERE val=:val', + 'INSERT INTO asn (val, entry) ' + 'VALUES (:val, :entry)', + prep, + *args, **kwargs + ) + def set_ip4address(self, ip4address: str, *args: typing.Any, **kwargs: typing.Any) -> None: + # TODO Do not add if already in ip4network self.enter_step('set_ip4add_prepare') try: ip4address_prep = self.prepare_ip4address(ip4address) diff --git a/feed_rules.py b/feed_rules.py index 7a19614..a1d236d 100755 --- a/feed_rules.py +++ b/feed_rules.py @@ -3,8 +3,12 @@ import database import argparse import sys -import ipaddress +FUNCTION_MAP = { + 'zone': database.Database.set_zone, + 'ip4network': database.Database.set_ip4network, + 'asn': database.Database.set_asn, +} if __name__ == '__main__': @@ -13,7 +17,7 @@ if __name__ == '__main__': description="TODO") parser.add_argument( 'type', - choices={'zone', 'ip4network'}, + choices=FUNCTION_MAP.keys(), help="Type of rule inputed") parser.add_argument( '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, @@ -25,14 +29,9 @@ if __name__ == '__main__': DB = database.Database() - FUNCTION_MAP = { - 'zone': DB.set_zone, - 'ip4network': DB.set_ip4network, - } - fun = FUNCTION_MAP[args.type] for rule in args.input: - fun(rule.strip(), is_first_party=args.first_party) + fun(DB, rule.strip(), is_first_party=args.first_party) DB.close() diff --git a/filter_subdomains.sh b/filter_subdomains.sh index 98638a9..67783e8 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -4,16 +4,13 @@ function log() { echo -e "\033[33m$@\033[0m" } -log "Updating references…" -./database.py --references - log "Exporting lists…" -./export.py --first-party | sort -u > dist/firstparty-trackers.txt -./export.py --first-party --end-chain | sort -u > dist/firstparty-only-trackers.txt -./export.py | sort -u > dist/multiparty-trackers.txt -./export.py --end-chain | sort -u > 
dist/multiparty-only-trackers.txt +./export.py --first-party --output dist/firstparty-trackers.txt +./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt +./export.py --output dist/multiparty-trackers.txt +./export.py --end-chain --output dist/multiparty-only-trackers.txt -# Format the blocklist so it can be used as a hostlist +log "Generating hosts lists…" function generate_hosts { basename="$1" description="$2" @@ -35,6 +32,7 @@ function generate_hosts { echo "# - … excluding redirected: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt" echo "# - First and third party : https://hostfiles.frogeye.fr/multiparty-trackers-hosts.txt" echo "# - … excluding redirected: https://hostfiles.frogeye.fr/multiparty-only-trackers-hosts.txt" + echo '# (you can remove `-hosts` to get the raw list)' echo "#" echo "# Generation date: $(date -Isec)" echo "# Generation software: eulaurarien $(git describe --tags)" @@ -49,10 +47,7 @@ function generate_hosts { echo "# Number of multi-party subdomains: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" echo - cat "dist/$basename.txt" | while read host; - do - echo "0.0.0.0 $host" - done + sed 's|^|0.0.0.0 |' "dist/$basename.txt" ) > "dist/$basename-hosts.txt" } diff --git a/import_rules.sh b/import_rules.sh index d4d4719..358155c 100755 --- a/import_rules.sh +++ b/import_rules.sh @@ -9,6 +9,11 @@ cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_dom cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network +cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn + cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party +cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn --first-party + +./feed_asn.py diff --git a/rules_asn/.gitignore b/rules_asn/.gitignore new file mode 100644 index 0000000..d2df6a8 --- /dev/null +++ b/rules_asn/.gitignore @@ -0,0 +1,2 @@ +*.custom.txt +*.cache.txt diff --git a/rules_asn/first-party.txt b/rules_asn/first-party.txt new file mode 100644 index 0000000..e7b93fa --- /dev/null +++ b/rules_asn/first-party.txt @@ -0,0 +1,10 @@ +# Eulerian +AS50234 +# Criteo +AS44788 +AS19750 +AS55569 +# ThreatMetrix +AS30286 +# Webtrekk +AS60164 diff --git a/rules_ip/first-party.txt b/rules_ip/first-party.txt index 3561894..e69de29 100644 --- a/rules_ip/first-party.txt +++ b/rules_ip/first-party.txt @@ -1,51 +0,0 @@ -# Eulerian (AS50234 EULERIAN TECHNOLOGIES S.A.S.) -109.232.192.0/21 -# Criteo (AS44788 Criteo SA) -91.199.242.0/24 -91.212.98.0/24 -178.250.0.0/21 -178.250.0.0/24 -178.250.1.0/24 -178.250.2.0/24 -178.250.3.0/24 -178.250.4.0/24 -178.250.6.0/24 -185.235.84.0/24 -# Criteo (AS19750 Criteo Corp.) 
-74.119.116.0/22 -74.119.117.0/24 -74.119.118.0/24 -74.119.119.0/24 -91.199.242.0/24 -185.235.85.0/24 -199.204.168.0/22 -199.204.168.0/24 -199.204.169.0/24 -199.204.170.0/24 -199.204.171.0/24 -178.250.0.0/21 -91.212.98.0/24 -91.199.242.0/24 -185.235.84.0/24 -# Criteo (AS55569 Criteo APAC) -91.199.242.0/24 -116.213.20.0/22 -116.213.20.0/24 -116.213.21.0/24 -182.161.72.0/22 -182.161.72.0/24 -182.161.73.0/24 -185.235.86.0/24 -185.235.87.0/24 -# ThreatMetrix (AS30286 ThreatMetrix Inc.) -69.84.176.0/24 -173.254.179.0/24 -185.32.240.0/23 -185.32.242.0/23 -192.225.156.0/22 -199.101.156.0/23 -199.101.158.0/23 -# Webtrekk (AS60164 Webtrekk GmbH) -185.54.148.0/22 -185.54.150.0/24 -185.54.151.0/24 From 9050a84670a8b06b62fdd0e54c1f5b6a39541a0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 13 Dec 2019 12:35:05 +0100 Subject: [PATCH 06/40] Read-only mode --- database.py | 18 ++++++++++++------ feed_dns.py | 2 +- feed_rules.py | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/database.py b/database.py index aa38604..d336eba 100755 --- a/database.py +++ b/database.py @@ -12,7 +12,6 @@ import logging import argparse import coloredlogs import ipaddress -import ctypes coloredlogs.install( level='DEBUG', @@ -27,7 +26,9 @@ class Database(): PATH = "blocking.db" def open(self) -> None: - self.conn = sqlite3.connect(self.PATH) + mode = 'rwc' if self.write else 'ro' + uri = f'file:{self.PATH}?mode={mode}' + self.conn = sqlite3.connect(uri, uri=True) self.cursor = self.conn.cursor() self.execute("PRAGMA foreign_keys = ON") # self.conn.create_function("prepare_ip4address", 1, @@ -40,6 +41,8 @@ class Database(): def execute(self, cmd: str, args: typing.Union[ typing.Tuple[DbValue, ...], typing.Dict[str, DbValue]] = None) -> None: + # self.log.debug(cmd) + # self.log.debug(args) self.cursor.execute(cmd, args or tuple()) def get_meta(self, key: str) -> typing.Optional[int]: @@ -65,8 +68,11 @@ class Database(): self.profile() def initialize(self) -> None: - self.enter_step('initialize') self.close() + self.enter_step('initialize') + if not self.write: + self.log.error("Cannot initialize in read-only mode.") + raise os.unlink(self.PATH) self.open() self.log.info("Creating database version %d.", self.VERSION) @@ -75,13 +81,13 @@ class Database(): self.set_meta('version', self.VERSION) self.conn.commit() - def __init__(self) -> None: + def __init__(self, write: bool = False) -> None: self.log = logging.getLogger('db') self.time_last = time.perf_counter() self.time_step = 'init' self.time_dict: typing.Dict[str, float] = dict() self.step_dict: typing.Dict[str, int] = dict() - self.accel_ip4_buf = ctypes.create_unicode_buffer('Z'*32, 32) + self.write = write self.open() version = self.get_meta('version') @@ -440,7 +446,7 @@ if __name__ == '__main__': help="Update the reference count") args = parser.parse_args() - DB = Database() + DB = Database(write=True) if args.initialize: DB.initialize() diff --git a/feed_dns.py b/feed_dns.py index f46d97b..2993d6d 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -17,7 +17,7 @@ if __name__ == '__main__': help="TODO") args = parser.parse_args() - DB = database.Database() + DB = database.Database(write=True) try: DB.enter_step('iowait') diff --git a/feed_rules.py b/feed_rules.py index a1d236d..72888f5 100755 --- a/feed_rules.py +++ b/feed_rules.py @@ -27,7 +27,7 @@ if __name__ == '__main__': help="The input only comes from verified first-party sources") args = parser.parse_args() - DB = database.Database() + DB = 
database.Database(write=True) fun = FUNCTION_MAP[args.type] From 231bb83667710a2c110f60e9dd48028d4743de3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 13 Dec 2019 12:36:11 +0100 Subject: [PATCH 07/40] Threaded feed_dns Largely disapointing --- feed_dns.py | 117 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 90 insertions(+), 27 deletions(-) mode change 100755 => 100644 feed_dns.py diff --git a/feed_dns.py b/feed_dns.py old mode 100755 new mode 100644 index 2993d6d..fed322d --- a/feed_dns.py +++ b/feed_dns.py @@ -4,27 +4,31 @@ import database import argparse import sys import logging +import threading +import queue +import typing -if __name__ == '__main__': +NUMBER_THREADS = 8 - # Parsing arguments - log = logging.getLogger('feed_dns') - parser = argparse.ArgumentParser( - description="TODO") - parser.add_argument( - # '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, - '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, - help="TODO") - args = parser.parse_args() - DB = database.Database(write=True) +class Worker(threading.Thread): + def __init__(self, + lines_queue: queue.Queue, + write_queue: queue.Queue, + index: int = 0): + super(Worker, self).__init__() + self.log = logging.getLogger(f'worker{index:03d}') + self.lines_queue = lines_queue + self.write_queue = write_queue + self.index = index - try: - DB.enter_step('iowait') - # line: bytes + def run(self) -> None: + self.db = database.Database(write=False) + self.db.log = logging.getLogger(f'db{self.index:03d}') + self.db.enter_step('wait_line') line: str - for line in args.input: - DB.enter_step('feed_json_parse') + for line in iter(self.lines_queue.get, None): + self.db.enter_step('feed_json_parse') # split = line.split(b'"') split = line.split('"') try: @@ -40,19 +44,78 @@ if __name__ == '__main__': # assert name == data['name'] # assert value == data['value'] - DB.enter_step('feed_switch') + self.db.enter_step('feed_switch') if dtype == 'a': - for rule in DB.get_ip4(value): - DB.set_hostname(name, source=rule) + for rule in self.db.get_ip4(value): + self.db.enter_step('wait_put') + self.write_queue.put( + (database.Database.set_hostname, name, rule)) elif dtype == 'cname': - for rule in DB.get_domain(value): - DB.set_hostname(name, source=rule) + for rule in self.db.get_domain(value): + self.db.enter_step('wait_put') + self.write_queue.put( + (database.Database.set_hostname, name, rule)) elif dtype == 'ptr': - for rule in DB.get_domain(value): - DB.set_ip4address(name, source=rule) - DB.enter_step('iowait') - except KeyboardInterrupt: - log.warning("Interupted.") - pass + for rule in self.db.get_domain(value): + self.db.enter_step('wait_put') + self.write_queue.put( + (database.Database.set_ip4address, name, rule)) + self.db.enter_step('wait_line') + self.db.enter_step('end') + self.write_queue.put(None) + self.db.close() + + +if __name__ == '__main__': + + # Parsing arguments + log = logging.getLogger('feed_dns') + parser = argparse.ArgumentParser( + description="TODO") + parser.add_argument( + # '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, + help="TODO") + args = parser.parse_args() + + DB = database.Database(write=False) # Not needed, just for timing + DB.log = logging.getLogger('dbf') + DBW = database.Database(write=True) + DBW.log = logging.getLogger('dbw') + + lines_queue: queue.Queue = 
queue.Queue(maxsize=NUMBER_THREADS) + write_queue: queue.Queue = queue.Queue(maxsize=NUMBER_THREADS) + + def fill_lines_queue() -> None: + DB.enter_step('iowait') + for line in args.input: + DB.enter_step('wait_put') + lines_queue.put(line) + DB.enter_step('iowait') + + DB.enter_step('end_put') + for _ in range(NUMBER_THREADS): + lines_queue.put(None) + + for w in range(NUMBER_THREADS): + Worker(lines_queue, write_queue, w).start() + + threading.Thread(target=fill_lines_queue).start() + + for _ in range(NUMBER_THREADS): + fun: typing.Callable + name: str + source: int + DBW.enter_step('wait_fun') + for fun, name, source in iter(write_queue.get, None): + DBW.enter_step('exec_fun') + fun(DBW, name, source=source) + DBW.enter_step('commit') + DBW.conn.commit() + DBW.enter_step('wait_fun') + + DBW.enter_step('end') + + DBW.close() DB.close() From 8d94b80fd04540c544cf99a866c3c14c494db495 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 13 Dec 2019 13:38:23 +0100 Subject: [PATCH 08/40] Integrated DNS resolving to workflow Since the bigger datasets are only updated once a month, this might help for quick updates. --- resolve_subdomains.py | 81 ++++++++++++++++--------------------------- resolve_subdomains.sh | 10 +++--- 2 files changed, 34 insertions(+), 57 deletions(-) diff --git a/resolve_subdomains.py b/resolve_subdomains.py index ec10c47..b675b11 100755 --- a/resolve_subdomains.py +++ b/resolve_subdomains.py @@ -12,12 +12,11 @@ import queue import sys import threading import typing -import csv +import time import coloredlogs import dns.exception import dns.resolver -import progressbar DNS_TIMEOUT = 5.0 NUMBER_THREADS = 512 @@ -26,8 +25,6 @@ NUMBER_TRIES = 5 # TODO All the domains don't get treated, # so it leaves with 4-5 subdomains not resolved -glob = None - class Worker(threading.Thread): """ @@ -59,9 +56,9 @@ class Worker(threading.Thread): self.change_nameserver() def resolve_subdomain(self, subdomain: str) -> typing.Optional[ - typing.List[ - str - ] + typing.List[ + dns.rrset.RRset + ] ]: """ Returns the resolution chain of the subdomain to an A record, @@ -93,18 +90,7 @@ class Worker(threading.Thread): except dns.name.EmptyLabel: self.log.warning("Empty label for %s", subdomain) return None - resolved = list() - last = len(query.response.answer) - 1 - for a, answer in enumerate(query.response.answer): - if answer.rdtype == dns.rdatatype.CNAME: - assert a < last - resolved.append(answer.items[0].to_text()[:-1]) - elif answer.rdtype == dns.rdatatype.A: - assert a == last - resolved.append(answer.items[0].address) - else: - assert False - return resolved + return query.response.answer def run(self) -> None: self.log.info("Started") @@ -124,7 +110,6 @@ class Worker(threading.Thread): self.log.error("Gave up on %s", subdomain) resolved = [] - resolved.insert(0, subdomain) assert isinstance(resolved, list) self.orchestrator.results_queue.put(resolved) @@ -182,7 +167,23 @@ class Orchestrator(): for _ in range(NUMBER_THREADS): self.subdomains_queue.put(None) - def run(self) -> typing.Iterable[typing.List[str]]: + @staticmethod + def format_rrset(rrset: dns.rrset.RRset) -> typing.Iterable[str]: + if rrset.rdtype == dns.rdatatype.CNAME: + dtype = 'cname' + elif rrset.rdtype == dns.rdatatype.A: + dtype = 'a' + else: + raise NotImplementedError + name = rrset.name.to_text()[:-1] + for item in rrset.items: + value = item.to_text() + if rrset.rdtype == dns.rdatatype.CNAME: + value = value[:-1] + yield '{"timestamp":"' + str(int(time.time())) 
+ '","name":"' + \ + name + '","type":"' + dtype + '","value":"' + value + '"}\n' + + def run(self) -> typing.Iterable[str]: """ Yield the results. """ @@ -197,9 +198,10 @@ class Orchestrator(): # Wait for one sentinel per worker # In the meantime output results for _ in range(NUMBER_THREADS): - result: typing.List[str] - for result in iter(self.results_queue.get, None): - yield result + resolved: typing.List[dns.rrset.RRset] + for resolved in iter(self.results_queue.get, None): + for rrset in resolved: + yield from self.format_rrset(rrset) self.log.info("Waiting for reader thread") fill_thread.join() @@ -214,16 +216,14 @@ def main() -> None: the last CNAME resolved and the IP adress it resolves to. Takes as an input a filename (or nothing, for stdin), and as an output a filename (or nothing, for stdout). - The input must be a subdomain per line, the output is a comma-sep - file with the columns source CNAME and A. + The input must be a subdomain per line, the output is a TODO Use the file `nameservers` as the list of nameservers to use, or else it will use the system defaults. - Also shows a nice progressbar. """ # Initialization coloredlogs.install( - level='DEBUG', + # level='DEBUG', fmt='%(asctime)s %(name)s %(levelname)s %(message)s' ) @@ -244,20 +244,6 @@ def main() -> None: # help="Number of threads to use") args = parser.parse_args() - # Progress bar - widgets = [ - progressbar.Percentage(), - ' ', progressbar.SimpleProgress(), - ' ', progressbar.Bar(), - ' ', progressbar.Timer(), - ' ', progressbar.AdaptiveTransferSpeed(unit='req'), - ' ', progressbar.AdaptiveETA(), - ] - progress = progressbar.ProgressBar(widgets=widgets) - if args.input.seekable(): - progress.max_value = len(args.input.readlines()) - args.input.seek(0) - # Cleaning input iterator = iter(args.input) iterator = map(str.strip, iterator) @@ -269,15 +255,8 @@ def main() -> None: servers = open('nameservers').readlines() servers = list(filter(None, map(str.strip, servers))) - writer = csv.writer(args.output) - - progress.start() - global glob - glob = Orchestrator(iterator, servers) - for resolved in glob.run(): - progress.update(progress.value + 1) - writer.writerow(resolved) - progress.finish() + for resolved in Orchestrator(iterator, servers).run(): + args.output.write(resolved) if __name__ == '__main__': diff --git a/resolve_subdomains.sh b/resolve_subdomains.sh index ed7af79..ee5f83c 100755 --- a/resolve_subdomains.sh +++ b/resolve_subdomains.sh @@ -4,11 +4,9 @@ function log() { echo -e "\033[33m$@\033[0m" } -# Resolve the CNAME chain of all the known subdomains for later analysis -log "Compiling subdomain lists..." -pv subdomains/*.list | sort -u > temp/all_subdomains.list +log "Compiling locally known subdomain…" # Sort by last character to utilize the DNS server caching mechanism -pv temp/all_subdomains.list | rev | sort | rev > temp/all_subdomains_reversort.list -./resolve_subdomains.py --input temp/all_subdomains_reversort.list --output temp/all_resolved.csv -sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv +pv subdomains/*.list | rev | sort -u | rev > temp/all_subdomains.list +log "Resolving locally known subdomain…" +pv temp/all_subdomains.list | ./resolve_subdomains.py --output temp/all_resolved.json From f3eedcba2260108d1e854ef1b0daca93f93b2eb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 13 Dec 2019 13:54:00 +0100 Subject: [PATCH 09/40] Updated now based on timestamp Did I forget to add feed_asn.py a few commits ago? Oh well... 
--- database.py | 30 +++++++---------------------- feed_asn.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ feed_dns.py | 7 ++++--- feed_rules.py | 7 ++++++- 4 files changed, 69 insertions(+), 27 deletions(-) create mode 100755 feed_asn.py diff --git a/database.py b/database.py index d336eba..2a3e8df 100755 --- a/database.py +++ b/database.py @@ -98,13 +98,6 @@ class Database(): version) self.initialize() - updated = self.get_meta('updated') - if updated is None: - self.execute('SELECT max(updated) FROM rules') - data = self.cursor.fetchone() - updated, = data - self.updated = updated or 1 - def enter_step(self, name: str) -> None: now = time.perf_counter() try: @@ -168,20 +161,15 @@ class Database(): return mini, maxi # return Database.prepare_ip4address(net.network_address.exploded)[:net.prefixlen] - def expire(self) -> None: - self.enter_step('expire') - self.updated += 1 - self.set_meta('updated', self.updated) - def update_references(self) -> None: self.enter_step('update_refs') self.execute('UPDATE rules AS r SET refs=' '(SELECT count(*) FROM rules ' 'WHERE source=r.id)') - def prune(self) -> None: + def prune(self, before: int) -> None: self.enter_step('prune') - self.execute('DELETE FROM rules WHERE updated typing.Iterable[str]: @@ -264,6 +252,7 @@ class Database(): select_query: str, insert_query: str, prep: typing.Dict[str, DbValue], + updated: int, is_first_party: bool = False, source: int = None, ) -> None: @@ -287,9 +276,9 @@ class Database(): self.enter_step(f'set_{table}_select') self.execute(select_query, prep) - rules_prep = { + rules_prep: typing.Dict[str, DbValue] = { "source": source, - "updated": self.updated, + "updated": updated, "first_party": first_party, "level": level, } @@ -437,10 +426,7 @@ if __name__ == '__main__': help="Reconstruct the whole database") parser.add_argument( '-p', '--prune', action='store_true', - help="Remove old entries from database") - parser.add_argument( - '-e', '--expire', action='store_true', - help="Set the whole database as an old source") + help="Remove old (+6 months) entries from database") parser.add_argument( '-r', '--references', action='store_true', help="Update the reference count") @@ -451,9 +437,7 @@ if __name__ == '__main__': if args.initialize: DB.initialize() if args.prune: - DB.prune() - if args.expire: - DB.expire() + DB.prune(before=int(time.time()) - 60*60*24*31*6) if args.references and not args.prune: DB.update_references() diff --git a/feed_asn.py b/feed_asn.py new file mode 100755 index 0000000..a1343c0 --- /dev/null +++ b/feed_asn.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 + +import database +import argparse +import requests +import typing +import ipaddress +import logging +import time + +IPNetwork = typing.Union[ipaddress.IPv4Network, ipaddress.IPv6Network] + + +def get_ranges(asn: str) -> typing.Iterable[str]: + req = requests.get( + 'https://stat.ripe.net/data/as-routing-consistency/data.json', + params={'resource': asn} + ) + data = req.json() + for pref in data['data']['prefixes']: + yield pref['prefix'] + + +if __name__ == '__main__': + + log = logging.getLogger('feed_asn') + + # Parsing arguments + parser = argparse.ArgumentParser( + description="TODO") + args = parser.parse_args() + + DB = database.Database() + DBW = database.Database(write=True) + + for asn, entry in DB.list_asn(): + DB.enter_step('asn_get_ranges') + for prefix in get_ranges(asn): + parsed_prefix: IPNetwork = ipaddress.ip_network(prefix) + if parsed_prefix.version == 4: + DBW.set_ip4network( + prefix, + source=entry, + 
updated=int(time.time()) + ) + log.info('Added %s from %s (id=%s)', prefix, asn, entry) + elif parsed_prefix.version == 6: + log.warning('Unimplemented prefix version: %s', prefix) + else: + log.error('Unknown prefix version: %s', prefix) + + DB.close() diff --git a/feed_dns.py b/feed_dns.py index 2993d6d..e3cc02c 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -28,6 +28,7 @@ if __name__ == '__main__': # split = line.split(b'"') split = line.split('"') try: + updated = int(split[3]) name = split[7] dtype = split[11] value = split[15] @@ -43,13 +44,13 @@ if __name__ == '__main__': DB.enter_step('feed_switch') if dtype == 'a': for rule in DB.get_ip4(value): - DB.set_hostname(name, source=rule) + DB.set_hostname(name, source=rule, updated=updated) elif dtype == 'cname': for rule in DB.get_domain(value): - DB.set_hostname(name, source=rule) + DB.set_hostname(name, source=rule, updated=updated) elif dtype == 'ptr': for rule in DB.get_domain(value): - DB.set_ip4address(name, source=rule) + DB.set_ip4address(name, source=rule, updated=updated) DB.enter_step('iowait') except KeyboardInterrupt: log.warning("Interupted.") diff --git a/feed_rules.py b/feed_rules.py index 72888f5..715126e 100755 --- a/feed_rules.py +++ b/feed_rules.py @@ -3,6 +3,7 @@ import database import argparse import sys +import time FUNCTION_MAP = { 'zone': database.Database.set_zone, @@ -32,6 +33,10 @@ if __name__ == '__main__': fun = FUNCTION_MAP[args.type] for rule in args.input: - fun(DB, rule.strip(), is_first_party=args.first_party) + fun(DB, + rule.strip(), + is_first_party=args.first_party, + updated=int(time.time()), + ) DB.close() From ab7ef609dd5fd50386d0fc43099c6e1eb5b15446 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 13 Dec 2019 18:00:00 +0100 Subject: [PATCH 10/40] Workflow: Various optimisations and fixes I forgot to close this one earlier, so: Closes #7 --- database.py | 243 +++++++++++++++++++++++++++++------------- database_schema.sql | 6 ++ export.py | 10 +- feed_asn.py | 1 + feed_dns.py | 9 +- filter_subdomains.sh | 3 + regexes.py | 21 ---- resolve_subdomains.py | 38 +++---- 8 files changed, 214 insertions(+), 117 deletions(-) delete mode 100644 regexes.py diff --git a/database.py b/database.py index 2a3e8df..3327438 100755 --- a/database.py +++ b/database.py @@ -12,6 +12,7 @@ import logging import argparse import coloredlogs import ipaddress +import math coloredlogs.install( level='DEBUG', @@ -22,43 +23,47 @@ DbValue = typing.Union[None, int, float, str, bytes] class Database(): - VERSION = 3 + VERSION = 4 PATH = "blocking.db" def open(self) -> None: mode = 'rwc' if self.write else 'ro' uri = f'file:{self.PATH}?mode={mode}' self.conn = sqlite3.connect(uri, uri=True) - self.cursor = self.conn.cursor() - self.execute("PRAGMA foreign_keys = ON") - # self.conn.create_function("prepare_ip4address", 1, - # Database.prepare_ip4address, - # deterministic=True) + cursor = self.conn.cursor() + cursor.execute("PRAGMA foreign_keys = ON") + self.conn.create_function("unpack_asn", 1, + self.unpack_asn, + deterministic=True) + self.conn.create_function("unpack_ip4address", 1, + self.unpack_ip4address, + deterministic=True) + self.conn.create_function("unpack_ip4network", 2, + self.unpack_ip4network, + deterministic=True) self.conn.create_function("unpack_domain", 1, lambda s: s[:-1][::-1], deterministic=True) - - def execute(self, cmd: str, args: typing.Union[ - typing.Tuple[DbValue, ...], - typing.Dict[str, DbValue]] = None) -> None: - # self.log.debug(cmd) - # 
self.log.debug(args) - self.cursor.execute(cmd, args or tuple()) + self.conn.create_function("format_zone", 1, + lambda s: '*' + s[::-1], + deterministic=True) def get_meta(self, key: str) -> typing.Optional[int]: + cursor = self.conn.cursor() try: - self.execute("SELECT value FROM meta WHERE key=?", (key,)) + cursor.execute("SELECT value FROM meta WHERE key=?", (key,)) except sqlite3.OperationalError: return None - for ver, in self.cursor: + for ver, in cursor: return ver return None def set_meta(self, key: str, val: int) -> None: - self.execute("INSERT INTO meta VALUES (?, ?) " - "ON CONFLICT (key) DO " - "UPDATE set value=?", - (key, val, val)) + cursor = self.conn.cursor() + cursor.execute("INSERT INTO meta VALUES (?, ?) " + "ON CONFLICT (key) DO " + "UPDATE set value=?", + (key, val, val)) def close(self) -> None: self.enter_step('close_commit') @@ -76,8 +81,9 @@ class Database(): os.unlink(self.PATH) self.open() self.log.info("Creating database version %d.", self.VERSION) + cursor = self.conn.cursor() with open("database_schema.sql", 'r') as db_schema: - self.cursor.executescript(db_schema.read()) + cursor.executescript(db_schema.read()) self.set_meta('version', self.VERSION) self.conn.commit() @@ -119,21 +125,27 @@ class Database(): self.log.debug(f"{'total':<20}: " f"{total:9.2f} s ({1:7.2%})") - def prepare_hostname(self, hostname: str) -> str: + @staticmethod + def pack_hostname(hostname: str) -> str: return hostname[::-1] + '.' - def prepare_zone(self, zone: str) -> str: - return self.prepare_hostname(zone) + @staticmethod + def pack_zone(zone: str) -> str: + return Database.pack_hostname(zone) @staticmethod - def prepare_asn(asn: str) -> int: + def pack_asn(asn: str) -> int: asn = asn.upper() if asn.startswith('AS'): asn = asn[2:] return int(asn) @staticmethod - def prepare_ip4address(address: str) -> int: + def unpack_asn(asn: int) -> str: + return f'AS{asn}' + + @staticmethod + def pack_ip4address(address: str) -> int: total = 0 for i, octet in enumerate(address.split('.')): total += int(octet) << (3-i)*8 @@ -151,29 +163,75 @@ class Database(): # packed = ipaddress.ip_address(address).packed # return packed - def prepare_ip4network(self, network: str) -> typing.Tuple[int, int]: - # def prepare_ip4network(network: str) -> str: + @staticmethod + def unpack_ip4address(address: int) -> str: + return '.'.join(str((address >> (i * 8)) & 0xFF) + for i in reversed(range(4))) + + @staticmethod + def pack_ip4network(network: str) -> typing.Tuple[int, int]: + # def pack_ip4network(network: str) -> str: net = ipaddress.ip_network(network) - mini = self.prepare_ip4address(net.network_address.exploded) - maxi = self.prepare_ip4address(net.broadcast_address.exploded) + mini = Database.pack_ip4address(net.network_address.exploded) + maxi = Database.pack_ip4address(net.broadcast_address.exploded) # mini = net.network_address.packed # maxi = net.broadcast_address.packed return mini, maxi - # return Database.prepare_ip4address(net.network_address.exploded)[:net.prefixlen] + # return Database.pack_ip4address(net.network_address.exploded)[:net.prefixlen] + + @staticmethod + def unpack_ip4network(mini: int, maxi: int) -> str: + addr = Database.unpack_ip4address(mini) + prefixlen = 32-int(math.log2(maxi-mini+1)) + return f'{addr}/{prefixlen}' def update_references(self) -> None: self.enter_step('update_refs') - self.execute('UPDATE rules AS r SET refs=' - '(SELECT count(*) FROM rules ' - 'WHERE source=r.id)') + cursor = self.conn.cursor() + cursor.execute('UPDATE rules AS r SET refs=' + '(SELECT 
count(*) FROM rules ' + 'WHERE source=r.id)') def prune(self, before: int) -> None: self.enter_step('prune') - self.execute('DELETE FROM rules WHERE updated typing.Iterable[str]: - command = 'SELECT unpack_domain(val) FROM rules ' \ + def explain(self, entry: int) -> str: + # Format current + string = '???' + cursor = self.conn.cursor() + cursor.execute( + 'SELECT unpack_asn(val) FROM asn WHERE entry=:entry ' + 'UNION ' + 'SELECT unpack_domain(val) FROM hostname WHERE entry=:entry ' + 'UNION ' + 'SELECT format_zone(val) FROM zone WHERE entry=:entry ' + 'UNION ' + 'SELECT unpack_ip4address(val) FROM ip4address WHERE entry=:entry ' + 'UNION ' + 'SELECT unpack_ip4network(mini, maxi) ' + 'FROM ip4network WHERE entry=:entry ', + {"entry": entry} + ) + for val, in cursor: # only one + string = str(val) + string += f' #{entry}' + + # Add source if any + cursor.execute('SELECT source FROM rules WHERE id=?', (entry,)) + for source, in cursor: + if source: + string += f' ← {self.explain(source)}' + return string + + def export(self, + first_party_only: bool = False, + end_chain_only: bool = False, + explain: bool = False, + ) -> typing.Iterable[str]: + selection = 'entry' if explain else 'unpack_domain(val)' + command = f'SELECT {selection} FROM rules ' \ 'INNER JOIN hostname ON rules.id = hostname.entry' restrictions: typing.List[str] = list() if first_party_only: @@ -182,16 +240,22 @@ class Database(): restrictions.append('rules.refs = 0') if restrictions: command += ' WHERE ' + ' AND '.join(restrictions) - command += ' ORDER BY unpack_domain(val) ASC' - self.execute(command) - for val, in self.cursor: - yield val + if not explain: + command += ' ORDER BY unpack_domain(val) ASC' + cursor = self.conn.cursor() + cursor.execute(command) + for val, in cursor: + if explain: + yield self.explain(val) + else: + yield val def get_domain(self, domain: str) -> typing.Iterable[int]: self.enter_step('get_domain_prepare') - domain_prep = self.prepare_hostname(domain) + domain_prep = self.pack_hostname(domain) + cursor = self.conn.cursor() self.enter_step('get_domain_select') - self.execute( + cursor.execute( 'SELECT null, entry FROM hostname ' 'WHERE val=:d ' 'UNION ' @@ -202,22 +266,41 @@ class Database(): ')', {'d': domain_prep} ) - for val, entry in self.cursor: + for val, entry in cursor: self.enter_step('get_domain_confirm') if not (val is None or domain_prep.startswith(val)): continue self.enter_step('get_domain_yield') yield entry + def get_domain_in_zone(self, domain: str) -> typing.Iterable[int]: + self.enter_step('get_domainiz_prepare') + domain_prep = self.pack_hostname(domain) + cursor = self.conn.cursor() + self.enter_step('get_domainiz_select') + cursor.execute( + 'SELECT val, entry FROM zone ' + 'WHERE val<=:d ' + 'ORDER BY val DESC LIMIT 1', + {'d': domain_prep} + ) + for val, entry in cursor: + self.enter_step('get_domainiz_confirm') + if not (val is None or domain_prep.startswith(val)): + continue + self.enter_step('get_domainiz_yield') + yield entry + def get_ip4(self, address: str) -> typing.Iterable[int]: self.enter_step('get_ip4_prepare') try: - address_prep = self.prepare_ip4address(address) + address_prep = self.pack_ip4address(address) except (ValueError, IndexError): self.log.error("Invalid ip4address: %s", address) return + cursor = self.conn.cursor() self.enter_step('get_ip4_select') - self.execute( + cursor.execute( 'SELECT entry FROM ip4address ' # 'SELECT null, entry FROM ip4address ' 'WHERE val=:a ' @@ -232,7 +315,7 @@ class Database(): 'WHERE :a BETWEEN mini AND maxi ', {'a': 
address_prep} ) - for val, entry in self.cursor: + for entry, in cursor: # self.enter_step('get_ip4_confirm') # if not (val is None or val.startswith(address_prep)): # # PERF startswith but from the end @@ -240,11 +323,29 @@ class Database(): self.enter_step('get_ip4_yield') yield entry + def get_ip4_in_network(self, address: str) -> typing.Iterable[int]: + self.enter_step('get_ip4in_prepare') + try: + address_prep = self.pack_ip4address(address) + except (ValueError, IndexError): + self.log.error("Invalid ip4address: %s", address) + return + cursor = self.conn.cursor() + self.enter_step('get_ip4in_select') + cursor.execute( + 'SELECT entry FROM ip4network ' + 'WHERE :a BETWEEN mini AND maxi ', + {'a': address_prep} + ) + for entry, in cursor: + self.enter_step('get_ip4in_yield') + yield entry + def list_asn(self) -> typing.Iterable[typing.Tuple[str, int]]: + cursor = self.conn.cursor() self.enter_step('list_asn_select') - self.enter_step('get_domain_select') - self.execute('SELECT val, entry FROM asn') - for val, entry in self.cursor: + cursor.execute('SELECT val, entry FROM asn') + for val, entry in cursor: yield f'AS{val}', entry def _set_generic(self, @@ -260,21 +361,23 @@ class Database(): # here abstraction > performaces # Fields based on the source + self.enter_step(f'set_{table}_prepare') + cursor = self.conn.cursor() if source is None: first_party = int(is_first_party) level = 0 else: self.enter_step(f'set_{table}_source') - self.execute( + cursor.execute( 'SELECT first_party, level FROM rules ' 'WHERE id=?', (source,) ) - first_party, level = self.cursor.fetchone() + first_party, level = cursor.fetchone() level += 1 self.enter_step(f'set_{table}_select') - self.execute(select_query, prep) + cursor.execute(select_query, prep) rules_prep: typing.Dict[str, DbValue] = { "source": source, @@ -284,10 +387,10 @@ class Database(): } # If the entry already exists - for entry, in self.cursor: # only one + for entry, in cursor: # only one self.enter_step(f'set_{table}_update') rules_prep['entry'] = entry - self.execute( + cursor.execute( 'UPDATE rules SET ' 'source=:source, updated=:updated, ' 'first_party=:first_party, level=:level ' @@ -303,23 +406,18 @@ class Database(): # If it does not exist - if source is not None: - self.enter_step(f'set_{table}_incsrc') - self.execute('UPDATE rules SET refs = refs + 1 WHERE id=?', - (source,)) - self.enter_step(f'set_{table}_insert') - self.execute( + cursor.execute( 'INSERT INTO rules ' - '(source, updated, first_party, refs, level) ' - 'VALUES (:source, :updated, :first_party, 0, :level) ', + '(source, updated, first_party, level) ' + 'VALUES (:source, :updated, :first_party, :level) ', rules_prep ) - self.execute('SELECT id FROM rules WHERE rowid=?', - (self.cursor.lastrowid,)) - for entry, in self.cursor: # only one + cursor.execute('SELECT id FROM rules WHERE rowid=?', + (cursor.lastrowid,)) + for entry, in cursor: # only one prep['entry'] = entry - self.execute(insert_query, prep) + cursor.execute(insert_query, prep) return assert False @@ -327,7 +425,7 @@ class Database(): *args: typing.Any, **kwargs: typing.Any) -> None: self.enter_step('set_hostname_prepare') prep: typing.Dict[str, DbValue] = { - 'val': self.prepare_hostname(hostname), + 'val': self.pack_hostname(hostname), } self._set_generic( 'hostname', @@ -342,7 +440,7 @@ class Database(): *args: typing.Any, **kwargs: typing.Any) -> None: self.enter_step('set_asn_prepare') try: - asn_prep = self.prepare_asn(asn) + asn_prep = self.pack_asn(asn) except ValueError: self.log.error("Invalid 
asn: %s", asn) return @@ -360,10 +458,9 @@ class Database(): def set_ip4address(self, ip4address: str, *args: typing.Any, **kwargs: typing.Any) -> None: - # TODO Do not add if already in ip4network self.enter_step('set_ip4add_prepare') try: - ip4address_prep = self.prepare_ip4address(ip4address) + ip4address_prep = self.pack_ip4address(ip4address) except (ValueError, IndexError): self.log.error("Invalid ip4address: %s", ip4address) return @@ -383,7 +480,7 @@ class Database(): *args: typing.Any, **kwargs: typing.Any) -> None: self.enter_step('set_zone_prepare') prep: typing.Dict[str, DbValue] = { - 'val': self.prepare_zone(zone), + 'val': self.pack_zone(zone), } self._set_generic( 'zone', @@ -398,7 +495,7 @@ class Database(): *args: typing.Any, **kwargs: typing.Any) -> None: self.enter_step('set_ip4net_prepare') try: - ip4network_prep = self.prepare_ip4network(ip4network) + ip4network_prep = self.pack_ip4network(ip4network) except (ValueError, IndexError): self.log.error("Invalid ip4network: %s", ip4network) return diff --git a/database_schema.sql b/database_schema.sql index 9be81b0..a61f7f2 100644 --- a/database_schema.sql +++ b/database_schema.sql @@ -10,30 +10,35 @@ CREATE TABLE rules ( level INTEGER, -- Level of recursion to the root source rule (used for source priority) FOREIGN KEY (source) REFERENCES rules(id) ON DELETE CASCADE ); +CREATE INDEX rules_source ON rules (source); -- for references recounting CREATE TABLE asn ( val INTEGER PRIMARY KEY, entry INTEGER, FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE ); +CREATE INDEX asn_entry ON asn (entry); -- for explainations CREATE TABLE hostname ( val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for consistency with zone) entry INTEGER, FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE ); +CREATE INDEX hostname_entry ON hostname (entry); -- for explainations CREATE TABLE zone ( val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for easier matching) entry INTEGER, FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE ); +CREATE INDEX zone_entry ON zone (entry); -- for explainations CREATE TABLE ip4address ( val INTEGER PRIMARY KEY, entry INTEGER, FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE ); +CREATE INDEX ip4address_entry ON ip4address (entry); -- for explainations CREATE TABLE ip4network ( -- val TEXT PRIMARY KEY, @@ -43,6 +48,7 @@ CREATE TABLE ip4network ( FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE ); CREATE INDEX ip4network_minmax ON ip4network (mini, maxi); +CREATE INDEX ip4network_entry ON ip4network (entry); -- for explainations -- Store various things CREATE TABLE meta ( diff --git a/export.py b/export.py index 58b276b..49051c9 100755 --- a/export.py +++ b/export.py @@ -19,12 +19,18 @@ if __name__ == '__main__': parser.add_argument( '-e', '--end-chain', action='store_true', help="TODO") + parser.add_argument( + '-x', '--explain', action='store_true', + help="TODO") args = parser.parse_args() DB = database.Database() - for domain in DB.export(first_party_only=args.first_party, - end_chain_only=args.end_chain): + for domain in DB.export( + first_party_only=args.first_party, + end_chain_only=args.end_chain, + explain=args.explain, + ): print(domain, file=args.output) DB.close() diff --git a/feed_asn.py b/feed_asn.py index a1343c0..098f931 100755 --- a/feed_asn.py +++ b/feed_asn.py @@ -50,3 +50,4 @@ if __name__ == '__main__': log.error('Unknown prefix version: %s', prefix) DB.close() + DBW.close() diff --git a/feed_dns.py b/feed_dns.py index e3cc02c..f3fde5a 100755 --- 
a/feed_dns.py +++ b/feed_dns.py @@ -44,13 +44,16 @@ if __name__ == '__main__': DB.enter_step('feed_switch') if dtype == 'a': for rule in DB.get_ip4(value): - DB.set_hostname(name, source=rule, updated=updated) + if not list(DB.get_domain_in_zone(name)): + DB.set_hostname(name, source=rule, updated=updated) elif dtype == 'cname': for rule in DB.get_domain(value): - DB.set_hostname(name, source=rule, updated=updated) + if not list(DB.get_domain_in_zone(name)): + DB.set_hostname(name, source=rule, updated=updated) elif dtype == 'ptr': for rule in DB.get_domain(value): - DB.set_ip4address(name, source=rule, updated=updated) + if not list(DB.get_ip4_in_network(name)): + DB.set_ip4address(name, source=rule, updated=updated) DB.enter_step('iowait') except KeyboardInterrupt: log.warning("Interupted.") diff --git a/filter_subdomains.sh b/filter_subdomains.sh index 67783e8..516efae 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -4,6 +4,9 @@ function log() { echo -e "\033[33m$@\033[0m" } +log "Recounting references…" +./database.py --references + log "Exporting lists…" ./export.py --first-party --output dist/firstparty-trackers.txt ./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt diff --git a/regexes.py b/regexes.py deleted file mode 100644 index 0e48441..0000000 --- a/regexes.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python3 - -""" -List of regex matching first-party trackers. -""" - -# Syntax: https://docs.python.org/3/library/re.html#regular-expression-syntax - -REGEXES = [ - r'^.+\.eulerian\.net\.$', # Eulerian - r'^.+\.criteo\.com\.$', # Criteo - r'^.+\.dnsdelegation\.io\.$', # Criteo - r'^.+\.keyade\.com\.$', # Keyade - r'^.+\.omtrdc\.net\.$', # Adobe Experience Cloud - r'^.+\.bp01\.net\.$', # NP6 - r'^.+\.ati-host\.net\.$', # Xiti (AT Internet) - r'^.+\.at-o\.net\.$', # Xiti (AT Internet) - r'^.+\.edgkey\.net\.$', # Edgekey (Akamai) - r'^.+\.akaimaiedge\.net\.$', # Edgekey (Akamai) - r'^.+\.storetail\.io\.$', # Storetail (Criteo) -] diff --git a/resolve_subdomains.py b/resolve_subdomains.py index b675b11..fa2ea59 100755 --- a/resolve_subdomains.py +++ b/resolve_subdomains.py @@ -19,12 +19,8 @@ import dns.exception import dns.resolver DNS_TIMEOUT = 5.0 -NUMBER_THREADS = 512 NUMBER_TRIES = 5 -# TODO All the domains don't get treated, -# so it leaves with 4-5 subdomains not resolved - class Worker(threading.Thread): """ @@ -135,15 +131,17 @@ class Orchestrator(): def __init__(self, subdomains: typing.Iterable[str], nameservers: typing.List[str] = None, + nb_workers: int = 1, ): self.log = logging.getLogger('orchestrator') self.subdomains = subdomains + self.nb_workers = nb_workers # Use interal resolver by default self.nameservers = nameservers or dns.resolver.Resolver().nameservers self.subdomains_queue: queue.Queue = queue.Queue( - maxsize=NUMBER_THREADS) + maxsize=self.nb_workers) self.results_queue: queue.Queue = queue.Queue() self.nameservers_queue: queue.Queue = queue.Queue() @@ -164,7 +162,7 @@ class Orchestrator(): self.log.info("Finished reading subdomains") # Send sentinel to each worker # sentinel = None ~= EOF - for _ in range(NUMBER_THREADS): + for _ in range(self.nb_workers): self.subdomains_queue.put(None) @staticmethod @@ -189,7 +187,7 @@ class Orchestrator(): """ # Create workers self.log.info("Creating workers") - for i in range(NUMBER_THREADS): + for i in range(self.nb_workers): Worker(self, i).start() fill_thread = threading.Thread(target=self.fill_subdomain_queue) @@ -197,7 +195,7 @@ class Orchestrator(): # Wait for one 
sentinel per worker # In the meantime output results - for _ in range(NUMBER_THREADS): + for _ in range(self.nb_workers): resolved: typing.List[dns.rrset.RRset] for resolved in iter(self.results_queue.get, None): for rrset in resolved: @@ -223,7 +221,7 @@ def main() -> None: # Initialization coloredlogs.install( - # level='DEBUG', + level='DEBUG', fmt='%(asctime)s %(name)s %(levelname)s %(message)s' ) @@ -236,12 +234,12 @@ def main() -> None: parser.add_argument( '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, help="Outptut file with DNS chains") - # parser.add_argument( - # '-n', '--nameserver', type=argparse.FileType('r'), - # default='nameservers', help="File with one nameserver per line") - # parser.add_argument( - # '-j', '--workers', type=int, default=512, - # help="Number of threads to use") + parser.add_argument( + '-n', '--nameservers', default='nameservers', + help="File with one nameserver per line") + parser.add_argument( + '-j', '--workers', type=int, default=512, + help="Number of threads to use") args = parser.parse_args() # Cleaning input @@ -251,11 +249,15 @@ def main() -> None: # Reading nameservers servers: typing.List[str] = list() - if os.path.isfile('nameservers'): - servers = open('nameservers').readlines() + if os.path.isfile(args.nameservers): + servers = open(args.nameservers).readlines() servers = list(filter(None, map(str.strip, servers))) - for resolved in Orchestrator(iterator, servers).run(): + for resolved in Orchestrator( + iterator, + servers, + nb_workers=args.workers + ).run(): args.output.write(resolved) From 269b8278b512bb5187a99b74c40f3a2e11ab6923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 13 Dec 2019 18:36:08 +0100 Subject: [PATCH 11/40] Worflow: Fixed rules counts --- database.py | 20 +++++++++++++++++++- database_schema.sql | 2 ++ export.py | 25 +++++++++++++++++++------ filter_subdomains.sh | 18 ++++++++++++------ new_workflow.sh | 4 +--- 5 files changed, 53 insertions(+), 16 deletions(-) diff --git a/database.py b/database.py index 3327438..1e8c4da 100755 --- a/database.py +++ b/database.py @@ -23,7 +23,7 @@ DbValue = typing.Union[None, int, float, str, bytes] class Database(): - VERSION = 4 + VERSION = 5 PATH = "blocking.db" def open(self) -> None: @@ -250,6 +250,24 @@ class Database(): else: yield val + def count_rules(self, + first_party_only: bool = False, + ) -> str: + counts: typing.List[str] = list() + cursor = self.conn.cursor() + for table in ['asn', 'ip4network', 'ip4address', 'zone', 'hostname']: + command = f'SELECT count(*) FROM rules ' \ + f'INNER JOIN {table} ON rules.id = {table}.entry ' \ + 'WHERE rules.level = 0' + if first_party_only: + command += ' AND first_party=1' + cursor.execute(command) + count, = cursor.fetchone() + if count > 0: + counts.append(f'{table}: {count}') + + return ', '.join(counts) + def get_domain(self, domain: str) -> typing.Iterable[int]: self.enter_step('get_domain_prepare') domain_prep = self.pack_hostname(domain) diff --git a/database_schema.sql b/database_schema.sql index a61f7f2..3116a09 100644 --- a/database_schema.sql +++ b/database_schema.sql @@ -11,6 +11,8 @@ CREATE TABLE rules ( FOREIGN KEY (source) REFERENCES rules(id) ON DELETE CASCADE ); CREATE INDEX rules_source ON rules (source); -- for references recounting +CREATE INDEX rules_updated ON rules (updated); -- for pruning +CREATE INDEX rules_level_firstparty ON rules (level, first_party); -- for counting rules CREATE TABLE asn ( val INTEGER PRIMARY KEY, diff 
--git a/export.py b/export.py index 49051c9..886582c 100755 --- a/export.py +++ b/export.py @@ -22,15 +22,28 @@ if __name__ == '__main__': parser.add_argument( '-x', '--explain', action='store_true', help="TODO") + parser.add_argument( + '-r', '--rules', action='store_true', + help="TODO") + parser.add_argument( + '-c', '--count', action='store_true', + help="TODO") args = parser.parse_args() DB = database.Database() - for domain in DB.export( - first_party_only=args.first_party, - end_chain_only=args.end_chain, - explain=args.explain, - ): - print(domain, file=args.output) + if args.rules: + if not args.count: + raise NotImplementedError + print(DB.count_rules(first_party_only=args.first_party)) + else: + if args.count: + raise NotImplementedError + for domain in DB.export( + first_party_only=args.first_party, + end_chain_only=args.end_chain, + explain=args.explain, + ): + print(domain, file=args.output) DB.close() diff --git a/filter_subdomains.sh b/filter_subdomains.sh index 516efae..d4b90ae 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -4,6 +4,9 @@ function log() { echo -e "\033[33m$@\033[0m" } +log "Pruning old data…" +./database.py --prune + log "Recounting references…" ./database.py --references @@ -14,6 +17,8 @@ log "Exporting lists…" ./export.py --end-chain --output dist/multiparty-only-trackers.txt log "Generating hosts lists…" +./export.py --rules --count --first-party > temp/count_rules_firstparty.txt +./export.py --rules --count > temp/count_rules_multiparty.txt function generate_hosts { basename="$1" description="$2" @@ -39,15 +44,16 @@ function generate_hosts { echo "#" echo "# Generation date: $(date -Isec)" echo "# Generation software: eulaurarien $(git describe --tags)" - echo "# Number of source websites: TODO" - echo "# Number of source subdomains: TODO" + echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" + echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" + echo "# Number of source DNS records: ~2M + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" echo "#" - echo "# Number of known first-party trackers: TODO" - echo "# Number of first-party subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" + echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)" + echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" echo "#" - echo "# Number of known multi-party trackers: TODO" - echo "# Number of multi-party subdomains: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" + echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)" + echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" echo sed 's|^|0.0.0.0 |' "dist/$basename.txt" diff --git a/new_workflow.sh b/new_workflow.sh index bc2a78b..c98cd46 100755 --- a/new_workflow.sh +++ b/new_workflow.sh @@ -4,9 +4,7 @@ function log() { echo -e "\033[33m$@\033[0m" } -log "Preparing database…" -./database.py --expire - +./fetch_resources.sh ./import_rules.sh # TODO Fetch 'em From 5023b85d7ca802f2908526b0e48ac5245aa457df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 13 Dec 2019 21:59:35 +0100 Subject: [PATCH 12/40] Added intermediate representation for DNS datasets It's 
just CSV. The DNS from the datasets are not ordered consistently, so we need to parse it completly. It seems that converting to an IR before sending data to ./feed_dns.py through a pipe is faster than decoding the JSON in ./feed_dns.py. This will also reduce the storage of the resolved subdomains by about 15% (compressed). --- feed_dns.py | 48 ++++++++++++++++++++++--------------------- json_to_csv.py | 36 ++++++++++++++++++++++++++++++++ new_workflow.sh | 6 +++--- resolve_subdomains.py | 5 ++--- resolve_subdomains.sh | 4 ++-- 5 files changed, 68 insertions(+), 31 deletions(-) create mode 100755 json_to_csv.py diff --git a/feed_dns.py b/feed_dns.py index f3fde5a..cb996e9 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -4,6 +4,8 @@ import database import argparse import sys import logging +import csv +import json if __name__ == '__main__': @@ -21,39 +23,39 @@ if __name__ == '__main__': try: DB.enter_step('iowait') - # line: bytes - line: str - for line in args.input: - DB.enter_step('feed_json_parse') - # split = line.split(b'"') - split = line.split('"') - try: - updated = int(split[3]) - name = split[7] - dtype = split[11] - value = split[15] - except IndexError: - log.error("Invalid JSON: %s", line) - continue - # DB.enter_step('feed_json_assert') + for row in csv.reader(args.input): + # for line in args.input: + DB.enter_step('feed_csv_parse') + dtype, timestamp, name, value = row + # DB.enter_step('feed_json_parse') # data = json.loads(line) - # assert dtype == data['type'] - # assert name == data['name'] - # assert value == data['value'] + # dtype = data['type'][0] + # # timestamp = data['timestamp'] + # name = data['name'] + # value = data['value'] DB.enter_step('feed_switch') if dtype == 'a': for rule in DB.get_ip4(value): if not list(DB.get_domain_in_zone(name)): - DB.set_hostname(name, source=rule, updated=updated) - elif dtype == 'cname': + + DB.set_hostname(name, source=rule, + updated=int(timestamp)) + # updated=int(data['timestamp'])) + elif dtype == 'c': for rule in DB.get_domain(value): if not list(DB.get_domain_in_zone(name)): - DB.set_hostname(name, source=rule, updated=updated) - elif dtype == 'ptr': + DB.set_hostname(name, source=rule, + updated=int(timestamp)) + # updated=int(data['timestamp'])) + elif dtype == 'p': for rule in DB.get_domain(value): if not list(DB.get_ip4_in_network(name)): - DB.set_ip4address(name, source=rule, updated=updated) + DB.set_ip4address(name, source=rule, + updated=int(timestamp)) + # updated=int(data['timestamp'])) + else: + raise NotImplementedError(f'Type: {dtype}') DB.enter_step('iowait') except KeyboardInterrupt: log.warning("Interupted.") diff --git a/json_to_csv.py b/json_to_csv.py new file mode 100755 index 0000000..11a3600 --- /dev/null +++ b/json_to_csv.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +import argparse +import sys +import logging +import json +import csv + +if __name__ == '__main__': + + # Parsing arguments + log = logging.getLogger('json_to_csv') + parser = argparse.ArgumentParser( + description="TODO") + parser.add_argument( + # '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, + help="TODO") + parser.add_argument( + # '-i', '--output', type=argparse.FileType('wb'), default=sys.stdout.buffer, + '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, + help="TODO") + args = parser.parse_args() + + writer = csv.writer(args.output) + for line in args.input: + data = json.loads(line) + try: + writer.writerow([ + 
data['type'][0], + data['timestamp'], + data['name'], + data['value']]) + except IndexError: + log.error('Could not parse line: %s', line) + pass diff --git a/new_workflow.sh b/new_workflow.sh index c98cd46..e21b426 100755 --- a/new_workflow.sh +++ b/new_workflow.sh @@ -9,11 +9,11 @@ function log() { # TODO Fetch 'em log "Reading PTR records…" -pv ptr.json.gz | gunzip | ./feed_dns.py +pv ptr.json.gz | gunzip | ./json_to_csv.py | ./feed_dns.py log "Reading A records…" -pv a.json.gz | gunzip | ./feed_dns.py +pv a.json.gz | gunzip | ./json_to_csv.py | ./feed_dns.py log "Reading CNAME records…" -pv cname.json.gz | gunzip | ./feed_dns.py +pv cname.json.gz | gunzip | ./json_to_csv.py | ./feed_dns.py log "Pruning old data…" ./database.py --prune diff --git a/resolve_subdomains.py b/resolve_subdomains.py index fa2ea59..bc26e34 100755 --- a/resolve_subdomains.py +++ b/resolve_subdomains.py @@ -168,7 +168,7 @@ class Orchestrator(): @staticmethod def format_rrset(rrset: dns.rrset.RRset) -> typing.Iterable[str]: if rrset.rdtype == dns.rdatatype.CNAME: - dtype = 'cname' + dtype = 'c' elif rrset.rdtype == dns.rdatatype.A: dtype = 'a' else: @@ -178,8 +178,7 @@ class Orchestrator(): value = item.to_text() if rrset.rdtype == dns.rdatatype.CNAME: value = value[:-1] - yield '{"timestamp":"' + str(int(time.time())) + '","name":"' + \ - name + '","type":"' + dtype + '","value":"' + value + '"}\n' + yield f'{dtype},{int(time.time())},{name},{value}\n' def run(self) -> typing.Iterable[str]: """ diff --git a/resolve_subdomains.sh b/resolve_subdomains.sh index ee5f83c..e37ddeb 100755 --- a/resolve_subdomains.sh +++ b/resolve_subdomains.sh @@ -6,7 +6,7 @@ function log() { log "Compiling locally known subdomain…" # Sort by last character to utilize the DNS server caching mechanism -pv subdomains/*.list | rev | sort -u | rev > temp/all_subdomains.list +pv subdomains/*.list | sed 's/\r$//' | rev | sort -u | rev > temp/all_subdomains.list log "Resolving locally known subdomain…" -pv temp/all_subdomains.list | ./resolve_subdomains.py --output temp/all_resolved.json +pv temp/all_subdomains.list | ./resolve_subdomains.py --output temp/all_resolved.csv From d7c239a6f62788276eb0171659a803421ad18cc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Sat, 14 Dec 2019 16:04:19 +0100 Subject: [PATCH 13/40] Workflow: Some modifications --- .gitignore | 2 -- database.py | 22 +++++++++++++++++----- feed_dns.py | 7 ++++--- fetch_resources.sh | 2 +- import_rules.sh | 3 +++ json_to_csv.py | 4 ++-- 6 files changed, 27 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index aa3f3eb..188051c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,3 @@ *.db-journal nameservers nameservers.head -*.o -*.so diff --git a/database.py b/database.py index 1e8c4da..ee51829 100755 --- a/database.py +++ b/database.py @@ -149,6 +149,8 @@ class Database(): total = 0 for i, octet in enumerate(address.split('.')): total += int(octet) << (3-i)*8 + if total > 0xFFFFFFFF: + raise ValueError return total # return '{:02x}{:02x}{:02x}{:02x}'.format( # *[int(c) for c in address.split('.')]) @@ -192,10 +194,13 @@ class Database(): '(SELECT count(*) FROM rules ' 'WHERE source=r.id)') - def prune(self, before: int) -> None: + def prune(self, before: int, base_only: bool = False) -> None: self.enter_step('prune') cursor = self.conn.cursor() - cursor.execute('DELETE FROM rules WHERE updated str: # Format current @@ -541,7 +546,14 @@ if __name__ == '__main__': help="Reconstruct the whole database") 
parser.add_argument( '-p', '--prune', action='store_true', - help="Remove old (+6 months) entries from database") + help="Remove old entries from database") + parser.add_argument( + '-b', '--prune-base', action='store_true', + help="TODO") + parser.add_argument( + '-s', '--prune-before', type=int, + default=(int(time.time()) - 60*60*24*31*6), + help="TODO") parser.add_argument( '-r', '--references', action='store_true', help="Update the reference count") @@ -552,8 +564,8 @@ if __name__ == '__main__': if args.initialize: DB.initialize() if args.prune: - DB.prune(before=int(time.time()) - 60*60*24*31*6) - if args.references and not args.prune: + DB.prune(before=args.prune_before, base_only=args.prune_base) + if args.references: DB.update_references() DB.close() diff --git a/feed_dns.py b/feed_dns.py index cb996e9..87a4fb6 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -37,20 +37,21 @@ if __name__ == '__main__': DB.enter_step('feed_switch') if dtype == 'a': for rule in DB.get_ip4(value): - if not list(DB.get_domain_in_zone(name)): + if not any(DB.get_domain_in_zone(name)): DB.set_hostname(name, source=rule, updated=int(timestamp)) # updated=int(data['timestamp'])) elif dtype == 'c': for rule in DB.get_domain(value): - if not list(DB.get_domain_in_zone(name)): + if not any(DB.get_domain_in_zone(name)): DB.set_hostname(name, source=rule, updated=int(timestamp)) # updated=int(data['timestamp'])) elif dtype == 'p': for rule in DB.get_domain(value): - if not list(DB.get_ip4_in_network(name)): + if not any(DB.get_ip4_in_network(name)): + log.debug('%s matched by %d: add %s', value, rule, name) DB.set_ip4address(name, source=rule, updated=int(timestamp)) # updated=int(data['timestamp'])) diff --git a/fetch_resources.sh b/fetch_resources.sh index 01121d8..e799729 100755 --- a/fetch_resources.sh +++ b/fetch_resources.sh @@ -18,7 +18,7 @@ log "Retrieving rules…" rm -f rules*/*.cache.* dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt # From firebog.net Tracking & Telemetry Lists -dl https://v.firebog.net/hosts/Prigent-Ads.txt rules/prigent-ads.cache.list +# dl https://v.firebog.net/hosts/Prigent-Ads.txt rules/prigent-ads.cache.list # dl https://gitlab.com/quidsup/notrack-blocklists/raw/master/notrack-blocklist.txt rules/notrack-blocklist.cache.list # False positives: https://github.com/WaLLy3K/wally3k.github.io/issues/73 -> 69.media.tumblr.com chicdn.net dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt diff --git a/import_rules.sh b/import_rules.sh index 358155c..33c4fbd 100755 --- a/import_rules.sh +++ b/import_rules.sh @@ -5,6 +5,7 @@ function log() { } log "Importing rules…" +BEFORE="$(date +%s)" cat rules_adblock/*.txt | grep -v '^!' 
| grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone @@ -17,3 +18,5 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as ./feed_asn.py +log "Pruning old rules…" +./database.py --prune --prune-before "$BEFORE" --prune-base diff --git a/json_to_csv.py b/json_to_csv.py index 11a3600..39ca1b7 100755 --- a/json_to_csv.py +++ b/json_to_csv.py @@ -27,10 +27,10 @@ if __name__ == '__main__': data = json.loads(line) try: writer.writerow([ - data['type'][0], + data['type'][0], # First letter, will need to do something special for AAAA data['timestamp'], data['name'], data['value']]) - except IndexError: + except (KeyError, json.decoder.JSONDecodeError): log.error('Could not parse line: %s', line) pass From ddceed3d25e46a332cea12f9ea3f0f2672205dca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Sat, 14 Dec 2019 23:59:50 +0100 Subject: [PATCH 14/40] Workflow: Can now import DnsMass output Well, in a specific format but DnsMass nonetheless --- database.py | 6 +- feed_dns.py | 185 ++++++++++++++++++++++------- json_to_csv.py | 36 ------ new_workflow.sh | 6 +- resolve_subdomains.py | 264 ------------------------------------------ 5 files changed, 152 insertions(+), 345 deletions(-) delete mode 100755 json_to_csv.py delete mode 100755 resolve_subdomains.py diff --git a/database.py b/database.py index ee51829..19fbe97 100755 --- a/database.py +++ b/database.py @@ -284,14 +284,18 @@ class Database(): 'UNION ' 'SELECT * FROM (' 'SELECT val, entry FROM zone ' + # 'WHERE val>=:d ' + # 'ORDER BY val ASC LIMIT 1' 'WHERE val<=:d ' - 'ORDER BY val DESC LIMIT 1' + 'AND instr(:d, val) = 1' ')', {'d': domain_prep} ) for val, entry in cursor: + # print(293, val, entry) self.enter_step('get_domain_confirm') if not (val is None or domain_prep.startswith(val)): + # print(297) continue self.enter_step('get_domain_yield') yield entry diff --git a/feed_dns.py b/feed_dns.py index 585a211..4b01814 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -7,23 +7,24 @@ import logging import sys import typing import multiprocessing +import enum -NUMBER_THREADS = 2 -BLOCK_SIZE = 100 +RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR') +Record = typing.Tuple[RecordType, int, str, str] # select, confirm, write FUNCTION_MAP: typing.Any = { - 'a': ( + RecordType.A: ( database.Database.get_ip4, database.Database.get_domain_in_zone, database.Database.set_hostname, ), - 'cname': ( + RecordType.CNAME: ( database.Database.get_domain, database.Database.get_domain_in_zone, database.Database.set_hostname, ), - 'ptr': ( + RecordType.PTR: ( database.Database.get_domain, database.Database.get_ip4_in_network, database.Database.set_ip4address, @@ -33,12 +34,12 @@ FUNCTION_MAP: typing.Any = { class Reader(multiprocessing.Process): def __init__(self, - lines_queue: multiprocessing.Queue, + recs_queue: multiprocessing.Queue, write_queue: multiprocessing.Queue, index: int = 0): super(Reader, self).__init__() self.log = logging.getLogger(f'rd{index:03d}') - self.lines_queue = lines_queue + self.recs_queue = recs_queue self.write_queue = write_queue self.index = index @@ -48,15 +49,19 @@ class Reader(multiprocessing.Process): self.db.enter_step('line_wait') block: typing.List[str] try: - for block in iter(self.lines_queue.get, None): - for line in block: - dtype, updated, name, value = line + 
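
The rewritten feed_dns.py in this patch splits the work across processes: the main process parses input into blocks and feeds recs_queue, several Reader processes perform the read-only rule lookups, and a single Writer process applies every database update, with None pushed on each queue as the end-of-stream sentinel. A stripped-down, self-contained sketch of that queue/sentinel pattern (str.upper() and print() stand in for the actual lookup and write steps):

    import multiprocessing

    def reader(recs_queue: multiprocessing.Queue,
               write_queue: multiprocessing.Queue) -> None:
        # Consume blocks of records until the None sentinel arrives.
        for block in iter(recs_queue.get, None):
            for record in block:
                write_queue.put(record.upper())  # stand-in for the rule lookup

    def writer(write_queue: multiprocessing.Queue) -> None:
        # Single consumer: the only place that would touch the database.
        for item in iter(write_queue.get, None):
            print(item)

    if __name__ == '__main__':
        recs: multiprocessing.Queue = multiprocessing.Queue()
        outs: multiprocessing.Queue = multiprocessing.Queue()
        rd = multiprocessing.Process(target=reader, args=(recs, outs))
        wr = multiprocessing.Process(target=writer, args=(outs,))
        rd.start()
        wr.start()
        recs.put(['a.example.com', 'b.example.com'])  # one block of records
        recs.put(None)   # one sentinel per reader
        rd.join()
        outs.put(None)   # then stop the writer
        wr.join()
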
for block in iter(self.recs_queue.get, None): + record: Record + for record in block: + # print(55, record) + dtype, updated, name, value = record self.db.enter_step('feed_switch') select, confirm, write = FUNCTION_MAP[dtype] for rule in select(self.db, value): + # print(60, rule, list(confirm(self.db, name))) if not any(confirm(self.db, name)): + # print(62, write, name, updated, rule) self.db.enter_step('wait_put') - self.write_queue.put((write, name, updated)) + self.write_queue.put((write, name, updated, rule)) self.db.enter_step('line_wait') except KeyboardInterrupt: self.log.error('Interrupted') @@ -82,9 +87,10 @@ class Writer(multiprocessing.Process): fun: typing.Callable name: str updated: int - for fun, name, updated in iter(self.write_queue.get, None): + source: int + for fun, name, updated, source in iter(self.write_queue.get, None): self.db.enter_step('exec') - fun(self.db, name, updated) + fun(self.db, name, updated, source=source) self.db.enter_step('line_wait') except KeyboardInterrupt: self.log.error('Interrupted') @@ -93,29 +99,142 @@ class Writer(multiprocessing.Process): self.db.close() +class Parser(): + def __init__(self, + buf: typing.Any, + recs_queue: multiprocessing.Queue, + block_size: int, + ): + super(Parser, self).__init__() + self.buf = buf + self.log = logging.getLogger('pr ') + self.recs_queue = recs_queue + self.block: typing.List[Record] = list() + self.block_size = block_size + self.db = database.Database() # Just for timing + self.db.log = logging.getLogger('pr ') + + def register(self, record: Record) -> None: + self.db.enter_step('register') + self.block.append(record) + if len(self.block) >= self.block_size: + self.db.enter_step('put_block') + self.recs_queue.put(self.block) + self.block = list() + + def run(self) -> None: + self.consume() + self.recs_queue.put(self.block) + self.db.close() + + def consume(self) -> None: + raise NotImplementedError + + +class Rapid7Parser(Parser): + TYPES = { + 'a': RecordType.A, + 'aaaa': RecordType.AAAA, + 'cname': RecordType.CNAME, + 'ptr': RecordType.PTR, + } + + def consume(self) -> None: + for line in self.buf: + self.db.enter_step('parse_rapid7') + try: + data = json.loads(line) + except json.decoder.JSONDecodeError: + continue + record = ( + Rapid7Parser.TYPES[data['type']], + int(data['timestamp']), + data['name'], + data['value'] + ) + self.register(record) + + +class DnsMassParser(Parser): + # dnsmass --output Snrql + # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4 + TYPES = { + 'A': (RecordType.A, -1, None), + 'AAAA': (RecordType.AAAA, -1, None), + 'CNAME': (RecordType.CNAME, -1, -1), + } + + def consume(self) -> None: + self.db.enter_step('parse_dnsmass') + timestamp = 0 + header = True + for line in self.buf: + line = line[:-1] + if not line: + header = True + continue + + split = line.split(' ') + try: + if header: + timestamp = int(split[1]) + header = False + else: + dtype, name_offset, value_offset = \ + DnsMassParser.TYPES[split[1]] + record = ( + dtype, + timestamp, + split[0][:name_offset], + split[2][:value_offset], + ) + self.register(record) + self.db.enter_step('parse_dnsmass') + except KeyError: + continue + + +PARSERS = { + 'rapid7': Rapid7Parser, + 'dnsmass': DnsMassParser, +} + if __name__ == '__main__': # Parsing arguments log = logging.getLogger('feed_dns') - parser = argparse.ArgumentParser( + args_parser = argparse.ArgumentParser( description="TODO") - parser.add_argument( - # '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, + args_parser.add_argument( + 
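
For reference, Rapid7Parser above expects one JSON object per input line carrying type, timestamp, name and value keys, and turns it into a Record tuple. A self-contained sketch of that mapping (the hostnames and timestamp in this example are invented):

    import enum
    import json
    import typing

    RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR')
    Record = typing.Tuple[RecordType, int, str, str]
    TYPES = {'a': RecordType.A, 'aaaa': RecordType.AAAA,
             'cname': RecordType.CNAME, 'ptr': RecordType.PTR}

    line = ('{"timestamp":"1576000000","name":"tracker.example.com",'
            '"type":"cname","value":"telemetry.example.net"}')
    data = json.loads(line)
    record: Record = (TYPES[data['type']], int(data['timestamp']),
                      data['name'], data['value'])
    # record == (RecordType.CNAME, 1576000000,
    #            'tracker.example.com', 'telemetry.example.net')
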
'parser', + choices=PARSERS.keys(), + help="TODO") + args_parser.add_argument( '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, help="TODO") - args = parser.parse_args() + args_parser.add_argument( + '-j', '--workers', type=int, default=4, + help="TODO") + args_parser.add_argument( + '-b', '--block-size', type=int, default=100, + help="TODO") + args = args_parser.parse_args() DB = database.Database(write=False) # Not needed, just for timing DB.log = logging.getLogger('db ') - lines_queue: multiprocessing.Queue = multiprocessing.Queue(maxsize=100) - write_queue: multiprocessing.Queue = multiprocessing.Queue(maxsize=100) + recs_queue: multiprocessing.Queue = multiprocessing.Queue( + maxsize=10*args.workers) + write_queue: multiprocessing.Queue = multiprocessing.Queue( + maxsize=10*args.workers) DB.enter_step('proc_create') readers: typing.List[Reader] = list() - for w in range(NUMBER_THREADS): - readers.append(Reader(lines_queue, write_queue, w)) + for w in range(args.workers): + readers.append(Reader(recs_queue, write_queue, w)) writer = Writer(write_queue) + parser = PARSERS[args.parser]( + args.input, recs_queue, args.block_size) DB.enter_step('proc_start') for reader in readers: @@ -123,28 +242,12 @@ if __name__ == '__main__': writer.start() try: - block: typing.List[str] = list() - DB.enter_step('iowait') - for line in args.input: - DB.enter_step('block_append') - DB.enter_step('feed_json_parse') - data = json.loads(line) - line = (data['type'], - int(data['timestamp']), - data['name'], - data['value']) - block.append(line) - if len(block) >= BLOCK_SIZE: - DB.enter_step('wait_put') - lines_queue.put(block) - block = list() - DB.enter_step('iowait') - DB.enter_step('wait_put') - lines_queue.put(block) + DB.enter_step('parser_run') + parser.run() DB.enter_step('end_put') - for _ in range(NUMBER_THREADS): - lines_queue.put(None) + for _ in range(args.workers): + recs_queue.put(None) write_queue.put(None) DB.enter_step('proc_join') diff --git a/json_to_csv.py b/json_to_csv.py deleted file mode 100755 index 39ca1b7..0000000 --- a/json_to_csv.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import sys -import logging -import json -import csv - -if __name__ == '__main__': - - # Parsing arguments - log = logging.getLogger('json_to_csv') - parser = argparse.ArgumentParser( - description="TODO") - parser.add_argument( - # '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, - '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, - help="TODO") - parser.add_argument( - # '-i', '--output', type=argparse.FileType('wb'), default=sys.stdout.buffer, - '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, - help="TODO") - args = parser.parse_args() - - writer = csv.writer(args.output) - for line in args.input: - data = json.loads(line) - try: - writer.writerow([ - data['type'][0], # First letter, will need to do something special for AAAA - data['timestamp'], - data['name'], - data['value']]) - except (KeyError, json.decoder.JSONDecodeError): - log.error('Could not parse line: %s', line) - pass diff --git a/new_workflow.sh b/new_workflow.sh index e21b426..c98cd46 100755 --- a/new_workflow.sh +++ b/new_workflow.sh @@ -9,11 +9,11 @@ function log() { # TODO Fetch 'em log "Reading PTR records…" -pv ptr.json.gz | gunzip | ./json_to_csv.py | ./feed_dns.py +pv ptr.json.gz | gunzip | ./feed_dns.py log "Reading A records…" -pv a.json.gz | gunzip | ./json_to_csv.py | ./feed_dns.py +pv a.json.gz | gunzip | ./feed_dns.py log 
"Reading CNAME records…" -pv cname.json.gz | gunzip | ./json_to_csv.py | ./feed_dns.py +pv cname.json.gz | gunzip | ./feed_dns.py log "Pruning old data…" ./database.py --prune diff --git a/resolve_subdomains.py b/resolve_subdomains.py deleted file mode 100755 index bc26e34..0000000 --- a/resolve_subdomains.py +++ /dev/null @@ -1,264 +0,0 @@ -#!/usr/bin/env python3 - -""" -From a list of subdomains, output only -the ones resolving to a first-party tracker. -""" - -import argparse -import logging -import os -import queue -import sys -import threading -import typing -import time - -import coloredlogs -import dns.exception -import dns.resolver - -DNS_TIMEOUT = 5.0 -NUMBER_TRIES = 5 - - -class Worker(threading.Thread): - """ - Worker process for a DNS resolver. - Will resolve DNS to match first-party subdomains. - """ - - def change_nameserver(self) -> None: - """ - Assign a this worker another nameserver from the queue. - """ - server = None - while server is None: - try: - server = self.orchestrator.nameservers_queue.get(block=False) - except queue.Empty: - self.orchestrator.refill_nameservers_queue() - self.log.info("Using nameserver: %s", server) - self.resolver.nameservers = [server] - - def __init__(self, - orchestrator: 'Orchestrator', - index: int = 0): - super(Worker, self).__init__() - self.log = logging.getLogger(f'worker{index:03d}') - self.orchestrator = orchestrator - - self.resolver = dns.resolver.Resolver() - self.change_nameserver() - - def resolve_subdomain(self, subdomain: str) -> typing.Optional[ - typing.List[ - dns.rrset.RRset - ] - ]: - """ - Returns the resolution chain of the subdomain to an A record, - including any intermediary CNAME. - The last element is an IP address. - Returns None if the nameserver was unable to satisfy the request. - Returns [] if the requests points to nothing. - """ - self.log.debug("Querying %s", subdomain) - try: - query = self.resolver.query(subdomain, 'A', lifetime=DNS_TIMEOUT) - except dns.resolver.NXDOMAIN: - return [] - except dns.resolver.NoAnswer: - return [] - except dns.resolver.YXDOMAIN: - self.log.warning("Query name too long for %s", subdomain) - return None - except dns.resolver.NoNameservers: - # NOTE Most of the time this error message means that the domain - # does not exists, but sometimes it means the that the server - # itself is broken. So we count on the retry logic. - self.log.warning("All nameservers broken for %s", subdomain) - return None - except dns.exception.Timeout: - # NOTE Same as above - self.log.warning("Timeout for %s", subdomain) - return None - except dns.name.EmptyLabel: - self.log.warning("Empty label for %s", subdomain) - return None - return query.response.answer - - def run(self) -> None: - self.log.info("Started") - subdomain: str - for subdomain in iter(self.orchestrator.subdomains_queue.get, None): - - for _ in range(NUMBER_TRIES): - resolved = self.resolve_subdomain(subdomain) - # Retry with another nameserver if error - if resolved is None: - self.change_nameserver() - else: - break - - # If it wasn't found after multiple tries - if resolved is None: - self.log.error("Gave up on %s", subdomain) - resolved = [] - - assert isinstance(resolved, list) - self.orchestrator.results_queue.put(resolved) - - self.orchestrator.results_queue.put(None) - self.log.info("Stopped") - - -class Orchestrator(): - """ - Orchestrator of the different Worker threads. - """ - - def refill_nameservers_queue(self) -> None: - """ - Re-fill the given nameservers into the nameservers queue. 
- Done every-time the queue is empty, making it - basically looping and infinite. - """ - # Might be in a race condition but that's probably fine - for nameserver in self.nameservers: - self.nameservers_queue.put(nameserver) - self.log.info("Refilled nameserver queue") - - def __init__(self, subdomains: typing.Iterable[str], - nameservers: typing.List[str] = None, - nb_workers: int = 1, - ): - self.log = logging.getLogger('orchestrator') - self.subdomains = subdomains - self.nb_workers = nb_workers - - # Use interal resolver by default - self.nameservers = nameservers or dns.resolver.Resolver().nameservers - - self.subdomains_queue: queue.Queue = queue.Queue( - maxsize=self.nb_workers) - self.results_queue: queue.Queue = queue.Queue() - self.nameservers_queue: queue.Queue = queue.Queue() - - self.refill_nameservers_queue() - - def fill_subdomain_queue(self) -> None: - """ - Read the subdomains in input and put them into the queue. - Done in a thread so we can both: - - yield the results as they come - - not store all the subdomains at once - """ - self.log.info("Started reading subdomains") - # Send data to workers - for subdomain in self.subdomains: - self.subdomains_queue.put(subdomain) - - self.log.info("Finished reading subdomains") - # Send sentinel to each worker - # sentinel = None ~= EOF - for _ in range(self.nb_workers): - self.subdomains_queue.put(None) - - @staticmethod - def format_rrset(rrset: dns.rrset.RRset) -> typing.Iterable[str]: - if rrset.rdtype == dns.rdatatype.CNAME: - dtype = 'c' - elif rrset.rdtype == dns.rdatatype.A: - dtype = 'a' - else: - raise NotImplementedError - name = rrset.name.to_text()[:-1] - for item in rrset.items: - value = item.to_text() - if rrset.rdtype == dns.rdatatype.CNAME: - value = value[:-1] - yield f'{dtype},{int(time.time())},{name},{value}\n' - - def run(self) -> typing.Iterable[str]: - """ - Yield the results. - """ - # Create workers - self.log.info("Creating workers") - for i in range(self.nb_workers): - Worker(self, i).start() - - fill_thread = threading.Thread(target=self.fill_subdomain_queue) - fill_thread.start() - - # Wait for one sentinel per worker - # In the meantime output results - for _ in range(self.nb_workers): - resolved: typing.List[dns.rrset.RRset] - for resolved in iter(self.results_queue.get, None): - for rrset in resolved: - yield from self.format_rrset(rrset) - - self.log.info("Waiting for reader thread") - fill_thread.join() - - self.log.info("Done!") - - -def main() -> None: - """ - Main function when used directly. - Read the subdomains provided and output it, - the last CNAME resolved and the IP adress it resolves to. - Takes as an input a filename (or nothing, for stdin), - and as an output a filename (or nothing, for stdout). - The input must be a subdomain per line, the output is a TODO - Use the file `nameservers` as the list of nameservers - to use, or else it will use the system defaults. 
- """ - - # Initialization - coloredlogs.install( - level='DEBUG', - fmt='%(asctime)s %(name)s %(levelname)s %(message)s' - ) - - # Parsing arguments - parser = argparse.ArgumentParser( - description="Massively resolves subdomains and store them in a file.") - parser.add_argument( - '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, - help="Input file with one subdomain per line") - parser.add_argument( - '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, - help="Outptut file with DNS chains") - parser.add_argument( - '-n', '--nameservers', default='nameservers', - help="File with one nameserver per line") - parser.add_argument( - '-j', '--workers', type=int, default=512, - help="Number of threads to use") - args = parser.parse_args() - - # Cleaning input - iterator = iter(args.input) - iterator = map(str.strip, iterator) - iterator = filter(None, iterator) - - # Reading nameservers - servers: typing.List[str] = list() - if os.path.isfile(args.nameservers): - servers = open(args.nameservers).readlines() - servers = list(filter(None, map(str.strip, servers))) - - for resolved in Orchestrator( - iterator, - servers, - nb_workers=args.workers - ).run(): - args.output.write(resolved) - - -if __name__ == '__main__': - main() From 040ce4c14ebccf48dff3b0eb29a02b55eb65f46b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Sun, 15 Dec 2019 01:52:45 +0100 Subject: [PATCH 15/40] Typo in source --- fetch_resources.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch_resources.sh b/fetch_resources.sh index a7bdd36..00d131f 100755 --- a/fetch_resources.sh +++ b/fetch_resources.sh @@ -51,4 +51,4 @@ then else mv temp/cisco-umbrella_popularity.fresh.list subdomains/cisco-umbrella_popularity.cache.list fi -dl https://www.orwell1984.today/cname/eulerian.net_full.txt subdomains/orwell-eulerian-cname-list.cache.list +dl https://www.orwell1984.today/cname/eulerian.net.txt subdomains/orwell-eulerian-cname-list.cache.list From 4d966371b2d696222c9bc625c31984baad3eefd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Sun, 15 Dec 2019 15:56:26 +0100 Subject: [PATCH 16/40] Workflow: SQL -> Tree Welp. All that for this. --- .gitignore | 3 +- database.py | 728 +++++++++++++++----------------------------- database_schema.sql | 59 ---- export.py | 2 - feed_asn.py | 18 +- feed_dns.py | 167 ++-------- feed_rules.py | 6 +- import_rules.sh | 12 +- 8 files changed, 296 insertions(+), 699 deletions(-) mode change 100755 => 100644 database.py delete mode 100644 database_schema.sql diff --git a/.gitignore b/.gitignore index 188051c..c72635d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ *.log -*.db -*.db-journal +*.p nameservers nameservers.head diff --git a/database.py b/database.py old mode 100755 new mode 100644 index 19fbe97..2d970e3 --- a/database.py +++ b/database.py @@ -4,111 +4,59 @@ Utility functions to interact with the database. 
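
This commit drops SQLite entirely: domain rules now live in a trie keyed on reversed hostname labels (DomainTreeNode), IPv4 rules in a binary trie over address bits (IpTreeNode), and the whole structure is pickled to blocking.p. A lookup walks the tree label by label (or bit by bit) and yields every zone or network that covers the query, which is exactly the matching the old schema needed range queries for. A minimal, self-contained sketch of the domain side (timestamps, sources and first-party flags left out):

    import typing

    class Node:
        def __init__(self) -> None:
            self.children: typing.Dict[str, 'Node'] = dict()
            self.match_zone = False

    root = Node()

    def add_zone(zone: str) -> None:
        node = root
        for label in zone.split('.')[::-1]:   # 'example.com' -> ['com', 'example']
            node = node.children.setdefault(label, Node())
        node.match_zone = True

    def matching_zones(hostname: str) -> typing.Iterator[str]:
        node = root
        parts = hostname.split('.')[::-1]
        for depth, label in enumerate(parts):
            if label not in node.children:
                return
            node = node.children[label]
            if node.match_zone:
                yield '.'.join(parts[:depth + 1][::-1])

    add_zone('example.com')
    print(list(matching_zones('tracker.sub.example.com')))  # ['example.com']
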
""" -import sqlite3 import typing import time -import os import logging -import argparse import coloredlogs -import ipaddress -import math +import pickle +import enum coloredlogs.install( level='DEBUG', fmt='%(asctime)s %(name)s %(levelname)s %(message)s' ) -DbValue = typing.Union[None, int, float, str, bytes] +PathType = enum.Enum('PathType', 'Rule Hostname Zone Asn Ip4 Ip6') +RulePath = typing.Union[None] +Asn = int +DomainPath = typing.List[str] +Ip4Path = typing.List[int] +Ip6Path = typing.List[int] +Path = typing.Union[RulePath, DomainPath, Asn, Ip4Path, Ip6Path] +TypedPath = typing.Tuple[PathType, Path] +Timestamp = int +Level = int +Match = typing.Tuple[Timestamp, TypedPath, Level] + +DebugPath = (PathType.Rule, None) -class Database(): - VERSION = 5 - PATH = "blocking.db" +class DomainTreeNode(): + def __init__(self) -> None: + self.children: typing.Dict[str, DomainTreeNode] = dict() + self.match_zone: typing.Optional[Match] = None + self.match_hostname: typing.Optional[Match] = None - def open(self) -> None: - mode = 'rwc' if self.write else 'ro' - uri = f'file:{self.PATH}?mode={mode}' - self.conn = sqlite3.connect(uri, uri=True) - cursor = self.conn.cursor() - cursor.execute("PRAGMA foreign_keys = ON") - self.conn.create_function("unpack_asn", 1, - self.unpack_asn, - deterministic=True) - self.conn.create_function("unpack_ip4address", 1, - self.unpack_ip4address, - deterministic=True) - self.conn.create_function("unpack_ip4network", 2, - self.unpack_ip4network, - deterministic=True) - self.conn.create_function("unpack_domain", 1, - lambda s: s[:-1][::-1], - deterministic=True) - self.conn.create_function("format_zone", 1, - lambda s: '*' + s[::-1], - deterministic=True) - def get_meta(self, key: str) -> typing.Optional[int]: - cursor = self.conn.cursor() - try: - cursor.execute("SELECT value FROM meta WHERE key=?", (key,)) - except sqlite3.OperationalError: - return None - for ver, in cursor: - return ver - return None +class IpTreeNode(): + def __init__(self) -> None: + self.children: typing.List[typing.Optional[IpTreeNode]] = [None, None] + self.match: typing.Optional[Match] = None - def set_meta(self, key: str, val: int) -> None: - cursor = self.conn.cursor() - cursor.execute("INSERT INTO meta VALUES (?, ?) 
" - "ON CONFLICT (key) DO " - "UPDATE set value=?", - (key, val, val)) - def close(self) -> None: - self.enter_step('close_commit') - self.conn.commit() - self.enter_step('close') - self.conn.close() - self.profile() - - def initialize(self) -> None: - self.close() - self.enter_step('initialize') - if not self.write: - self.log.error("Cannot initialize in read-only mode.") - raise - os.unlink(self.PATH) - self.open() - self.log.info("Creating database version %d.", self.VERSION) - cursor = self.conn.cursor() - with open("database_schema.sql", 'r') as db_schema: - cursor.executescript(db_schema.read()) - self.set_meta('version', self.VERSION) - self.conn.commit() - - def __init__(self, write: bool = False) -> None: - self.log = logging.getLogger('db') +class Profiler(): + def __init__(self) -> None: + self.log = logging.getLogger('profiler') self.time_last = time.perf_counter() self.time_step = 'init' self.time_dict: typing.Dict[str, float] = dict() self.step_dict: typing.Dict[str, int] = dict() - self.write = write - - self.open() - version = self.get_meta('version') - if version != self.VERSION: - if version is not None: - self.log.warning( - "Outdated database version: %d found, will be rebuilt.", - version) - self.initialize() def enter_step(self, name: str) -> None: now = time.perf_counter() try: self.time_dict[self.time_step] += now - self.time_last - self.step_dict[self.time_step] += 1 + self.step_dict[self.time_step] += int(name != self.time_step) except KeyError: self.time_dict[self.time_step] = now - self.time_last self.step_dict[self.time_step] = 1 @@ -125,13 +73,58 @@ class Database(): self.log.debug(f"{'total':<20}: " f"{total:9.2f} s ({1:7.2%})") - @staticmethod - def pack_hostname(hostname: str) -> str: - return hostname[::-1] + '.' + +class Database(Profiler): + VERSION = 8 + PATH = "blocking.p" + + def initialize(self) -> None: + self.log.warning( + "Creating database version: %d ", + Database.VERSION) + self.domtree = DomainTreeNode() + self.asns: typing.Set[Asn] = set() + self.ip4tree = IpTreeNode() + + def load(self) -> None: + self.enter_step('load') + try: + with open(self.PATH, 'rb') as db_fdsec: + version, data = pickle.load(db_fdsec) + if version == Database.VERSION: + self.domtree, self.asns, self.ip4tree = data + return + self.log.warning( + "Outdated database version found: %d, " + "will be rebuilt.", + version) + except (TypeError, AttributeError, EOFError): + self.log.error( + "Corrupt database found, " + "will be rebuilt.") + except FileNotFoundError: + pass + self.initialize() + + def save(self) -> None: + self.enter_step('save') + with open(self.PATH, 'wb') as db_fdsec: + data = self.domtree, self.asns, self.ip4tree + pickle.dump((self.VERSION, data), db_fdsec) + self.profile() + + def __init__(self) -> None: + Profiler.__init__(self) + self.log = logging.getLogger('db') + self.load() @staticmethod - def pack_zone(zone: str) -> str: - return Database.pack_hostname(zone) + def pack_domain(domain: str) -> DomainPath: + return domain.split('.')[::-1] + + @staticmethod + def unpack_domain(domain: DomainPath) -> str: + return '.'.join(domain[::-1]) @staticmethod def pack_asn(asn: str) -> int: @@ -145,431 +138,208 @@ class Database(): return f'AS{asn}' @staticmethod - def pack_ip4address(address: str) -> int: - total = 0 - for i, octet in enumerate(address.split('.')): - total += int(octet) << (3-i)*8 - if total > 0xFFFFFFFF: - raise ValueError - return total - # return '{:02x}{:02x}{:02x}{:02x}'.format( - # *[int(c) for c in address.split('.')]) - # return 
base64.b16encode(packed).decode() - # return '{:08b}{:08b}{:08b}{:08b}'.format( - # *[int(c) for c in address.split('.')]) - # carg = ctypes.c_wchar_p(address) - # ret = ACCEL.ip4_flat(carg, self.accel_ip4_buf) - # if ret != 0: - # raise ValueError - # return self.accel_ip4_buf.value - # packed = ipaddress.ip_address(address).packed - # return packed + def pack_ip4address(address: str) -> Ip4Path: + addr: Ip4Path = [0] * 32 + octets = [int(octet) for octet in address.split('.')] + for b in range(32): + if (octets[b//8] >> b % 8) & 0b1: + addr[b] = 1 + return addr @staticmethod - def unpack_ip4address(address: int) -> str: - return '.'.join(str((address >> (i * 8)) & 0xFF) - for i in reversed(range(4))) + def unpack_ip4address(address: Ip4Path) -> str: + octets = [0] * 4 + for b, bit in enumerate(address): + octets[b//8] = (octets[b//8] << 1) + bit + return '.'.join(map(str, octets)) @staticmethod - def pack_ip4network(network: str) -> typing.Tuple[int, int]: - # def pack_ip4network(network: str) -> str: - net = ipaddress.ip_network(network) - mini = Database.pack_ip4address(net.network_address.exploded) - maxi = Database.pack_ip4address(net.broadcast_address.exploded) - # mini = net.network_address.packed - # maxi = net.broadcast_address.packed - return mini, maxi - # return Database.pack_ip4address(net.network_address.exploded)[:net.prefixlen] + def pack_ip4network(network: str) -> Ip4Path: + address, prefixlen_str = network.split('/') + prefixlen = int(prefixlen_str) + return Database.pack_ip4address(address)[:prefixlen] @staticmethod - def unpack_ip4network(mini: int, maxi: int) -> str: - addr = Database.unpack_ip4address(mini) - prefixlen = 32-int(math.log2(maxi-mini+1)) + def unpack_ip4network(network: Ip4Path) -> str: + address = network.copy() + prefixlen = len(network) + for _ in range(32-prefixlen): + address.append(0) + addr = Database.unpack_ip4address(address) return f'{addr}/{prefixlen}' def update_references(self) -> None: - self.enter_step('update_refs') - cursor = self.conn.cursor() - cursor.execute('UPDATE rules AS r SET refs=' - '(SELECT count(*) FROM rules ' - 'WHERE source=r.id)') + raise NotImplementedError def prune(self, before: int, base_only: bool = False) -> None: - self.enter_step('prune') - cursor = self.conn.cursor() - cmd = 'DELETE FROM rules WHERE updated str: - # Format current - string = '???' 
- cursor = self.conn.cursor() - cursor.execute( - 'SELECT unpack_asn(val) FROM asn WHERE entry=:entry ' - 'UNION ' - 'SELECT unpack_domain(val) FROM hostname WHERE entry=:entry ' - 'UNION ' - 'SELECT format_zone(val) FROM zone WHERE entry=:entry ' - 'UNION ' - 'SELECT unpack_ip4address(val) FROM ip4address WHERE entry=:entry ' - 'UNION ' - 'SELECT unpack_ip4network(mini, maxi) ' - 'FROM ip4network WHERE entry=:entry ', - {"entry": entry} - ) - for val, in cursor: # only one - string = str(val) - string += f' #{entry}' - - # Add source if any - cursor.execute('SELECT source FROM rules WHERE id=?', (entry,)) - for source, in cursor: - if source: - string += f' ← {self.explain(source)}' - return string + raise NotImplementedError def export(self, first_party_only: bool = False, end_chain_only: bool = False, explain: bool = False, + _dic: DomainTreeNode = None, + _par: DomainPath = None, ) -> typing.Iterable[str]: - selection = 'entry' if explain else 'unpack_domain(val)' - command = f'SELECT {selection} FROM rules ' \ - 'INNER JOIN hostname ON rules.id = hostname.entry' - restrictions: typing.List[str] = list() - if first_party_only: - restrictions.append('rules.first_party = 1') - if end_chain_only: - restrictions.append('rules.refs = 0') - if restrictions: - command += ' WHERE ' + ' AND '.join(restrictions) - if not explain: - command += ' ORDER BY unpack_domain(val) ASC' - cursor = self.conn.cursor() - cursor.execute(command) - for val, in cursor: - if explain: - yield self.explain(val) - else: - yield val + if first_party_only or end_chain_only or explain: + raise NotImplementedError + _dic = _dic or self.domtree + _par = _par or list() + if _dic.match_hostname: + yield self.unpack_domain(_par) + for part in _dic.children: + dic = _dic.children[part] + yield from self.export(_dic=dic, + _par=_par + [part]) def count_rules(self, first_party_only: bool = False, ) -> str: - counts: typing.List[str] = list() - cursor = self.conn.cursor() - for table in ['asn', 'ip4network', 'ip4address', 'zone', 'hostname']: - command = f'SELECT count(*) FROM rules ' \ - f'INNER JOIN {table} ON rules.id = {table}.entry ' \ - 'WHERE rules.level = 0' - if first_party_only: - command += ' AND first_party=1' - cursor.execute(command) - count, = cursor.fetchone() - if count > 0: - counts.append(f'{table}: {count}') + raise NotImplementedError - return ', '.join(counts) - - def get_domain(self, domain: str) -> typing.Iterable[int]: - self.enter_step('get_domain_prepare') - domain_prep = self.pack_hostname(domain) - cursor = self.conn.cursor() - self.enter_step('get_domain_select') - cursor.execute( - 'SELECT null, entry FROM hostname ' - 'WHERE val=:d ' - 'UNION ' - 'SELECT * FROM (' - 'SELECT val, entry FROM zone ' - # 'WHERE val>=:d ' - # 'ORDER BY val ASC LIMIT 1' - 'WHERE val<=:d ' - 'AND instr(:d, val) = 1' - ')', - {'d': domain_prep} - ) - for val, entry in cursor: - # print(293, val, entry) - self.enter_step('get_domain_confirm') - if not (val is None or domain_prep.startswith(val)): - # print(297) - continue + def get_domain(self, domain_str: str) -> typing.Iterable[TypedPath]: + self.enter_step('get_domain_pack') + domain = self.pack_domain(domain_str) + self.enter_step('get_domain_brws') + dic = self.domtree + depth = 0 + for part in domain: + if dic.match_zone: + self.enter_step('get_domain_yield') + yield (PathType.Zone, domain[:depth]) + self.enter_step('get_domain_brws') + if part not in dic.children: + return + dic = dic.children[part] + depth += 1 + if dic.match_zone: 
self.enter_step('get_domain_yield') - yield entry + yield (PathType.Zone, domain) + if dic.match_hostname: + self.enter_step('get_domain_yield') + yield (PathType.Hostname, domain) - def get_domain_in_zone(self, domain: str) -> typing.Iterable[int]: - self.enter_step('get_domainiz_prepare') - domain_prep = self.pack_hostname(domain) - cursor = self.conn.cursor() - self.enter_step('get_domainiz_select') - cursor.execute( - 'SELECT val, entry FROM zone ' - 'WHERE val<=:d ' - 'ORDER BY val DESC LIMIT 1', - {'d': domain_prep} - ) - for val, entry in cursor: - self.enter_step('get_domainiz_confirm') - if not (val is None or domain_prep.startswith(val)): - continue - self.enter_step('get_domainiz_yield') - yield entry - - def get_ip4(self, address: str) -> typing.Iterable[int]: - self.enter_step('get_ip4_prepare') - try: - address_prep = self.pack_ip4address(address) - except (ValueError, IndexError): - self.log.error("Invalid ip4address: %s", address) - return - cursor = self.conn.cursor() - self.enter_step('get_ip4_select') - cursor.execute( - 'SELECT entry FROM ip4address ' - # 'SELECT null, entry FROM ip4address ' - 'WHERE val=:a ' - 'UNION ' - # 'SELECT * FROM (' - # 'SELECT val, entry FROM ip4network ' - # 'WHERE val<=:a ' - # 'AND instr(:a, val) > 0 ' - # 'ORDER BY val DESC' - # ')' - 'SELECT entry FROM ip4network ' - 'WHERE :a BETWEEN mini AND maxi ', - {'a': address_prep} - ) - for entry, in cursor: - # self.enter_step('get_ip4_confirm') - # if not (val is None or val.startswith(address_prep)): - # # PERF startswith but from the end - # continue + def get_ip4(self, ip4_str: str) -> typing.Iterable[TypedPath]: + self.enter_step('get_ip4_pack') + ip4 = self.pack_ip4address(ip4_str) + self.enter_step('get_ip4_brws') + dic = self.ip4tree + depth = 0 + for part in ip4: + if dic.match: + self.enter_step('get_ip4_yield') + yield (PathType.Ip4, ip4[:depth]) + self.enter_step('get_ip4_brws') + next_dic = dic.children[part] + if next_dic is None: + return + dic = next_dic + depth += 1 + if dic.match: self.enter_step('get_ip4_yield') - yield entry + yield (PathType.Ip4, ip4) - def get_ip4_in_network(self, address: str) -> typing.Iterable[int]: - self.enter_step('get_ip4in_prepare') - try: - address_prep = self.pack_ip4address(address) - except (ValueError, IndexError): - self.log.error("Invalid ip4address: %s", address) - return - cursor = self.conn.cursor() - self.enter_step('get_ip4in_select') - cursor.execute( - 'SELECT entry FROM ip4network ' - 'WHERE :a BETWEEN mini AND maxi ', - {'a': address_prep} - ) - for entry, in cursor: - self.enter_step('get_ip4in_yield') - yield entry + def list_asn(self) -> typing.Iterable[TypedPath]: + for asn in self.asns: + yield (PathType.Asn, asn) - def list_asn(self) -> typing.Iterable[typing.Tuple[str, int]]: - cursor = self.conn.cursor() - self.enter_step('list_asn_select') - cursor.execute('SELECT val, entry FROM asn') - for val, entry in cursor: - yield f'AS{val}', entry - - def _set_generic(self, - table: str, - select_query: str, - insert_query: str, - prep: typing.Dict[str, DbValue], + def set_hostname(self, + hostname_str: str, updated: int, - is_first_party: bool = False, - source: int = None, - ) -> None: - # Since this isn't the bulk of the processing, - # here abstraction > performaces + is_first_party: bool = None, + source: TypedPath = None) -> None: + self.enter_step('set_hostname_pack') + if is_first_party or source: + raise NotImplementedError + self.enter_step('set_hostname_brws') + hostname = self.pack_domain(hostname_str) + dic = 
self.domtree + for part in hostname: + if dic.match_zone: + # Refuse to add hostname whose zone is already matching + return + if part not in dic.children: + dic.children[part] = DomainTreeNode() + dic = dic.children[part] + dic.match_hostname = (updated, DebugPath, 0) - # Fields based on the source - self.enter_step(f'set_{table}_prepare') - cursor = self.conn.cursor() - if source is None: - first_party = int(is_first_party) - level = 0 - else: - self.enter_step(f'set_{table}_source') - cursor.execute( - 'SELECT first_party, level FROM rules ' - 'WHERE id=?', - (source,) - ) - first_party, level = cursor.fetchone() - level += 1 + def set_zone(self, + zone_str: str, + updated: int, + is_first_party: bool = None, + source: TypedPath = None) -> None: + self.enter_step('set_zone_pack') + if is_first_party or source: + raise NotImplementedError + zone = self.pack_domain(zone_str) + self.enter_step('set_zone_brws') + dic = self.domtree + for part in zone: + if dic.match_zone: + # Refuse to add zone whose parent zone is already matching + return + if part not in dic.children: + dic.children[part] = DomainTreeNode() + dic = dic.children[part] + dic.match_zone = (updated, DebugPath, 0) - self.enter_step(f'set_{table}_select') - cursor.execute(select_query, prep) + def set_asn(self, + asn_str: str, + updated: int, + is_first_party: bool = None, + source: TypedPath = None) -> None: + self.enter_step('set_asn_pack') + if is_first_party or source: + # TODO updated + raise NotImplementedError + asn = self.pack_asn(asn_str) + self.enter_step('set_asn_brws') + self.asns.add(asn) - rules_prep: typing.Dict[str, DbValue] = { - "source": source, - "updated": updated, - "first_party": first_party, - "level": level, - } + def set_ip4address(self, + ip4address_str: str, + updated: int, + is_first_party: bool = None, + source: TypedPath = None) -> None: + self.enter_step('set_ip4add_pack') + if is_first_party or source: + raise NotImplementedError + self.enter_step('set_ip4add_brws') + ip4address = self.pack_ip4address(ip4address_str) + dic = self.ip4tree + for part in ip4address: + if dic.match: + # Refuse to add ip4address whose network is already matching + return + next_dic = dic.children[part] + if next_dic is None: + next_dic = IpTreeNode() + dic.children[part] = next_dic + dic = next_dic + dic.match = (updated, DebugPath, 0) - # If the entry already exists - for entry, in cursor: # only one - self.enter_step(f'set_{table}_update') - rules_prep['entry'] = entry - cursor.execute( - 'UPDATE rules SET ' - 'source=:source, updated=:updated, ' - 'first_party=:first_party, level=:level ' - 'WHERE id=:entry AND (updated<:updated OR ' - 'first_party<:first_party OR level<:level)', - rules_prep - ) - # Only update if any of the following: - # - the entry is outdataed - # - the entry was not a first_party but this is - # - this is closer to the original rule - return - - # If it does not exist - - self.enter_step(f'set_{table}_insert') - cursor.execute( - 'INSERT INTO rules ' - '(source, updated, first_party, level) ' - 'VALUES (:source, :updated, :first_party, :level) ', - rules_prep - ) - cursor.execute('SELECT id FROM rules WHERE rowid=?', - (cursor.lastrowid,)) - for entry, in cursor: # only one - prep['entry'] = entry - cursor.execute(insert_query, prep) - return - assert False - - def set_hostname(self, hostname: str, - *args: typing.Any, **kwargs: typing.Any) -> None: - self.enter_step('set_hostname_prepare') - prep: typing.Dict[str, DbValue] = { - 'val': self.pack_hostname(hostname), - } - self._set_generic( 
- 'hostname', - 'SELECT entry FROM hostname WHERE val=:val', - 'INSERT INTO hostname (val, entry) ' - 'VALUES (:val, :entry)', - prep, - *args, **kwargs - ) - - def set_asn(self, asn: str, - *args: typing.Any, **kwargs: typing.Any) -> None: - self.enter_step('set_asn_prepare') - try: - asn_prep = self.pack_asn(asn) - except ValueError: - self.log.error("Invalid asn: %s", asn) - return - prep: typing.Dict[str, DbValue] = { - 'val': asn_prep, - } - self._set_generic( - 'asn', - 'SELECT entry FROM asn WHERE val=:val', - 'INSERT INTO asn (val, entry) ' - 'VALUES (:val, :entry)', - prep, - *args, **kwargs - ) - - def set_ip4address(self, ip4address: str, - *args: typing.Any, **kwargs: typing.Any) -> None: - self.enter_step('set_ip4add_prepare') - try: - ip4address_prep = self.pack_ip4address(ip4address) - except (ValueError, IndexError): - self.log.error("Invalid ip4address: %s", ip4address) - return - prep: typing.Dict[str, DbValue] = { - 'val': ip4address_prep, - } - self._set_generic( - 'ip4add', - 'SELECT entry FROM ip4address WHERE val=:val', - 'INSERT INTO ip4address (val, entry) ' - 'VALUES (:val, :entry)', - prep, - *args, **kwargs - ) - - def set_zone(self, zone: str, - *args: typing.Any, **kwargs: typing.Any) -> None: - self.enter_step('set_zone_prepare') - prep: typing.Dict[str, DbValue] = { - 'val': self.pack_zone(zone), - } - self._set_generic( - 'zone', - 'SELECT entry FROM zone WHERE val=:val', - 'INSERT INTO zone (val, entry) ' - 'VALUES (:val, :entry)', - prep, - *args, **kwargs - ) - - def set_ip4network(self, ip4network: str, - *args: typing.Any, **kwargs: typing.Any) -> None: - self.enter_step('set_ip4net_prepare') - try: - ip4network_prep = self.pack_ip4network(ip4network) - except (ValueError, IndexError): - self.log.error("Invalid ip4network: %s", ip4network) - return - prep: typing.Dict[str, DbValue] = { - 'mini': ip4network_prep[0], - 'maxi': ip4network_prep[1], - } - self._set_generic( - 'ip4net', - 'SELECT entry FROM ip4network WHERE mini=:mini AND maxi=:maxi', - 'INSERT INTO ip4network (mini, maxi, entry) ' - 'VALUES (:mini, :maxi, :entry)', - prep, - *args, **kwargs - ) - - -if __name__ == '__main__': - - # Parsing arguments - parser = argparse.ArgumentParser( - description="Database operations") - parser.add_argument( - '-i', '--initialize', action='store_true', - help="Reconstruct the whole database") - parser.add_argument( - '-p', '--prune', action='store_true', - help="Remove old entries from database") - parser.add_argument( - '-b', '--prune-base', action='store_true', - help="TODO") - parser.add_argument( - '-s', '--prune-before', type=int, - default=(int(time.time()) - 60*60*24*31*6), - help="TODO") - parser.add_argument( - '-r', '--references', action='store_true', - help="Update the reference count") - args = parser.parse_args() - - DB = Database(write=True) - - if args.initialize: - DB.initialize() - if args.prune: - DB.prune(before=args.prune_before, base_only=args.prune_base) - if args.references: - DB.update_references() - - DB.close() + def set_ip4network(self, + ip4network_str: str, + updated: int, + is_first_party: bool = None, + source: TypedPath = None) -> None: + self.enter_step('set_ip4net_pack') + if is_first_party or source: + raise NotImplementedError + self.enter_step('set_ip4net_brws') + ip4network = self.pack_ip4network(ip4network_str) + dic = self.ip4tree + for part in ip4network: + if dic.match: + # Refuse to add ip4network whose parent network + # is already matching + return + next_dic = dic.children[part] + if next_dic is None: + 
next_dic = IpTreeNode() + dic.children[part] = next_dic + dic = next_dic + dic.match = (updated, DebugPath, 0) diff --git a/database_schema.sql b/database_schema.sql deleted file mode 100644 index 3116a09..0000000 --- a/database_schema.sql +++ /dev/null @@ -1,59 +0,0 @@ --- Remember to increment DB_VERSION --- in database.py on changes to this file - -CREATE TABLE rules ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - source INTEGER, -- The rule this one is based on - updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes) - first_party INTEGER, -- 1: this blocks a first party for sure, 0: maybe - refs INTEGER, -- Number of entries issued from this one - level INTEGER, -- Level of recursion to the root source rule (used for source priority) - FOREIGN KEY (source) REFERENCES rules(id) ON DELETE CASCADE -); -CREATE INDEX rules_source ON rules (source); -- for references recounting -CREATE INDEX rules_updated ON rules (updated); -- for pruning -CREATE INDEX rules_level_firstparty ON rules (level, first_party); -- for counting rules - -CREATE TABLE asn ( - val INTEGER PRIMARY KEY, - entry INTEGER, - FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE -); -CREATE INDEX asn_entry ON asn (entry); -- for explainations - -CREATE TABLE hostname ( - val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for consistency with zone) - entry INTEGER, - FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE -); -CREATE INDEX hostname_entry ON hostname (entry); -- for explainations - -CREATE TABLE zone ( - val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for easier matching) - entry INTEGER, - FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE -); -CREATE INDEX zone_entry ON zone (entry); -- for explainations - -CREATE TABLE ip4address ( - val INTEGER PRIMARY KEY, - entry INTEGER, - FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE -); -CREATE INDEX ip4address_entry ON ip4address (entry); -- for explainations - -CREATE TABLE ip4network ( - -- val TEXT PRIMARY KEY, - mini INTEGER, - maxi INTEGER, - entry INTEGER, - FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE -); -CREATE INDEX ip4network_minmax ON ip4network (mini, maxi); -CREATE INDEX ip4network_entry ON ip4network (entry); -- for explainations - --- Store various things -CREATE TABLE meta ( - key TEXT PRIMARY KEY, - value integer -); diff --git a/export.py b/export.py index 886582c..bca3281 100755 --- a/export.py +++ b/export.py @@ -45,5 +45,3 @@ if __name__ == '__main__': explain=args.explain, ): print(domain, file=args.output) - - DB.close() diff --git a/feed_asn.py b/feed_asn.py index 098f931..ead63fe 100755 --- a/feed_asn.py +++ b/feed_asn.py @@ -31,23 +31,25 @@ if __name__ == '__main__': args = parser.parse_args() DB = database.Database() - DBW = database.Database(write=True) - for asn, entry in DB.list_asn(): + for path in DB.list_asn(): + ptype, asn = path + assert ptype == database.PathType.Asn + assert isinstance(asn, int) + asn_str = database.Database.unpack_asn(asn) DB.enter_step('asn_get_ranges') - for prefix in get_ranges(asn): + for prefix in get_ranges(asn_str): parsed_prefix: IPNetwork = ipaddress.ip_network(prefix) if parsed_prefix.version == 4: - DBW.set_ip4network( + DB.set_ip4network( prefix, - source=entry, + # source=path, updated=int(time.time()) ) - log.info('Added %s from %s (id=%s)', prefix, asn, entry) + log.info('Added %s from %s (source=%s)', prefix, asn, path) elif parsed_prefix.version == 6: log.warning('Unimplemented prefix version: %s', prefix) else: 
log.error('Unknown prefix version: %s', prefix) - DB.close() - DBW.close() + DB.save() diff --git a/feed_dns.py b/feed_dns.py index 4b01814..3acad9a 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -6,126 +6,52 @@ import json import logging import sys import typing -import multiprocessing import enum RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR') Record = typing.Tuple[RecordType, int, str, str] -# select, confirm, write +# select, write FUNCTION_MAP: typing.Any = { RecordType.A: ( database.Database.get_ip4, - database.Database.get_domain_in_zone, database.Database.set_hostname, ), RecordType.CNAME: ( database.Database.get_domain, - database.Database.get_domain_in_zone, database.Database.set_hostname, ), RecordType.PTR: ( database.Database.get_domain, - database.Database.get_ip4_in_network, database.Database.set_ip4address, ), } -class Reader(multiprocessing.Process): - def __init__(self, - recs_queue: multiprocessing.Queue, - write_queue: multiprocessing.Queue, - index: int = 0): - super(Reader, self).__init__() - self.log = logging.getLogger(f'rd{index:03d}') - self.recs_queue = recs_queue - self.write_queue = write_queue - self.index = index - - def run(self) -> None: - self.db = database.Database(write=False) - self.db.log = logging.getLogger(f'db{self.index:03d}') - self.db.enter_step('line_wait') - block: typing.List[str] - try: - for block in iter(self.recs_queue.get, None): - record: Record - for record in block: - # print(55, record) - dtype, updated, name, value = record - self.db.enter_step('feed_switch') - select, confirm, write = FUNCTION_MAP[dtype] - for rule in select(self.db, value): - # print(60, rule, list(confirm(self.db, name))) - if not any(confirm(self.db, name)): - # print(62, write, name, updated, rule) - self.db.enter_step('wait_put') - self.write_queue.put((write, name, updated, rule)) - self.db.enter_step('line_wait') - except KeyboardInterrupt: - self.log.error('Interrupted') - - self.db.enter_step('end') - self.db.close() - - -class Writer(multiprocessing.Process): - def __init__(self, - write_queue: multiprocessing.Queue, - ): - super(Writer, self).__init__() - self.log = logging.getLogger(f'wr ') - self.write_queue = write_queue - - def run(self) -> None: - self.db = database.Database(write=True) - self.db.log = logging.getLogger(f'dbw ') - self.db.enter_step('line_wait') - block: typing.List[str] - try: - fun: typing.Callable - name: str - updated: int - source: int - for fun, name, updated, source in iter(self.write_queue.get, None): - self.db.enter_step('exec') - fun(self.db, name, updated, source=source) - self.db.enter_step('line_wait') - except KeyboardInterrupt: - self.log.error('Interrupted') - - self.db.enter_step('end') - self.db.close() - - class Parser(): - def __init__(self, - buf: typing.Any, - recs_queue: multiprocessing.Queue, - block_size: int, - ): - super(Parser, self).__init__() + def __init__(self, buf: typing.Any) -> None: self.buf = buf - self.log = logging.getLogger('pr ') - self.recs_queue = recs_queue - self.block: typing.List[Record] = list() - self.block_size = block_size - self.db = database.Database() # Just for timing - self.db.log = logging.getLogger('pr ') + self.log = logging.getLogger('parser') + self.db = database.Database() + + def end(self) -> None: + self.db.save() + + def register(self, + rtype: RecordType, + updated: int, + name: str, + value: str + ) -> None: - def register(self, record: Record) -> None: self.db.enter_step('register') - self.block.append(record) - if len(self.block) >= self.block_size: - 
self.db.enter_step('put_block') - self.recs_queue.put(self.block) - self.block = list() - - def run(self) -> None: - self.consume() - self.recs_queue.put(self.block) - self.db.close() + select, write = FUNCTION_MAP[rtype] + try: + for source in select(self.db, value): + # write(self.db, name, updated, source=source) + write(self.db, name, updated) + except NotImplementedError: + return # DEBUG def consume(self) -> None: raise NotImplementedError @@ -146,13 +72,12 @@ class Rapid7Parser(Parser): data = json.loads(line) except json.decoder.JSONDecodeError: continue - record = ( + self.register( Rapid7Parser.TYPES[data['type']], int(data['timestamp']), data['name'], data['value'] ) - self.register(record) class DnsMassParser(Parser): @@ -182,13 +107,12 @@ class DnsMassParser(Parser): else: dtype, name_offset, value_offset = \ DnsMassParser.TYPES[split[1]] - record = ( + self.register( dtype, timestamp, split[0][:name_offset], split[2][:value_offset], ) - self.register(record) self.db.enter_step('parse_dnsmass') except KeyError: continue @@ -212,49 +136,12 @@ if __name__ == '__main__': args_parser.add_argument( '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, help="TODO") - args_parser.add_argument( - '-j', '--workers', type=int, default=4, - help="TODO") - args_parser.add_argument( - '-b', '--block-size', type=int, default=100, - help="TODO") args = args_parser.parse_args() - DB = database.Database(write=False) # Not needed, just for timing - DB.log = logging.getLogger('db ') - - recs_queue: multiprocessing.Queue = multiprocessing.Queue( - maxsize=10*args.workers) - write_queue: multiprocessing.Queue = multiprocessing.Queue( - maxsize=10*args.workers) - - DB.enter_step('proc_create') - readers: typing.List[Reader] = list() - for w in range(args.workers): - readers.append(Reader(recs_queue, write_queue, w)) - writer = Writer(write_queue) - parser = PARSERS[args.parser]( - args.input, recs_queue, args.block_size) - - DB.enter_step('proc_start') - for reader in readers: - reader.start() - writer.start() - + parser = PARSERS[args.parser](args.input) try: - DB.enter_step('parser_run') - parser.run() - - DB.enter_step('end_put') - for _ in range(args.workers): - recs_queue.put(None) - write_queue.put(None) - - DB.enter_step('proc_join') - for reader in readers: - reader.join() - writer.join() + parser.consume() except KeyboardInterrupt: - log.error('Interrupted') + pass + parser.end() - DB.close() diff --git a/feed_rules.py b/feed_rules.py index 715126e..cca1261 100755 --- a/feed_rules.py +++ b/feed_rules.py @@ -28,15 +28,15 @@ if __name__ == '__main__': help="The input only comes from verified first-party sources") args = parser.parse_args() - DB = database.Database(write=True) + DB = database.Database() fun = FUNCTION_MAP[args.type] for rule in args.input: fun(DB, rule.strip(), - is_first_party=args.first_party, + # is_first_party=args.first_party, updated=int(time.time()), ) - DB.close() + DB.save() diff --git a/import_rules.sh b/import_rules.sh index 33c4fbd..cdeec93 100755 --- a/import_rules.sh +++ b/import_rules.sh @@ -6,11 +6,11 @@ function log() { log "Importing rules…" BEFORE="$(date +%s)" -cat rules_adblock/*.txt | grep -v '^!' 
| grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone -cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone -cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone -cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network -cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn +# cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone +# cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone +# cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone +# cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network +# cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party @@ -19,4 +19,4 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as ./feed_asn.py log "Pruning old rules…" -./database.py --prune --prune-before "$BEFORE" --prune-base +./db.py --prune --prune-before "$BEFORE" --prune-base From d976752797f7e8ecd8dbfa89a17b08745acf154e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Sun, 15 Dec 2019 16:26:18 +0100 Subject: [PATCH 17/40] Store Ip4Path as int instead of List[int] --- database.py | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/database.py b/database.py index 2d970e3..12569d3 100644 --- a/database.py +++ b/database.py @@ -20,7 +20,7 @@ PathType = enum.Enum('PathType', 'Rule Hostname Zone Asn Ip4 Ip6') RulePath = typing.Union[None] Asn = int DomainPath = typing.List[str] -Ip4Path = typing.List[int] +Ip4Path = typing.Tuple[int, int] # value, prefixlen Ip6Path = typing.List[int] Path = typing.Union[RulePath, DomainPath, Asn, Ip4Path, Ip6Path] TypedPath = typing.Tuple[PathType, Path] @@ -139,33 +139,33 @@ class Database(Profiler): @staticmethod def pack_ip4address(address: str) -> Ip4Path: - addr: Ip4Path = [0] * 32 - octets = [int(octet) for octet in address.split('.')] - for b in range(32): - if (octets[b//8] >> b % 8) & 0b1: - addr[b] = 1 - return addr + addr = 0 + for split in address.split('.'): + addr = addr << 4 + int(split) + return (addr, 32) @staticmethod def unpack_ip4address(address: Ip4Path) -> str: + addr, prefixlen = address + assert prefixlen == 32 + octets: typing.List[int] = list() octets = [0] * 4 - for b, bit in enumerate(address): - octets[b//8] = (octets[b//8] << 1) + bit + for o in reversed(range(4)): + octets[o] = addr & 0xFF + addr >>= 8 return '.'.join(map(str, octets)) @staticmethod def pack_ip4network(network: str) -> Ip4Path: address, prefixlen_str = network.split('/') prefixlen = int(prefixlen_str) - return Database.pack_ip4address(address)[:prefixlen] + addr, _ = Database.pack_ip4address(address) + return (addr, prefixlen) @staticmethod def unpack_ip4network(network: Ip4Path) -> str: - address = network.copy() - prefixlen = len(network) - for _ in range(32-prefixlen): - address.append(0) - addr = Database.unpack_ip4address(address) + address, prefixlen = network + addr = Database.unpack_ip4address((address, 32)) return f'{addr}/{prefixlen}' def update_references(self) -> None: @@ -224,20 +224,19 @@ class Database(Profiler): def get_ip4(self, ip4_str: str) 
-> typing.Iterable[TypedPath]: self.enter_step('get_ip4_pack') - ip4 = self.pack_ip4address(ip4_str) + ip4, prefixlen = self.pack_ip4address(ip4_str) self.enter_step('get_ip4_brws') dic = self.ip4tree - depth = 0 - for part in ip4: + for i in reversed(range(prefixlen)): + part = (ip4 >> i) & 0b1 if dic.match: self.enter_step('get_ip4_yield') - yield (PathType.Ip4, ip4[:depth]) + yield (PathType.Ip4, (ip4, 32-i)) self.enter_step('get_ip4_brws') next_dic = dic.children[part] if next_dic is None: return dic = next_dic - depth += 1 if dic.match: self.enter_step('get_ip4_yield') yield (PathType.Ip4, ip4) @@ -307,10 +306,11 @@ class Database(Profiler): self.enter_step('set_ip4add_pack') if is_first_party or source: raise NotImplementedError + ip4, prefixlen = self.pack_ip4address(ip4address_str) self.enter_step('set_ip4add_brws') - ip4address = self.pack_ip4address(ip4address_str) dic = self.ip4tree - for part in ip4address: + for i in reversed(range(prefixlen)): + part = (ip4 >> i) & 0b1 if dic.match: # Refuse to add ip4address whose network is already matching return @@ -330,9 +330,10 @@ class Database(Profiler): if is_first_party or source: raise NotImplementedError self.enter_step('set_ip4net_brws') - ip4network = self.pack_ip4network(ip4network_str) + ip4, prefixlen = self.pack_ip4network(ip4network_str) dic = self.ip4tree - for part in ip4network: + for i in reversed(range(prefixlen)): + part = (ip4 >> i) & 0b1 if dic.match: # Refuse to add ip4network whose parent network # is already matching From 954b33b2a63e1ef39f74d89a35bbf9ed6dd2fc78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Sun, 15 Dec 2019 16:38:01 +0100 Subject: [PATCH 18/40] Slightly better Rapid7 parser --- feed_dns.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/feed_dns.py b/feed_dns.py index 3acad9a..5adf770 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -2,7 +2,6 @@ import argparse import database -import json import logging import sys import typing @@ -66,12 +65,16 @@ class Rapid7Parser(Parser): } def consume(self) -> None: + data = dict() for line in self.buf: self.db.enter_step('parse_rapid7') - try: - data = json.loads(line) - except json.decoder.JSONDecodeError: - continue + split = line.split('"') + + for k in range(1, 14, 4): + key = split[k] + val = split[k+2] + data[key] = val + self.register( Rapid7Parser.TYPES[data['type']], int(data['timestamp']), From ce52897d305ac44bd11af89dc7e68e63d45389c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Sun, 15 Dec 2019 16:48:17 +0100 Subject: [PATCH 19/40] Smol fixes --- database.py | 2 +- feed_dns.py | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/database.py b/database.py index 12569d3..7e78a9d 100644 --- a/database.py +++ b/database.py @@ -141,7 +141,7 @@ class Database(Profiler): def pack_ip4address(address: str) -> Ip4Path: addr = 0 for split in address.split('.'): - addr = addr << 4 + int(split) + addr = (addr << 8) + int(split) return (addr, 32) @staticmethod diff --git a/feed_dns.py b/feed_dns.py index 5adf770..b106968 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -45,12 +45,9 @@ class Parser(): self.db.enter_step('register') select, write = FUNCTION_MAP[rtype] - try: - for source in select(self.db, value): - # write(self.db, name, updated, source=source) - write(self.db, name, updated) - except NotImplementedError: - return # DEBUG + for source in select(self.db, value): + # write(self.db, name, 
updated, source=source) + write(self.db, name, updated) def consume(self) -> None: raise NotImplementedError From 45325782d2c5c6dfda93d12f4468588871c5a8ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Sun, 15 Dec 2019 17:05:41 +0100 Subject: [PATCH 20/40] Multi-processed parser --- feed_dns.py | 111 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 28 deletions(-) diff --git a/feed_dns.py b/feed_dns.py index b106968..c2438d8 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -5,6 +5,7 @@ import database import logging import sys import typing +import multiprocessing import enum RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR') @@ -27,27 +28,66 @@ FUNCTION_MAP: typing.Any = { } -class Parser(): - def __init__(self, buf: typing.Any) -> None: - self.buf = buf - self.log = logging.getLogger('parser') - self.db = database.Database() +class Writer(multiprocessing.Process): + def __init__(self, + recs_queue: multiprocessing.Queue, + index: int = 0): + super(Writer, self).__init__() + self.log = logging.getLogger(f'wr') + self.recs_queue = recs_queue - def end(self) -> None: + def run(self) -> None: + self.db = database.Database() + self.db.log = logging.getLogger(f'wr') + + self.db.enter_step('block_wait') + block: typing.List[Record] + for block in iter(self.recs_queue.get, None): + + record: Record + for record in block: + + rtype, updated, name, value = record + self.db.enter_step('feed_switch') + + select, write = FUNCTION_MAP[rtype] + for source in select(self.db, value): + # write(self.db, name, updated, source=source) + write(self.db, name, updated) + + self.db.enter_step('block_wait') + + self.db.enter_step('end') self.db.save() - def register(self, - rtype: RecordType, - updated: int, - name: str, - value: str - ) -> None: - self.db.enter_step('register') - select, write = FUNCTION_MAP[rtype] - for source in select(self.db, value): - # write(self.db, name, updated, source=source) - write(self.db, name, updated) +class Parser(): + def __init__(self, + buf: typing.Any, + recs_queue: multiprocessing.Queue, + block_size: int, + ): + super(Parser, self).__init__() + self.buf = buf + self.log = logging.getLogger('pr') + self.recs_queue = recs_queue + self.block: typing.List[Record] = list() + self.block_size = block_size + self.prof = database.Profiler() + self.prof.log = logging.getLogger('pr') + + def register(self, record: Record) -> None: + self.prof.enter_step('register') + self.block.append(record) + if len(self.block) >= self.block_size: + self.prof.enter_step('put_block') + self.recs_queue.put(self.block) + self.block = list() + + def run(self) -> None: + self.consume() + self.recs_queue.put(self.block) + self.prof.profile() def consume(self) -> None: raise NotImplementedError @@ -64,7 +104,7 @@ class Rapid7Parser(Parser): def consume(self) -> None: data = dict() for line in self.buf: - self.db.enter_step('parse_rapid7') + self.prof.enter_step('parse_rapid7') split = line.split('"') for k in range(1, 14, 4): @@ -72,12 +112,13 @@ class Rapid7Parser(Parser): val = split[k+2] data[key] = val - self.register( + record = ( Rapid7Parser.TYPES[data['type']], int(data['timestamp']), data['name'], data['value'] ) + self.register(record) class DnsMassParser(Parser): @@ -90,7 +131,7 @@ class DnsMassParser(Parser): } def consume(self) -> None: - self.db.enter_step('parse_dnsmass') + self.prof.enter_step('parse_dnsmass') timestamp = 0 header = True for line in self.buf: @@ -107,13 +148,14 @@ class 
DnsMassParser(Parser): else: dtype, name_offset, value_offset = \ DnsMassParser.TYPES[split[1]] - self.register( + record = ( dtype, timestamp, split[0][:name_offset], split[2][:value_offset], ) - self.db.enter_step('parse_dnsmass') + self.register(record) + self.prof.enter_step('parse_dnsmass') except KeyError: continue @@ -136,12 +178,25 @@ if __name__ == '__main__': args_parser.add_argument( '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, help="TODO") + args_parser.add_argument( + '-j', '--workers', type=int, default=4, + help="TODO") + args_parser.add_argument( + '-b', '--block-size', type=int, default=100, + help="TODO") + args_parser.add_argument( + '-q', '--queue-size', type=int, default=10, + help="TODO") args = args_parser.parse_args() - parser = PARSERS[args.parser](args.input) - try: - parser.consume() - except KeyboardInterrupt: - pass - parser.end() + recs_queue: multiprocessing.Queue = multiprocessing.Queue( + maxsize=args.queue_size) + writer = Writer(recs_queue) + writer.start() + + parser = PARSERS[args.parser](args.input, recs_queue, args.block_size) + parser.run() + + recs_queue.put(None) + writer.join() From 7af2074c7a3861c54cba8e46480310e20bdbe23b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Sun, 15 Dec 2019 17:12:44 +0100 Subject: [PATCH 21/40] Small optimisation of feed_switch --- feed_dns.py | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/feed_dns.py b/feed_dns.py index c2438d8..d72dc49 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -8,20 +8,19 @@ import typing import multiprocessing import enum -RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR') -Record = typing.Tuple[RecordType, int, str, str] +Record = typing.Tuple[typing.Callable, typing.Callable, int, str, str] # select, write FUNCTION_MAP: typing.Any = { - RecordType.A: ( + 'a': ( database.Database.get_ip4, database.Database.set_hostname, ), - RecordType.CNAME: ( + 'cname': ( database.Database.get_domain, database.Database.set_hostname, ), - RecordType.PTR: ( + 'ptr': ( database.Database.get_domain, database.Database.set_ip4address, ), @@ -47,10 +46,9 @@ class Writer(multiprocessing.Process): record: Record for record in block: - rtype, updated, name, value = record + select, write, updated, name, value = record self.db.enter_step('feed_switch') - select, write = FUNCTION_MAP[rtype] for source in select(self.db, value): # write(self.db, name, updated, source=source) write(self.db, name, updated) @@ -94,13 +92,6 @@ class Parser(): class Rapid7Parser(Parser): - TYPES = { - 'a': RecordType.A, - 'aaaa': RecordType.AAAA, - 'cname': RecordType.CNAME, - 'ptr': RecordType.PTR, - } - def consume(self) -> None: data = dict() for line in self.buf: @@ -112,8 +103,10 @@ class Rapid7Parser(Parser): val = split[k+2] data[key] = val + select, writer = FUNCTION_MAP[data['type']] record = ( - Rapid7Parser.TYPES[data['type']], + select, + writer, int(data['timestamp']), data['name'], data['value'] @@ -125,9 +118,9 @@ class DnsMassParser(Parser): # dnsmass --output Snrql # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4 TYPES = { - 'A': (RecordType.A, -1, None), - 'AAAA': (RecordType.AAAA, -1, None), - 'CNAME': (RecordType.CNAME, -1, -1), + 'A': (FUNCTION_MAP['a'][0], FUNCTION_MAP['a'][1], -1, None), + # 'AAAA': (FUNCTION_MAP['aaaa'][0], FUNCTION_MAP['aaaa'][1], -1, None), + 'CNAME': (FUNCTION_MAP['cname'][0], FUNCTION_MAP['cname'][1], -1, -1), } def consume(self) -> None: @@ -146,10 +139,11 
@@ class DnsMassParser(Parser): timestamp = int(split[1]) header = False else: - dtype, name_offset, value_offset = \ + select, write, name_offset, value_offset = \ DnsMassParser.TYPES[split[1]] record = ( - dtype, + select, + write, timestamp, split[0][:name_offset], split[2][:value_offset], From aec8d3f8de50f4cef07f22992d1530880674faad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Sun, 15 Dec 2019 22:21:05 +0100 Subject: [PATCH 22/40] Reworked how paths work Get those tuples out of my eyes --- database.py | 278 ++++++++++++++++++++++++++++++++---------------- feed_asn.py | 7 +- feed_dns.old.py | 147 +++++++++++++++++++++++++ feed_dns.py | 36 ++++--- 4 files changed, 354 insertions(+), 114 deletions(-) create mode 100755 feed_dns.old.py diff --git a/database.py b/database.py index 7e78a9d..fc2855e 100644 --- a/database.py +++ b/database.py @@ -16,19 +16,48 @@ coloredlogs.install( fmt='%(asctime)s %(name)s %(levelname)s %(message)s' ) -PathType = enum.Enum('PathType', 'Rule Hostname Zone Asn Ip4 Ip6') -RulePath = typing.Union[None] Asn = int -DomainPath = typing.List[str] -Ip4Path = typing.Tuple[int, int] # value, prefixlen -Ip6Path = typing.List[int] -Path = typing.Union[RulePath, DomainPath, Asn, Ip4Path, Ip6Path] -TypedPath = typing.Tuple[PathType, Path] Timestamp = int Level = int -Match = typing.Tuple[Timestamp, TypedPath, Level] -DebugPath = (PathType.Rule, None) + +class Path(): + pass + + +class RulePath(Path): + pass + + +class DomainPath(Path): + def __init__(self, path: typing.List[str]): + self.path = path + + +class HostnamePath(DomainPath): + pass + + +class ZonePath(DomainPath): + pass + + +class AsnPath(Path): + def __init__(self, asn: Asn): + self.asn = asn + + +class Ip4Path(Path): + def __init__(self, value: int, prefixlen: int): + self.value = value + self.prefixlen = prefixlen + + +Match = typing.Tuple[Timestamp, Path, Level] + +# class AsnNode(): +# def __init__(self, asn: int) -> None: +# self.asn = asn class DomainTreeNode(): @@ -44,6 +73,13 @@ class IpTreeNode(): self.match: typing.Optional[Match] = None +Node = typing.Union[DomainTreeNode, IpTreeNode, Asn] +NodeCallable = typing.Callable[[Path, + Node, + typing.Optional[typing.Any]], + typing.Any] + + class Profiler(): def __init__(self) -> None: self.log = logging.getLogger('profiler') @@ -53,6 +89,7 @@ class Profiler(): self.step_dict: typing.Dict[str, int] = dict() def enter_step(self, name: str) -> None: + return now = time.perf_counter() try: self.time_dict[self.time_step] += now - self.time_last @@ -75,7 +112,7 @@ class Profiler(): class Database(Profiler): - VERSION = 8 + VERSION = 9 PATH = "blocking.p" def initialize(self) -> None: @@ -120,34 +157,34 @@ class Database(Profiler): @staticmethod def pack_domain(domain: str) -> DomainPath: - return domain.split('.')[::-1] + return DomainPath(domain.split('.')[::-1]) @staticmethod def unpack_domain(domain: DomainPath) -> str: - return '.'.join(domain[::-1]) + return '.'.join(domain.path[::-1]) @staticmethod - def pack_asn(asn: str) -> int: + def pack_asn(asn: str) -> AsnPath: asn = asn.upper() if asn.startswith('AS'): asn = asn[2:] - return int(asn) + return AsnPath(int(asn)) @staticmethod - def unpack_asn(asn: int) -> str: - return f'AS{asn}' + def unpack_asn(asn: AsnPath) -> str: + return f'AS{asn.asn}' @staticmethod def pack_ip4address(address: str) -> Ip4Path: addr = 0 for split in address.split('.'): addr = (addr << 8) + int(split) - return (addr, 32) + return Ip4Path(addr, 32) @staticmethod def 
unpack_ip4address(address: Ip4Path) -> str: - addr, prefixlen = address - assert prefixlen == 32 + addr = address.value + assert address.prefixlen == 32 octets: typing.List[int] = list() octets = [0] * 4 for o in reversed(range(4)): @@ -159,14 +196,76 @@ class Database(Profiler): def pack_ip4network(network: str) -> Ip4Path: address, prefixlen_str = network.split('/') prefixlen = int(prefixlen_str) - addr, _ = Database.pack_ip4address(address) - return (addr, prefixlen) + addr = Database.pack_ip4address(address) + addr.prefixlen = prefixlen + return addr @staticmethod def unpack_ip4network(network: Ip4Path) -> str: - address, prefixlen = network - addr = Database.unpack_ip4address((address, 32)) - return f'{addr}/{prefixlen}' + addr = network.value + octets: typing.List[int] = list() + octets = [0] * 4 + for o in reversed(range(4)): + octets[o] = addr & 0xFF + addr >>= 8 + return '.'.join(map(str, octets)) + '/' + str(network.prefixlen) + + def exec_each_domain(self, + callback: NodeCallable, + arg: typing.Any = None, + _dic: DomainTreeNode = None, + _par: DomainPath = None, + ) -> typing.Any: + _dic = _dic or self.domtree + _par = _par or DomainPath([]) + yield from callback(_par, _dic, arg) + for part in _dic.children: + dic = _dic.children[part] + yield from self.exec_each_domain( + callback, + arg, + _dic=dic, + _par=DomainPath(_par.path + [part]) + ) + + def exec_each_ip4(self, + callback: NodeCallable, + arg: typing.Any = None, + _dic: IpTreeNode = None, + _par: Ip4Path = None, + ) -> typing.Any: + _dic = _dic or self.ip4tree + _par = _par or Ip4Path(0, 0) + callback(_par, _dic, arg) + + # 0 + dic = _dic.children[0] + if dic: + addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-_par.prefixlen))) + assert addr0 == _par.value + yield from self.exec_each_ip4( + callback, + arg, + _dic=dic, + _par=Ip4Path(addr0, _par.prefixlen+1) + ) + # 1 + dic = _dic.children[1] + if dic: + addr1 = _par.value | (1 << (32-_par.prefixlen)) + yield from self.exec_each_ip4( + callback, + arg, + _dic=dic, + _par=Ip4Path(addr1, _par.prefixlen+1) + ) + + def exec_each(self, + callback: NodeCallable, + arg: typing.Any = None, + ) -> typing.Any: + yield from self.exec_each_domain(callback) + yield from self.exec_each_ip4(callback) def update_references(self) -> None: raise NotImplementedError @@ -181,35 +280,35 @@ class Database(Profiler): first_party_only: bool = False, end_chain_only: bool = False, explain: bool = False, - _dic: DomainTreeNode = None, - _par: DomainPath = None, ) -> typing.Iterable[str]: if first_party_only or end_chain_only or explain: raise NotImplementedError - _dic = _dic or self.domtree - _par = _par or list() - if _dic.match_hostname: - yield self.unpack_domain(_par) - for part in _dic.children: - dic = _dic.children[part] - yield from self.export(_dic=dic, - _par=_par + [part]) + + def export_cb(path: Path, node: Node, _: typing.Any + ) -> typing.Iterable[str]: + assert isinstance(path, DomainPath) + assert isinstance(node, DomainTreeNode) + if node.match_hostname: + a = self.unpack_domain(path) + yield a + + yield from self.exec_each_domain(export_cb, None) def count_rules(self, first_party_only: bool = False, ) -> str: raise NotImplementedError - def get_domain(self, domain_str: str) -> typing.Iterable[TypedPath]: + def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]: self.enter_step('get_domain_pack') domain = self.pack_domain(domain_str) self.enter_step('get_domain_brws') dic = self.domtree depth = 0 - for part in domain: + for part in domain.path: if dic.match_zone: 
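                # Domains are packed label-reversed ('ads.example.com' becomes
                # ['com', 'example', 'ads']), so any zone rule sits on the path
                # from the root to the queried hostname: every node with
                # match_zone set that the walk passes is a zone covering it.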
self.enter_step('get_domain_yield') - yield (PathType.Zone, domain[:depth]) + yield ZonePath(domain.path[:depth]) self.enter_step('get_domain_brws') if part not in dic.children: return @@ -217,21 +316,21 @@ class Database(Profiler): depth += 1 if dic.match_zone: self.enter_step('get_domain_yield') - yield (PathType.Zone, domain) + yield ZonePath(domain.path) if dic.match_hostname: self.enter_step('get_domain_yield') - yield (PathType.Hostname, domain) + yield HostnamePath(domain.path) - def get_ip4(self, ip4_str: str) -> typing.Iterable[TypedPath]: + def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]: self.enter_step('get_ip4_pack') - ip4, prefixlen = self.pack_ip4address(ip4_str) + ip4 = self.pack_ip4address(ip4_str) self.enter_step('get_ip4_brws') dic = self.ip4tree - for i in reversed(range(prefixlen)): - part = (ip4 >> i) & 0b1 + for i in reversed(range(ip4.prefixlen)): + part = (ip4.value >> i) & 0b1 if dic.match: self.enter_step('get_ip4_yield') - yield (PathType.Ip4, (ip4, 32-i)) + yield Ip4Path(ip4.value, 32-i) self.enter_step('get_ip4_brws') next_dic = dic.children[part] if next_dic is None: @@ -239,108 +338,99 @@ class Database(Profiler): dic = next_dic if dic.match: self.enter_step('get_ip4_yield') - yield (PathType.Ip4, ip4) + yield ip4 - def list_asn(self) -> typing.Iterable[TypedPath]: + def list_asn(self) -> typing.Iterable[AsnPath]: for asn in self.asns: - yield (PathType.Asn, asn) + yield AsnPath(asn) def set_hostname(self, hostname_str: str, updated: int, is_first_party: bool = None, - source: TypedPath = None) -> None: + source: Path = None) -> None: self.enter_step('set_hostname_pack') - if is_first_party or source: + if is_first_party: raise NotImplementedError self.enter_step('set_hostname_brws') hostname = self.pack_domain(hostname_str) dic = self.domtree - for part in hostname: + for part in hostname.path: if dic.match_zone: # Refuse to add hostname whose zone is already matching return if part not in dic.children: dic.children[part] = DomainTreeNode() dic = dic.children[part] - dic.match_hostname = (updated, DebugPath, 0) + dic.match_hostname = (updated, source or RulePath(), 0) def set_zone(self, zone_str: str, updated: int, is_first_party: bool = None, - source: TypedPath = None) -> None: + source: Path = None) -> None: self.enter_step('set_zone_pack') - if is_first_party or source: + if is_first_party: raise NotImplementedError zone = self.pack_domain(zone_str) self.enter_step('set_zone_brws') dic = self.domtree - for part in zone: + for part in zone.path: if dic.match_zone: # Refuse to add zone whose parent zone is already matching return if part not in dic.children: dic.children[part] = DomainTreeNode() dic = dic.children[part] - dic.match_zone = (updated, DebugPath, 0) + dic.match_zone = (updated, source or RulePath(), 0) def set_asn(self, asn_str: str, updated: int, is_first_party: bool = None, - source: TypedPath = None) -> None: + source: Path = None) -> None: self.enter_step('set_asn_pack') if is_first_party or source: # TODO updated raise NotImplementedError asn = self.pack_asn(asn_str) self.enter_step('set_asn_brws') - self.asns.add(asn) + self.asns.add(asn.asn) + + def _set_ip4(self, + ip4: Ip4Path, + updated: int, + is_first_party: bool = None, + source: Path = None) -> None: + if is_first_party: + raise NotImplementedError + dic = self.ip4tree + for i in reversed(range(ip4.prefixlen)): + part = (ip4.value >> i) & 0b1 + if dic.match: + # Refuse to add ip4* whose network is already matching + return + next_dic = dic.children[part] + if next_dic 
is None: + next_dic = IpTreeNode() + dic.children[part] = next_dic + dic = next_dic + dic.match = (updated, source or RulePath(), 0) def set_ip4address(self, ip4address_str: str, - updated: int, - is_first_party: bool = None, - source: TypedPath = None) -> None: + *args: typing.Any, **kwargs: typing.Any + ) -> None: self.enter_step('set_ip4add_pack') - if is_first_party or source: - raise NotImplementedError - ip4, prefixlen = self.pack_ip4address(ip4address_str) + ip4 = self.pack_ip4address(ip4address_str) self.enter_step('set_ip4add_brws') - dic = self.ip4tree - for i in reversed(range(prefixlen)): - part = (ip4 >> i) & 0b1 - if dic.match: - # Refuse to add ip4address whose network is already matching - return - next_dic = dic.children[part] - if next_dic is None: - next_dic = IpTreeNode() - dic.children[part] = next_dic - dic = next_dic - dic.match = (updated, DebugPath, 0) + self._set_ip4(ip4, *args, **kwargs) def set_ip4network(self, ip4network_str: str, - updated: int, - is_first_party: bool = None, - source: TypedPath = None) -> None: + *args: typing.Any, **kwargs: typing.Any + ) -> None: self.enter_step('set_ip4net_pack') - if is_first_party or source: - raise NotImplementedError + ip4 = self.pack_ip4network(ip4network_str) self.enter_step('set_ip4net_brws') - ip4, prefixlen = self.pack_ip4network(ip4network_str) - dic = self.ip4tree - for i in reversed(range(prefixlen)): - part = (ip4 >> i) & 0b1 - if dic.match: - # Refuse to add ip4network whose parent network - # is already matching - return - next_dic = dic.children[part] - if next_dic is None: - next_dic = IpTreeNode() - dic.children[part] = next_dic - dic = next_dic - dic.match = (updated, DebugPath, 0) + self._set_ip4(ip4, *args, **kwargs) diff --git a/feed_asn.py b/feed_asn.py index ead63fe..aa311f8 100755 --- a/feed_asn.py +++ b/feed_asn.py @@ -33,10 +33,7 @@ if __name__ == '__main__': DB = database.Database() for path in DB.list_asn(): - ptype, asn = path - assert ptype == database.PathType.Asn - assert isinstance(asn, int) - asn_str = database.Database.unpack_asn(asn) + asn_str = database.Database.unpack_asn(path) DB.enter_step('asn_get_ranges') for prefix in get_ranges(asn_str): parsed_prefix: IPNetwork = ipaddress.ip_network(prefix) @@ -46,7 +43,7 @@ if __name__ == '__main__': # source=path, updated=int(time.time()) ) - log.info('Added %s from %s (source=%s)', prefix, asn, path) + log.info('Added %s from %s (%s)', prefix, asn_str, path) elif parsed_prefix.version == 6: log.warning('Unimplemented prefix version: %s', prefix) else: diff --git a/feed_dns.old.py b/feed_dns.old.py new file mode 100755 index 0000000..b106968 --- /dev/null +++ b/feed_dns.old.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 + +import argparse +import database +import logging +import sys +import typing +import enum + +RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR') +Record = typing.Tuple[RecordType, int, str, str] + +# select, write +FUNCTION_MAP: typing.Any = { + RecordType.A: ( + database.Database.get_ip4, + database.Database.set_hostname, + ), + RecordType.CNAME: ( + database.Database.get_domain, + database.Database.set_hostname, + ), + RecordType.PTR: ( + database.Database.get_domain, + database.Database.set_ip4address, + ), +} + + +class Parser(): + def __init__(self, buf: typing.Any) -> None: + self.buf = buf + self.log = logging.getLogger('parser') + self.db = database.Database() + + def end(self) -> None: + self.db.save() + + def register(self, + rtype: RecordType, + updated: int, + name: str, + value: str + ) -> None: + + 
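        # FUNCTION_MAP pairs a 'select' lookup with a 'write' setter per record
        # type: 'select' checks the record value against existing rules (the IP
        # tree for A records, the domain tree for CNAME/PTR), and for every
        # rule it matches, 'write' flags the other side of the record (the
        # hostname, or the IP address in the PTR case).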
self.db.enter_step('register') + select, write = FUNCTION_MAP[rtype] + for source in select(self.db, value): + # write(self.db, name, updated, source=source) + write(self.db, name, updated) + + def consume(self) -> None: + raise NotImplementedError + + +class Rapid7Parser(Parser): + TYPES = { + 'a': RecordType.A, + 'aaaa': RecordType.AAAA, + 'cname': RecordType.CNAME, + 'ptr': RecordType.PTR, + } + + def consume(self) -> None: + data = dict() + for line in self.buf: + self.db.enter_step('parse_rapid7') + split = line.split('"') + + for k in range(1, 14, 4): + key = split[k] + val = split[k+2] + data[key] = val + + self.register( + Rapid7Parser.TYPES[data['type']], + int(data['timestamp']), + data['name'], + data['value'] + ) + + +class DnsMassParser(Parser): + # dnsmass --output Snrql + # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4 + TYPES = { + 'A': (RecordType.A, -1, None), + 'AAAA': (RecordType.AAAA, -1, None), + 'CNAME': (RecordType.CNAME, -1, -1), + } + + def consume(self) -> None: + self.db.enter_step('parse_dnsmass') + timestamp = 0 + header = True + for line in self.buf: + line = line[:-1] + if not line: + header = True + continue + + split = line.split(' ') + try: + if header: + timestamp = int(split[1]) + header = False + else: + dtype, name_offset, value_offset = \ + DnsMassParser.TYPES[split[1]] + self.register( + dtype, + timestamp, + split[0][:name_offset], + split[2][:value_offset], + ) + self.db.enter_step('parse_dnsmass') + except KeyError: + continue + + +PARSERS = { + 'rapid7': Rapid7Parser, + 'dnsmass': DnsMassParser, +} + +if __name__ == '__main__': + + # Parsing arguments + log = logging.getLogger('feed_dns') + args_parser = argparse.ArgumentParser( + description="TODO") + args_parser.add_argument( + 'parser', + choices=PARSERS.keys(), + help="TODO") + args_parser.add_argument( + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, + help="TODO") + args = args_parser.parse_args() + + parser = PARSERS[args.parser](args.input) + try: + parser.consume() + except KeyboardInterrupt: + pass + parser.end() + diff --git a/feed_dns.py b/feed_dns.py index d72dc49..be08e98 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -49,9 +49,12 @@ class Writer(multiprocessing.Process): select, write, updated, name, value = record self.db.enter_step('feed_switch') - for source in select(self.db, value): - # write(self.db, name, updated, source=source) - write(self.db, name, updated) + try: + for source in select(self.db, value): + # write(self.db, name, updated, source=source) + write(self.db, name, updated) + except ValueError: + self.log.exception("Cannot execute: %s", record) self.db.enter_step('block_wait') @@ -98,19 +101,22 @@ class Rapid7Parser(Parser): self.prof.enter_step('parse_rapid7') split = line.split('"') - for k in range(1, 14, 4): - key = split[k] - val = split[k+2] - data[key] = val + try: + for k in range(1, 14, 4): + key = split[k] + val = split[k+2] + data[key] = val - select, writer = FUNCTION_MAP[data['type']] - record = ( - select, - writer, - int(data['timestamp']), - data['name'], - data['value'] - ) + select, writer = FUNCTION_MAP[data['type']] + record = ( + select, + writer, + int(data['timestamp']), + data['name'], + data['value'] + ) + except IndexError: + self.log.exception("Cannot parse: %s", line) self.register(record) From a0e68f08487e333c39b5056ed24eb925cb3ff3c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Sun, 15 Dec 2019 23:13:25 +0100 Subject: [PATCH 23/40] Reworked match and 
node system For level, and first_party later Next: add get_match to retrieve level of source and have correct levels ... am I going somewhere with all this? --- database.py | 140 +++++++++++++++++++++++++++++++--------------------- feed_asn.py | 2 +- 2 files changed, 84 insertions(+), 58 deletions(-) diff --git a/database.py b/database.py index fc2855e..5522013 100644 --- a/database.py +++ b/database.py @@ -9,7 +9,6 @@ import time import logging import coloredlogs import pickle -import enum coloredlogs.install( level='DEBUG', @@ -22,6 +21,7 @@ Level = int class Path(): + # FP add boolean here pass @@ -53,27 +53,46 @@ class Ip4Path(Path): self.prefixlen = prefixlen -Match = typing.Tuple[Timestamp, Path, Level] +class Match(): + def __init__(self) -> None: + self.updated: int = 0 + self.level: int = 0 + self.source: Path = RulePath() + # FP dupplicate args -# class AsnNode(): -# def __init__(self, asn: int) -> None: -# self.asn = asn + def set(self, + updated: int, + level: int, + source: Path, + ) -> None: + if updated > self.updated or level > self.level: + self.updated = updated + self.level = level + self.source = source + # FP dupplicate function + + def active(self) -> bool: + return self.updated > 0 + + +class AsnNode(Match): + pass class DomainTreeNode(): def __init__(self) -> None: self.children: typing.Dict[str, DomainTreeNode] = dict() - self.match_zone: typing.Optional[Match] = None - self.match_hostname: typing.Optional[Match] = None + self.match_zone = Match() + self.match_hostname = Match() class IpTreeNode(): def __init__(self) -> None: self.children: typing.List[typing.Optional[IpTreeNode]] = [None, None] - self.match: typing.Optional[Match] = None + self.match = Match() -Node = typing.Union[DomainTreeNode, IpTreeNode, Asn] +Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode] NodeCallable = typing.Callable[[Path, Node, typing.Optional[typing.Any]], @@ -112,7 +131,7 @@ class Profiler(): class Database(Profiler): - VERSION = 9 + VERSION = 10 PATH = "blocking.p" def initialize(self) -> None: @@ -120,7 +139,7 @@ class Database(Profiler): "Creating database version: %d ", Database.VERSION) self.domtree = DomainTreeNode() - self.asns: typing.Set[Asn] = set() + self.asns: typing.Dict[Asn, AsnNode] = dict() self.ip4tree = IpTreeNode() def load(self) -> None: @@ -133,12 +152,12 @@ class Database(Profiler): return self.log.warning( "Outdated database version found: %d, " - "will be rebuilt.", + "it will be rebuilt.", version) except (TypeError, AttributeError, EOFError): self.log.error( - "Corrupt database found, " - "will be rebuilt.") + "Corrupt (or heavily outdated) database found, " + "it will be rebuilt.") except FileNotFoundError: pass self.initialize() @@ -306,7 +325,7 @@ class Database(Profiler): dic = self.domtree depth = 0 for part in domain.path: - if dic.match_zone: + if dic.match_zone.active(): self.enter_step('get_domain_yield') yield ZonePath(domain.path[:depth]) self.enter_step('get_domain_brws') @@ -314,10 +333,10 @@ class Database(Profiler): return dic = dic.children[part] depth += 1 - if dic.match_zone: + if dic.match_zone.active(): self.enter_step('get_domain_yield') yield ZonePath(domain.path) - if dic.match_hostname: + if dic.match_hostname.active(): self.enter_step('get_domain_yield') yield HostnamePath(domain.path) @@ -328,7 +347,7 @@ class Database(Profiler): dic = self.ip4tree for i in reversed(range(ip4.prefixlen)): part = (ip4.value >> i) & 0b1 - if dic.match: + if dic.match.active(): self.enter_step('get_ip4_yield') yield Ip4Path(ip4.value, 32-i) 
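                # Intermediate nodes created while inserting a longer prefix
                # keep a blank Match (updated == 0), so match.active() is what
                # tells a real rule apart from a mere branch point; a hit on an
                # inner node means a stored network contains this address.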
self.enter_step('get_ip4_brws') @@ -336,7 +355,7 @@ class Database(Profiler): if next_dic is None: return dic = next_dic - if dic.match: + if dic.match.active(): self.enter_step('get_ip4_yield') yield ip4 @@ -344,58 +363,61 @@ class Database(Profiler): for asn in self.asns: yield AsnPath(asn) - def set_hostname(self, - hostname_str: str, - updated: int, - is_first_party: bool = None, - source: Path = None) -> None: - self.enter_step('set_hostname_pack') + def _set_domain(self, + hostname: bool, + domain_str: str, + updated: int, + is_first_party: bool = None, + source: Path = None) -> None: + self.enter_step('set_domain_pack') if is_first_party: raise NotImplementedError - self.enter_step('set_hostname_brws') - hostname = self.pack_domain(hostname_str) + domain = self.pack_domain(domain_str) + self.enter_step('set_domain_brws') dic = self.domtree - for part in hostname.path: - if dic.match_zone: - # Refuse to add hostname whose zone is already matching + for part in domain.path: + if dic.match_zone.active(): + # Refuse to add domain whose zone is already matching return if part not in dic.children: dic.children[part] = DomainTreeNode() dic = dic.children[part] - dic.match_hostname = (updated, source or RulePath(), 0) + if hostname: + match = dic.match_hostname + else: + match = dic.match_zone + match.set( + updated, + 0, # TODO Level + source or RulePath(), + ) + + def set_hostname(self, + *args: typing.Any, **kwargs: typing.Any + ) -> None: + self._set_domain(True, *args, **kwargs) def set_zone(self, - zone_str: str, - updated: int, - is_first_party: bool = None, - source: Path = None) -> None: - self.enter_step('set_zone_pack') - if is_first_party: - raise NotImplementedError - zone = self.pack_domain(zone_str) - self.enter_step('set_zone_brws') - dic = self.domtree - for part in zone.path: - if dic.match_zone: - # Refuse to add zone whose parent zone is already matching - return - if part not in dic.children: - dic.children[part] = DomainTreeNode() - dic = dic.children[part] - dic.match_zone = (updated, source or RulePath(), 0) + *args: typing.Any, **kwargs: typing.Any + ) -> None: + self._set_domain(False, *args, **kwargs) def set_asn(self, asn_str: str, updated: int, is_first_party: bool = None, source: Path = None) -> None: - self.enter_step('set_asn_pack') - if is_first_party or source: - # TODO updated + self.enter_step('set_asn') + if is_first_party: raise NotImplementedError - asn = self.pack_asn(asn_str) - self.enter_step('set_asn_brws') - self.asns.add(asn.asn) + path = self.pack_asn(asn_str) + match = AsnNode() + match.set( + updated, + 0, + source or RulePath() + ) + self.asns[path.asn] = match def _set_ip4(self, ip4: Ip4Path, @@ -407,7 +429,7 @@ class Database(Profiler): dic = self.ip4tree for i in reversed(range(ip4.prefixlen)): part = (ip4.value >> i) & 0b1 - if dic.match: + if dic.match.active(): # Refuse to add ip4* whose network is already matching return next_dic = dic.children[part] @@ -415,7 +437,11 @@ class Database(Profiler): next_dic = IpTreeNode() dic.children[part] = next_dic dic = next_dic - dic.match = (updated, source or RulePath(), 0) + dic.match.set( + updated, + 0, # TODO Level + source or RulePath(), + ) def set_ip4address(self, ip4address_str: str, diff --git a/feed_asn.py b/feed_asn.py index aa311f8..f34773f 100755 --- a/feed_asn.py +++ b/feed_asn.py @@ -40,7 +40,7 @@ if __name__ == '__main__': if parsed_prefix.version == 4: DB.set_ip4network( prefix, - # source=path, + source=path, updated=int(time.time()) ) log.info('Added %s from %s (%s)', prefix, 
asn_str, path) From 3197fa1663a4e099475afde662f5effbf7c7e58e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Mon, 16 Dec 2019 06:54:18 +0100 Subject: [PATCH 24/40] Remove list usage for IpTreeNode --- database.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/database.py b/database.py index 5522013..99ea3ad 100644 --- a/database.py +++ b/database.py @@ -88,7 +88,8 @@ class DomainTreeNode(): class IpTreeNode(): def __init__(self) -> None: - self.children: typing.List[typing.Optional[IpTreeNode]] = [None, None] + self.zero: typing.Optional[IpTreeNode] = None + self.one: typing.Optional[IpTreeNode] = None self.match = Match() @@ -131,7 +132,7 @@ class Profiler(): class Database(Profiler): - VERSION = 10 + VERSION = 11 PATH = "blocking.p" def initialize(self) -> None: @@ -258,7 +259,7 @@ class Database(Profiler): callback(_par, _dic, arg) # 0 - dic = _dic.children[0] + dic = _dic.zero if dic: addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-_par.prefixlen))) assert addr0 == _par.value @@ -269,7 +270,7 @@ class Database(Profiler): _par=Ip4Path(addr0, _par.prefixlen+1) ) # 1 - dic = _dic.children[1] + dic = _dic.one if dic: addr1 = _par.value | (1 << (32-_par.prefixlen)) yield from self.exec_each_ip4( @@ -351,7 +352,7 @@ class Database(Profiler): self.enter_step('get_ip4_yield') yield Ip4Path(ip4.value, 32-i) self.enter_step('get_ip4_brws') - next_dic = dic.children[part] + next_dic = dic.one if part else dic.zero if next_dic is None: return dic = next_dic @@ -432,10 +433,13 @@ class Database(Profiler): if dic.match.active(): # Refuse to add ip4* whose network is already matching return - next_dic = dic.children[part] + next_dic = dic.one if part else dic.zero if next_dic is None: next_dic = IpTreeNode() - dic.children[part] = next_dic + if part: + dic.one = next_dic + else: + dic.zero = next_dic dic = next_dic dic.match.set( updated, From 03a4042238af71fe75d94105ba8f5f3210dd8ba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Mon, 16 Dec 2019 09:31:29 +0100 Subject: [PATCH 25/40] Added level Also fixed IP logic because this was real messed up --- database.py | 223 +++++++++++++++++++++++++++++++++++++--------------- export.py | 8 +- feed_dns.py | 3 +- 3 files changed, 167 insertions(+), 67 deletions(-) diff --git a/database.py b/database.py index 99ea3ad..13f8876 100644 --- a/database.py +++ b/database.py @@ -26,38 +26,50 @@ class Path(): class RulePath(Path): - pass + def __str__(self) -> str: + return '(rules)' class DomainPath(Path): - def __init__(self, path: typing.List[str]): - self.path = path + def __init__(self, parts: typing.List[str]): + self.parts = parts + + def __str__(self) -> str: + return '?.' + Database.unpack_domain(self) class HostnamePath(DomainPath): - pass + def __str__(self) -> str: + return Database.unpack_domain(self) class ZonePath(DomainPath): - pass + def __str__(self) -> str: + return '*.' 
+ Database.unpack_domain(self) class AsnPath(Path): def __init__(self, asn: Asn): self.asn = asn + def __str__(self) -> str: + return Database.unpack_asn(self) + class Ip4Path(Path): def __init__(self, value: int, prefixlen: int): self.value = value self.prefixlen = prefixlen + def __str__(self) -> str: + return Database.unpack_ip4network(self) + class Match(): def __init__(self) -> None: self.updated: int = 0 self.level: int = 0 - self.source: Path = RulePath() + self.source: typing.Optional[Path] = None # FP dupplicate args def set(self, @@ -86,18 +98,18 @@ class DomainTreeNode(): self.match_hostname = Match() -class IpTreeNode(): +class IpTreeNode(Match): def __init__(self) -> None: + Match.__init__(self) self.zero: typing.Optional[IpTreeNode] = None self.one: typing.Optional[IpTreeNode] = None - self.match = Match() Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode] -NodeCallable = typing.Callable[[Path, - Node, - typing.Optional[typing.Any]], - typing.Any] +MatchCallable = typing.Callable[[Path, + Match, + typing.Optional[typing.Any]], + typing.Any] class Profiler(): @@ -109,7 +121,6 @@ class Profiler(): self.step_dict: typing.Dict[str, int] = dict() def enter_step(self, name: str) -> None: - return now = time.perf_counter() try: self.time_dict[self.time_step] += now - self.time_last @@ -132,7 +143,7 @@ class Profiler(): class Database(Profiler): - VERSION = 11 + VERSION = 13 PATH = "blocking.p" def initialize(self) -> None: @@ -181,7 +192,7 @@ class Database(Profiler): @staticmethod def unpack_domain(domain: DomainPath) -> str: - return '.'.join(domain.path[::-1]) + return '.'.join(domain.parts[::-1]) @staticmethod def pack_asn(asn: str) -> AsnPath: @@ -230,62 +241,107 @@ class Database(Profiler): addr >>= 8 return '.'.join(map(str, octets)) + '/' + str(network.prefixlen) + def get_match(self, path: Path) -> Match: + if isinstance(path, RulePath): + return Match() + elif isinstance(path, AsnPath): + return self.asns[path.asn] + elif isinstance(path, DomainPath): + dicd = self.domtree + for part in path.parts: + dicd = dicd.children[part] + if isinstance(path, HostnamePath): + return dicd.match_hostname + elif isinstance(path, ZonePath): + return dicd.match_zone + else: + raise ValueError + elif isinstance(path, Ip4Path): + dici = self.ip4tree + for i in range(31, 31-path.prefixlen, -1): + bit = (path.value >> i) & 0b1 + dici_next = dici.one if bit else dici.zero + if not dici_next: + raise IndexError + dici = dici_next + return dici + else: + raise ValueError + def exec_each_domain(self, - callback: NodeCallable, + callback: MatchCallable, arg: typing.Any = None, _dic: DomainTreeNode = None, _par: DomainPath = None, ) -> typing.Any: _dic = _dic or self.domtree _par = _par or DomainPath([]) - yield from callback(_par, _dic, arg) + if _dic.match_hostname.active(): + yield from callback( + HostnamePath(_par.parts), + _dic.match_hostname, + arg + ) + if _dic.match_zone.active(): + yield from callback( + ZonePath(_par.parts), + _dic.match_zone, + arg + ) for part in _dic.children: dic = _dic.children[part] yield from self.exec_each_domain( callback, arg, _dic=dic, - _par=DomainPath(_par.path + [part]) + _par=DomainPath(_par.parts + [part]) ) def exec_each_ip4(self, - callback: NodeCallable, + callback: MatchCallable, arg: typing.Any = None, _dic: IpTreeNode = None, _par: Ip4Path = None, ) -> typing.Any: _dic = _dic or self.ip4tree _par = _par or Ip4Path(0, 0) - callback(_par, _dic, arg) + if _dic.active(): + yield from callback( + _par, + _dic, + arg + ) # 0 + pref = _par.prefixlen 
+ 1 dic = _dic.zero if dic: - addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-_par.prefixlen))) + addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-pref))) assert addr0 == _par.value yield from self.exec_each_ip4( callback, arg, _dic=dic, - _par=Ip4Path(addr0, _par.prefixlen+1) + _par=Ip4Path(addr0, pref) ) # 1 dic = _dic.one if dic: - addr1 = _par.value | (1 << (32-_par.prefixlen)) + addr1 = _par.value | (1 << (32-pref)) yield from self.exec_each_ip4( callback, arg, _dic=dic, - _par=Ip4Path(addr1, _par.prefixlen+1) + _par=Ip4Path(addr1, pref) ) def exec_each(self, - callback: NodeCallable, + callback: MatchCallable, arg: typing.Any = None, ) -> typing.Any: yield from self.exec_each_domain(callback) yield from self.exec_each_ip4(callback) + # TODO ASN def update_references(self) -> None: raise NotImplementedError @@ -293,27 +349,47 @@ class Database(Profiler): def prune(self, before: int, base_only: bool = False) -> None: raise NotImplementedError - def explain(self, entry: int) -> str: - raise NotImplementedError + def explain(self, path: Path) -> str: + string = str(path) + match = self.get_match(path) + if match.source: + string += f' ← {self.explain(match.source)}' + return string def export(self, first_party_only: bool = False, end_chain_only: bool = False, explain: bool = False, ) -> typing.Iterable[str]: - if first_party_only or end_chain_only or explain: + if first_party_only or end_chain_only: raise NotImplementedError - def export_cb(path: Path, node: Node, _: typing.Any + def export_cb(path: Path, match: Match, _: typing.Any ) -> typing.Iterable[str]: assert isinstance(path, DomainPath) - assert isinstance(node, DomainTreeNode) - if node.match_hostname: - a = self.unpack_domain(path) - yield a + if isinstance(path, HostnamePath): + if explain: + yield self.explain(path) + else: + yield self.unpack_domain(path) yield from self.exec_each_domain(export_cb, None) + def list_rules(self, + first_party_only: bool = False, + ) -> typing.Iterable[str]: + if first_party_only: + raise NotImplementedError + + def list_rules_cb(path: Path, match: Match, _: typing.Any + ) -> typing.Iterable[str]: + if isinstance(path, ZonePath) \ + or (isinstance(path, Ip4Path) and path.prefixlen < 32): + # if match.level == 0: + yield self.explain(path) + + yield from self.exec_each(list_rules_cb, None) + def count_rules(self, first_party_only: bool = False, ) -> str: @@ -325,10 +401,10 @@ class Database(Profiler): self.enter_step('get_domain_brws') dic = self.domtree depth = 0 - for part in domain.path: + for part in domain.parts: if dic.match_zone.active(): self.enter_step('get_domain_yield') - yield ZonePath(domain.path[:depth]) + yield ZonePath(domain.parts[:depth]) self.enter_step('get_domain_brws') if part not in dic.children: return @@ -336,27 +412,28 @@ class Database(Profiler): depth += 1 if dic.match_zone.active(): self.enter_step('get_domain_yield') - yield ZonePath(domain.path) + yield ZonePath(domain.parts) if dic.match_hostname.active(): self.enter_step('get_domain_yield') - yield HostnamePath(domain.path) + yield HostnamePath(domain.parts) def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]: self.enter_step('get_ip4_pack') ip4 = self.pack_ip4address(ip4_str) self.enter_step('get_ip4_brws') dic = self.ip4tree - for i in reversed(range(ip4.prefixlen)): - part = (ip4.value >> i) & 0b1 - if dic.match.active(): + for i in range(31, 31-ip4.prefixlen, -1): + bit = (ip4.value >> i) & 0b1 + if dic.active(): self.enter_step('get_ip4_yield') - yield Ip4Path(ip4.value, 32-i) - self.enter_step('get_ip4_brws') 
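                # ip4.value >> (i+1) << (i+1) clears the low i+1 bits, so the
                # yielded Ip4Path is the ancestor network of length 31-i that
                # carries the match, instead of the full address with an
                # off-by-one prefix length as before.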
- next_dic = dic.one if part else dic.zero + a = Ip4Path(ip4.value >> (i+1) << (i+1), 31-i) + yield a + self.enter_step('get_ip4_brws') + next_dic = dic.one if bit else dic.zero if next_dic is None: return dic = next_dic - if dic.match.active(): + if dic.active(): self.enter_step('get_ip4_yield') yield ip4 @@ -374,9 +451,16 @@ class Database(Profiler): if is_first_party: raise NotImplementedError domain = self.pack_domain(domain_str) + self.enter_step('set_domain_src') + if source is None: + level = 0 + source = RulePath() + else: + match = self.get_match(source) + level = match.level + 1 self.enter_step('set_domain_brws') dic = self.domtree - for part in domain.path: + for part in domain.parts: if dic.match_zone.active(): # Refuse to add domain whose zone is already matching return @@ -389,8 +473,8 @@ class Database(Profiler): match = dic.match_zone match.set( updated, - 0, # TODO Level - source or RulePath(), + level, + source, ) def set_hostname(self, @@ -411,14 +495,23 @@ class Database(Profiler): self.enter_step('set_asn') if is_first_party: raise NotImplementedError + if source is None: + level = 0 + source = RulePath() + else: + match = self.get_match(source) + level = match.level + 1 path = self.pack_asn(asn_str) - match = AsnNode() + if path.asn in self.asns: + match = self.asns[path.asn] + else: + match = AsnNode() + self.asns[path.asn] = match match.set( - updated, - 0, - source or RulePath() + updated, + level, + source, ) - self.asns[path.asn] = match def _set_ip4(self, ip4: Ip4Path, @@ -427,24 +520,32 @@ class Database(Profiler): source: Path = None) -> None: if is_first_party: raise NotImplementedError + self.enter_step('set_ip4_src') + if source is None: + level = 0 + source = RulePath() + else: + match = self.get_match(source) + level = match.level + 1 + self.enter_step('set_ip4_brws') dic = self.ip4tree - for i in reversed(range(ip4.prefixlen)): - part = (ip4.value >> i) & 0b1 - if dic.match.active(): + for i in range(31, 31-ip4.prefixlen, -1): + bit = (ip4.value >> i) & 0b1 + if dic.active(): # Refuse to add ip4* whose network is already matching return - next_dic = dic.one if part else dic.zero + next_dic = dic.one if bit else dic.zero if next_dic is None: next_dic = IpTreeNode() - if part: + if bit: dic.one = next_dic else: dic.zero = next_dic dic = next_dic - dic.match.set( + dic.set( updated, - 0, # TODO Level - source or RulePath(), + level, + source, ) def set_ip4address(self, @@ -453,7 +554,6 @@ class Database(Profiler): ) -> None: self.enter_step('set_ip4add_pack') ip4 = self.pack_ip4address(ip4address_str) - self.enter_step('set_ip4add_brws') self._set_ip4(ip4, *args, **kwargs) def set_ip4network(self, @@ -462,5 +562,4 @@ class Database(Profiler): ) -> None: self.enter_step('set_ip4net_pack') ip4 = self.pack_ip4network(ip4network_str) - self.enter_step('set_ip4net_brws') self._set_ip4(ip4, *args, **kwargs) diff --git a/export.py b/export.py index bca3281..0df4229 100755 --- a/export.py +++ b/export.py @@ -33,9 +33,11 @@ if __name__ == '__main__': DB = database.Database() if args.rules: - if not args.count: - raise NotImplementedError - print(DB.count_rules(first_party_only=args.first_party)) + if args.count: + print(DB.count_rules(first_party_only=args.first_party)) + else: + for line in DB.list_rules(): + print(line) else: if args.count: raise NotImplementedError diff --git a/feed_dns.py b/feed_dns.py index be08e98..43df1fd 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -51,8 +51,7 @@ class Writer(multiprocessing.Process): try: for source in select(self.db, 
value): - # write(self.db, name, updated, source=source) - write(self.db, name, updated) + write(self.db, name, updated, source=source) except ValueError: self.log.exception("Cannot execute: %s", record) From c3bf102289ca382a8042fbe3678e8256c9afb09d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Mon, 16 Dec 2019 14:18:03 +0100 Subject: [PATCH 26/40] Made references work --- database.py | 158 +++++++++++++++++++++++++++++++--------------------- feed_asn.py | 8 ++- 2 files changed, 103 insertions(+), 63 deletions(-) diff --git a/database.py b/database.py index 13f8876..0828691 100644 --- a/database.py +++ b/database.py @@ -70,19 +70,9 @@ class Match(): self.updated: int = 0 self.level: int = 0 self.source: typing.Optional[Path] = None + self.references: int = 0 # FP dupplicate args - def set(self, - updated: int, - level: int, - source: Path, - ) -> None: - if updated > self.updated or level > self.level: - self.updated = updated - self.level = level - self.source = source - # FP dupplicate function - def active(self) -> bool: return self.updated > 0 @@ -143,7 +133,7 @@ class Profiler(): class Database(Profiler): - VERSION = 13 + VERSION = 14 PATH = "blocking.p" def initialize(self) -> None: @@ -268,6 +258,24 @@ class Database(Profiler): else: raise ValueError + def exec_each_asn(self, + callback: MatchCallable, + arg: typing.Any = None, + ) -> typing.Any: + for asn in self.asns: + match = self.asns[asn] + if match.active(): + c = callback( + AsnPath(asn), + match, + arg + ) + try: + yield from c + except TypeError: # not iterable + pass + + def exec_each_domain(self, callback: MatchCallable, arg: typing.Any = None, @@ -277,17 +285,25 @@ class Database(Profiler): _dic = _dic or self.domtree _par = _par or DomainPath([]) if _dic.match_hostname.active(): - yield from callback( + c = callback( HostnamePath(_par.parts), _dic.match_hostname, arg ) + try: + yield from c + except TypeError: # not iterable + pass if _dic.match_zone.active(): - yield from callback( + c = callback( ZonePath(_par.parts), _dic.match_zone, arg ) + try: + yield from c + except TypeError: # not iterable + pass for part in _dic.children: dic = _dic.children[part] yield from self.exec_each_domain( @@ -306,11 +322,15 @@ class Database(Profiler): _dic = _dic or self.ip4tree _par = _par or Ip4Path(0, 0) if _dic.active(): - yield from callback( + c = callback( _par, _dic, arg ) + try: + yield from c + except TypeError: # not iterable + pass # 0 pref = _par.prefixlen + 1 @@ -341,17 +361,35 @@ class Database(Profiler): ) -> typing.Any: yield from self.exec_each_domain(callback) yield from self.exec_each_ip4(callback) - # TODO ASN + yield from self.exec_each_asn(callback) def update_references(self) -> None: - raise NotImplementedError + # Should be correctly calculated normally, + # keeping this just in case + def reset_references_cb(path: Path, + match: Match, _: typing.Any + ) -> None: + match.references = 0 + for _ in self.exec_each(reset_references_cb, None): + pass + + def increment_references_cb(path: Path, + match: Match, _: typing.Any + ) -> None: + if match.source: + source = self.get_match(match.source) + source.references += 1 + for _ in self.exec_each(increment_references_cb, None): + pass def prune(self, before: int, base_only: bool = False) -> None: raise NotImplementedError def explain(self, path: Path) -> str: - string = str(path) match = self.get_match(path) + string = f'{path}' + if not isinstance(path, RulePath): + string += f' #{match.references}' if 
match.source: string += f' ← {self.explain(match.source)}' return string @@ -361,17 +399,20 @@ class Database(Profiler): end_chain_only: bool = False, explain: bool = False, ) -> typing.Iterable[str]: - if first_party_only or end_chain_only: + if first_party_only: raise NotImplementedError def export_cb(path: Path, match: Match, _: typing.Any ) -> typing.Iterable[str]: assert isinstance(path, DomainPath) - if isinstance(path, HostnamePath): - if explain: - yield self.explain(path) - else: - yield self.unpack_domain(path) + if not isinstance(path, HostnamePath): + return + if end_chain_only and match.references > 0: + return + if explain: + yield self.explain(path) + else: + yield self.unpack_domain(path) yield from self.exec_each_domain(export_cb, None) @@ -437,9 +478,22 @@ class Database(Profiler): self.enter_step('get_ip4_yield') yield ip4 - def list_asn(self) -> typing.Iterable[AsnPath]: - for asn in self.asns: - yield AsnPath(asn) + def set_match(self, + match: Match, + updated: int, + source: Path, + ) -> None: + new_source = self.get_match(source) + new_level = new_source.level + 1 + if updated > match.updated or new_level > match.level: + if match.source: + old_source = self.get_match(match.source) + old_source.references -= 1 + match.updated = updated + match.level = new_level + match.source = source + new_source.references += 1 + # FP dupplicate function def _set_domain(self, hostname: bool, @@ -451,30 +505,23 @@ class Database(Profiler): if is_first_party: raise NotImplementedError domain = self.pack_domain(domain_str) - self.enter_step('set_domain_src') - if source is None: - level = 0 - source = RulePath() - else: - match = self.get_match(source) - level = match.level + 1 self.enter_step('set_domain_brws') dic = self.domtree for part in domain.parts: - if dic.match_zone.active(): - # Refuse to add domain whose zone is already matching - return if part not in dic.children: dic.children[part] = DomainTreeNode() dic = dic.children[part] + if dic.match_zone.active(): + # Refuse to add domain whose zone is already matching + return if hostname: match = dic.match_hostname else: match = dic.match_zone - match.set( + self.set_match( + match, updated, - level, - source, + source or RulePath(), ) def set_hostname(self, @@ -495,22 +542,16 @@ class Database(Profiler): self.enter_step('set_asn') if is_first_party: raise NotImplementedError - if source is None: - level = 0 - source = RulePath() - else: - match = self.get_match(source) - level = match.level + 1 path = self.pack_asn(asn_str) if path.asn in self.asns: match = self.asns[path.asn] else: match = AsnNode() self.asns[path.asn] = match - match.set( + self.set_match( + match, updated, - level, - source, + source or RulePath(), ) def _set_ip4(self, @@ -520,20 +561,10 @@ class Database(Profiler): source: Path = None) -> None: if is_first_party: raise NotImplementedError - self.enter_step('set_ip4_src') - if source is None: - level = 0 - source = RulePath() - else: - match = self.get_match(source) - level = match.level + 1 self.enter_step('set_ip4_brws') dic = self.ip4tree for i in range(31, 31-ip4.prefixlen, -1): bit = (ip4.value >> i) & 0b1 - if dic.active(): - # Refuse to add ip4* whose network is already matching - return next_dic = dic.one if bit else dic.zero if next_dic is None: next_dic = IpTreeNode() @@ -542,10 +573,13 @@ class Database(Profiler): else: dic.zero = next_dic dic = next_dic - dic.set( + if dic.active(): + # Refuse to add ip4* whose network is already matching + return + self.set_match( + dic, updated, - level, - 
source, + source or RulePath(), ) def set_ip4address(self, diff --git a/feed_asn.py b/feed_asn.py index f34773f..fbdefcd 100755 --- a/feed_asn.py +++ b/feed_asn.py @@ -32,7 +32,10 @@ if __name__ == '__main__': DB = database.Database() - for path in DB.list_asn(): + def add_ranges(path: database.Path, + match: database.Match, + _: typing.Any) -> None: + assert isinstance(path, database.AsnPath) asn_str = database.Database.unpack_asn(path) DB.enter_step('asn_get_ranges') for prefix in get_ranges(asn_str): @@ -49,4 +52,7 @@ if __name__ == '__main__': else: log.error('Unknown prefix version: %s', prefix) + for _ in DB.exec_each_asn(add_ranges, None): + pass + DB.save() From 8f6e01c857fff21f4e51061f8f7afcc1870f0a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Mon, 16 Dec 2019 19:07:35 +0100 Subject: [PATCH 27/40] Added first_party tracking Well, tracking if a rule is from a first or a multi rule... Hope I did not do any mistake --- database.py | 135 +++++++++++++++++++++++++++++------------------- feed_rules.py | 8 ++- import_rules.sh | 10 ++-- 3 files changed, 93 insertions(+), 60 deletions(-) diff --git a/database.py b/database.py index 0828691..3fc93c5 100644 --- a/database.py +++ b/database.py @@ -27,7 +27,17 @@ class Path(): class RulePath(Path): def __str__(self) -> str: - return '(rules)' + return '(rule)' + + +class RuleFirstPath(RulePath): + def __str__(self) -> str: + return '(first-party rule)' + + +class RuleMultiPath(RulePath): + def __str__(self) -> str: + return '(multi-party rule)' class DomainPath(Path): @@ -67,14 +77,18 @@ class Ip4Path(Path): class Match(): def __init__(self) -> None: - self.updated: int = 0 - self.level: int = 0 self.source: typing.Optional[Path] = None - self.references: int = 0 - # FP dupplicate args + self.updated: int = 0 - def active(self) -> bool: - return self.updated > 0 + # Cache + self.level: int = 0 + self.first_party: bool = False + self.references: int = 0 + + def active(self, first_party: bool = None) -> bool: + if self.updated == 0 or (first_party and not self.first_party): + return False + return True class AsnNode(Match): @@ -133,13 +147,21 @@ class Profiler(): class Database(Profiler): - VERSION = 14 + VERSION = 17 PATH = "blocking.p" def initialize(self) -> None: self.log.warning( "Creating database version: %d ", Database.VERSION) + # Dummy match objects that everything refer to + self.rules: typing.List[Match] = list() + for first_party in (False, True): + m = Match() + m.updated = 1 + m.level = 0 + m.first_party = first_party + self.rules.append(m) self.domtree = DomainTreeNode() self.asns: typing.Dict[Asn, AsnNode] = dict() self.ip4tree = IpTreeNode() @@ -150,7 +172,7 @@ class Database(Profiler): with open(self.PATH, 'rb') as db_fdsec: version, data = pickle.load(db_fdsec) if version == Database.VERSION: - self.domtree, self.asns, self.ip4tree = data + self.rules, self.domtree, self.asns, self.ip4tree = data return self.log.warning( "Outdated database version found: %d, " @@ -167,7 +189,7 @@ class Database(Profiler): def save(self) -> None: self.enter_step('save') with open(self.PATH, 'wb') as db_fdsec: - data = self.domtree, self.asns, self.ip4tree + data = self.rules, self.domtree, self.asns, self.ip4tree pickle.dump((self.VERSION, data), db_fdsec) self.profile() @@ -232,8 +254,10 @@ class Database(Profiler): return '.'.join(map(str, octets)) + '/' + str(network.prefixlen) def get_match(self, path: Path) -> Match: - if isinstance(path, RulePath): - return Match() + if 
isinstance(path, RuleMultiPath): + return self.rules[0] + elif isinstance(path, RuleFirstPath): + return self.rules[1] elif isinstance(path, AsnPath): return self.asns[path.asn] elif isinstance(path, DomainPath): @@ -275,7 +299,6 @@ class Database(Profiler): except TypeError: # not iterable pass - def exec_each_domain(self, callback: MatchCallable, arg: typing.Any = None, @@ -374,8 +397,8 @@ class Database(Profiler): pass def increment_references_cb(path: Path, - match: Match, _: typing.Any - ) -> None: + match: Match, _: typing.Any + ) -> None: if match.source: source = self.get_match(match.source) source.references += 1 @@ -387,9 +410,7 @@ class Database(Profiler): def explain(self, path: Path) -> str: match = self.get_match(path) - string = f'{path}' - if not isinstance(path, RulePath): - string += f' #{match.references}' + string = f'{path} #{match.references}' if match.source: string += f' ← {self.explain(match.source)}' return string @@ -399,14 +420,14 @@ class Database(Profiler): end_chain_only: bool = False, explain: bool = False, ) -> typing.Iterable[str]: - if first_party_only: - raise NotImplementedError def export_cb(path: Path, match: Match, _: typing.Any ) -> typing.Iterable[str]: assert isinstance(path, DomainPath) if not isinstance(path, HostnamePath): return + if first_party_only and not match.first_party: + return if end_chain_only and match.references > 0: return if explain: @@ -419,11 +440,11 @@ class Database(Profiler): def list_rules(self, first_party_only: bool = False, ) -> typing.Iterable[str]: - if first_party_only: - raise NotImplementedError def list_rules_cb(path: Path, match: Match, _: typing.Any ) -> typing.Iterable[str]: + if first_party_only and not match.first_party: + return if isinstance(path, ZonePath) \ or (isinstance(path, Ip4Path) and path.prefixlen < 32): # if match.level == 0: @@ -465,10 +486,10 @@ class Database(Profiler): dic = self.ip4tree for i in range(31, 31-ip4.prefixlen, -1): bit = (ip4.value >> i) & 0b1 + # TODO PERF copy value and slide once every loop if dic.active(): self.enter_step('get_ip4_yield') - a = Ip4Path(ip4.value >> (i+1) << (i+1), 31-i) - yield a + yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i) self.enter_step('get_ip4_brws') next_dic = dic.one if bit else dic.zero if next_dic is None: @@ -478,50 +499,58 @@ class Database(Profiler): self.enter_step('get_ip4_yield') yield ip4 - def set_match(self, - match: Match, - updated: int, - source: Path, - ) -> None: - new_source = self.get_match(source) - new_level = new_source.level + 1 - if updated > match.updated or new_level > match.level: + def _set_match(self, + match: Match, + updated: int, + source: Path, + source_match: Match = None, + ) -> None: + # source_match is in parameters because most of the time + # its parent function needs it too, + # so it can pass it to save a traversal + source_match = source_match or self.get_match(source) + new_level = source_match.level + 1 + if updated > match.updated or new_level < match.level \ + or source_match.first_party > match.first_party: + # NOTE FP and level of matches referencing this one + # won't be updated until run or prune if match.source: old_source = self.get_match(match.source) old_source.references -= 1 match.updated = updated match.level = new_level + match.first_party = source_match.first_party match.source = source - new_source.references += 1 - # FP dupplicate function + source_match.references += 1 def _set_domain(self, hostname: bool, domain_str: str, updated: int, - is_first_party: bool = None, - source: Path = 
None) -> None: + source: Path) -> None: self.enter_step('set_domain_pack') - if is_first_party: - raise NotImplementedError domain = self.pack_domain(domain_str) + self.enter_step('set_domain_fp') + source_match = self.get_match(source) + is_first_party = source_match.first_party self.enter_step('set_domain_brws') dic = self.domtree for part in domain.parts: if part not in dic.children: dic.children[part] = DomainTreeNode() dic = dic.children[part] - if dic.match_zone.active(): + if dic.match_zone.active(is_first_party): # Refuse to add domain whose zone is already matching return if hostname: match = dic.match_hostname else: match = dic.match_zone - self.set_match( + self._set_match( match, updated, - source or RulePath(), + source, + source_match=source_match, ) def set_hostname(self, @@ -537,30 +566,27 @@ class Database(Profiler): def set_asn(self, asn_str: str, updated: int, - is_first_party: bool = None, - source: Path = None) -> None: + source: Path) -> None: self.enter_step('set_asn') - if is_first_party: - raise NotImplementedError path = self.pack_asn(asn_str) if path.asn in self.asns: match = self.asns[path.asn] else: match = AsnNode() self.asns[path.asn] = match - self.set_match( + self._set_match( match, updated, - source or RulePath(), + source, ) def _set_ip4(self, ip4: Ip4Path, updated: int, - is_first_party: bool = None, - source: Path = None) -> None: - if is_first_party: - raise NotImplementedError + source: Path) -> None: + self.enter_step('set_ip4_fp') + source_match = self.get_match(source) + is_first_party = source_match.first_party self.enter_step('set_ip4_brws') dic = self.ip4tree for i in range(31, 31-ip4.prefixlen, -1): @@ -573,13 +599,14 @@ class Database(Profiler): else: dic.zero = next_dic dic = next_dic - if dic.active(): + if dic.active(is_first_party): # Refuse to add ip4* whose network is already matching return - self.set_match( + self._set_match( dic, updated, - source or RulePath(), + source, + source_match=source_match, ) def set_ip4address(self, diff --git a/feed_rules.py b/feed_rules.py index cca1261..2b5596e 100755 --- a/feed_rules.py +++ b/feed_rules.py @@ -32,10 +32,16 @@ if __name__ == '__main__': fun = FUNCTION_MAP[args.type] + source: database.RulePath + if args.first_party: + source = database.RuleFirstPath() + else: + source = database.RuleMultiPath() + for rule in args.input: fun(DB, rule.strip(), - # is_first_party=args.first_party, + source=source, updated=int(time.time()), ) diff --git a/import_rules.sh b/import_rules.sh index cdeec93..14c8c78 100755 --- a/import_rules.sh +++ b/import_rules.sh @@ -6,11 +6,11 @@ function log() { log "Importing rules…" BEFORE="$(date +%s)" -# cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone -# cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone -# cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone -# cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network -# cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn +cat rules_adblock/*.txt | grep -v '^!' 
| grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone +cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone +cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone +cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network +cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party From 7851b038f5104cd5b666b84c6eb379547077e300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Tue, 17 Dec 2019 13:29:02 +0100 Subject: [PATCH 28/40] Reworked rule export --- database.py | 68 +++++++++++++++---------- export.py | 13 +++-- filter_subdomains.sh => export_lists.sh | 22 +++++--- feed_asn.py | 16 +++++- 4 files changed, 75 insertions(+), 44 deletions(-) rename filter_subdomains.sh => export_lists.sh (81%) diff --git a/database.py b/database.py index 3fc93c5..bff6638 100644 --- a/database.py +++ b/database.py @@ -92,7 +92,9 @@ class Match(): class AsnNode(Match): - pass + def __init__(self) -> None: + Match.__init__(self) + self.name = '' class DomainTreeNode(): @@ -111,8 +113,7 @@ class IpTreeNode(Match): Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode] MatchCallable = typing.Callable[[Path, - Match, - typing.Optional[typing.Any]], + Match], typing.Any] @@ -284,7 +285,6 @@ class Database(Profiler): def exec_each_asn(self, callback: MatchCallable, - arg: typing.Any = None, ) -> typing.Any: for asn in self.asns: match = self.asns[asn] @@ -292,7 +292,6 @@ class Database(Profiler): c = callback( AsnPath(asn), match, - arg ) try: yield from c @@ -301,7 +300,6 @@ class Database(Profiler): def exec_each_domain(self, callback: MatchCallable, - arg: typing.Any = None, _dic: DomainTreeNode = None, _par: DomainPath = None, ) -> typing.Any: @@ -311,7 +309,6 @@ class Database(Profiler): c = callback( HostnamePath(_par.parts), _dic.match_hostname, - arg ) try: yield from c @@ -321,7 +318,6 @@ class Database(Profiler): c = callback( ZonePath(_par.parts), _dic.match_zone, - arg ) try: yield from c @@ -331,14 +327,12 @@ class Database(Profiler): dic = _dic.children[part] yield from self.exec_each_domain( callback, - arg, _dic=dic, _par=DomainPath(_par.parts + [part]) ) def exec_each_ip4(self, callback: MatchCallable, - arg: typing.Any = None, _dic: IpTreeNode = None, _par: Ip4Path = None, ) -> typing.Any: @@ -348,7 +342,6 @@ class Database(Profiler): c = callback( _par, _dic, - arg ) try: yield from c @@ -363,7 +356,6 @@ class Database(Profiler): assert addr0 == _par.value yield from self.exec_each_ip4( callback, - arg, _dic=dic, _par=Ip4Path(addr0, pref) ) @@ -373,14 +365,12 @@ class Database(Profiler): addr1 = _par.value | (1 << (32-pref)) yield from self.exec_each_ip4( callback, - arg, _dic=dic, _par=Ip4Path(addr1, pref) ) def exec_each(self, callback: MatchCallable, - arg: typing.Any = None, ) -> typing.Any: yield from self.exec_each_domain(callback) yield from self.exec_each_ip4(callback) @@ -390,19 +380,19 @@ class Database(Profiler): # Should be correctly calculated normally, # keeping this just in case def reset_references_cb(path: Path, - match: Match, _: typing.Any + match: Match ) -> None: match.references = 0 - for _ in self.exec_each(reset_references_cb, None): + for _ in self.exec_each(reset_references_cb): pass def 
increment_references_cb(path: Path, - match: Match, _: typing.Any + match: Match ) -> None: if match.source: source = self.get_match(match.source) source.references += 1 - for _ in self.exec_each(increment_references_cb, None): + for _ in self.exec_each(increment_references_cb): pass def prune(self, before: int, base_only: bool = False) -> None: @@ -410,7 +400,10 @@ class Database(Profiler): def explain(self, path: Path) -> str: match = self.get_match(path) - string = f'{path} #{match.references}' + if isinstance(match, AsnNode): + string = f'{path} ({match.name}) #{match.references}' + else: + string = f'{path} #{match.references}' if match.source: string += f' ← {self.explain(match.source)}' return string @@ -421,7 +414,7 @@ class Database(Profiler): explain: bool = False, ) -> typing.Iterable[str]: - def export_cb(path: Path, match: Match, _: typing.Any + def export_cb(path: Path, match: Match ) -> typing.Iterable[str]: assert isinstance(path, DomainPath) if not isinstance(path, HostnamePath): @@ -435,27 +428,49 @@ class Database(Profiler): else: yield self.unpack_domain(path) - yield from self.exec_each_domain(export_cb, None) + yield from self.exec_each_domain(export_cb) def list_rules(self, first_party_only: bool = False, ) -> typing.Iterable[str]: - def list_rules_cb(path: Path, match: Match, _: typing.Any + def list_rules_cb(path: Path, match: Match ) -> typing.Iterable[str]: if first_party_only and not match.first_party: return if isinstance(path, ZonePath) \ or (isinstance(path, Ip4Path) and path.prefixlen < 32): - # if match.level == 0: + # if match.level == 1: + # It should be the latter condition but it is more + # useful when using the former yield self.explain(path) - yield from self.exec_each(list_rules_cb, None) + yield from self.exec_each(list_rules_cb) - def count_rules(self, + def count_records(self, first_party_only: bool = False, + rules_only: bool = False, ) -> str: - raise NotImplementedError + memo: typing.Dict[str, int] = dict() + + def count_records_cb(path: Path, match: Match) -> None: + if first_party_only and not match.first_party: + return + # if isinstance(path, ZonePath) \ + # or (isinstance(path, Ip4Path) and path.prefixlen < 32): + if rules_only and match.level > 1: + return + try: + memo[path.__class__.__name__] += 1 + except KeyError: + memo[path.__class__.__name__] = 1 + + for _ in self.exec_each(count_records_cb): + pass + split: typing.List[str] = list() + for key, value in sorted(memo.items(), key=lambda s: s[0]): + split.append(f'{key[:-4]}: {value}') + return ', '.join(split) def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]: self.enter_step('get_domain_pack') @@ -486,7 +501,6 @@ class Database(Profiler): dic = self.ip4tree for i in range(31, 31-ip4.prefixlen, -1): bit = (ip4.value >> i) & 0b1 - # TODO PERF copy value and slide once every loop if dic.active(): self.enter_step('get_ip4_yield') yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i) diff --git a/export.py b/export.py index 0df4229..91f7193 100755 --- a/export.py +++ b/export.py @@ -32,15 +32,14 @@ if __name__ == '__main__': DB = database.Database() - if args.rules: - if args.count: - print(DB.count_rules(first_party_only=args.first_party)) - else: + if args.count: + print(DB.count_records( + first_party_only=args.first_party, + rules_only=args.rules)) + else: + if args.rules: for line in DB.list_rules(): print(line) - else: - if args.count: - raise NotImplementedError for domain in DB.export( first_party_only=args.first_party, end_chain_only=args.end_chain, diff --git 
a/filter_subdomains.sh b/export_lists.sh similarity index 81% rename from filter_subdomains.sh rename to export_lists.sh index d4b90ae..20a34cb 100755 --- a/filter_subdomains.sh +++ b/export_lists.sh @@ -4,21 +4,25 @@ function log() { echo -e "\033[33m$@\033[0m" } -log "Pruning old data…" -./database.py --prune - -log "Recounting references…" -./database.py --references - log "Exporting lists…" ./export.py --first-party --output dist/firstparty-trackers.txt ./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt ./export.py --output dist/multiparty-trackers.txt ./export.py --end-chain --output dist/multiparty-only-trackers.txt -log "Generating hosts lists…" +log "Generating statistics…" +./export.py --count --first-party > temp/count_recs_firstparty.txt +./export.py --count > temp/count_recs_multiparty.txt ./export.py --rules --count --first-party > temp/count_rules_firstparty.txt ./export.py --rules --count > temp/count_rules_multiparty.txt + +log "Sorting lists…" +sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt +sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt +sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt +sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt + +log "Generating hosts lists…" function generate_hosts { basename="$1" description="$2" @@ -46,13 +50,15 @@ function generate_hosts { echo "# Generation software: eulaurarien $(git describe --tags)" echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" - echo "# Number of source DNS records: ~2M + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" + echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO echo "#" echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)" + echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)" echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" echo "#" echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)" + echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)" echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" echo diff --git a/feed_asn.py b/feed_asn.py index fbdefcd..aa28dfe 100755 --- a/feed_asn.py +++ b/feed_asn.py @@ -21,6 +21,15 @@ def get_ranges(asn: str) -> typing.Iterable[str]: yield pref['prefix'] +def get_name(asn: str) -> str: + req = requests.get( + 'https://stat.ripe.net/data/as-overview/data.json', + params={'resource': asn} + ) + data = req.json() + return data['data']['holder'] + + if __name__ == '__main__': log = logging.getLogger('feed_asn') @@ -34,9 +43,12 @@ if __name__ == '__main__': def add_ranges(path: database.Path, match: database.Match, - _: typing.Any) -> None: + ) -> None: assert isinstance(path, database.AsnPath) + assert isinstance(match, database.AsnNode) asn_str = database.Database.unpack_asn(path) + DB.enter_step('asn_get_name') + match.name = get_name(asn_str) DB.enter_step('asn_get_ranges') for prefix in get_ranges(asn_str): parsed_prefix: IPNetwork = ipaddress.ip_network(prefix) @@ -52,7 +64,7 @@ if __name__ == '__main__': else: 
log.error('Unknown prefix version: %s', prefix) - for _ in DB.exec_each_asn(add_ranges, None): + for _ in DB.exec_each_asn(add_ranges): pass DB.save() From ea0855bd00949004957bdc70c8441b106a4fda01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Tue, 17 Dec 2019 13:50:39 +0100 Subject: [PATCH 29/40] Forgot to push this little guy Good thing I cleaned up my working directory. It only exists because pickles created from database.py itself won't be openable from a file simply importing databse.py. So we create it when in 'imported state'. --- db.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100755 db.py diff --git a/db.py b/db.py new file mode 100755 index 0000000..4ecec6b --- /dev/null +++ b/db.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +import argparse +import database +import time +import os + +if __name__ == '__main__': + + # Parsing arguments + parser = argparse.ArgumentParser( + description="Database operations") + parser.add_argument( + '-i', '--initialize', action='store_true', + help="Reconstruct the whole database") + parser.add_argument( + '-p', '--prune', action='store_true', + help="Remove old entries from database") + parser.add_argument( + '-b', '--prune-base', action='store_true', + help="TODO") + parser.add_argument( + '-s', '--prune-before', type=int, + default=(int(time.time()) - 60*60*24*31*6), + help="TODO") + parser.add_argument( + '-r', '--references', action='store_true', + help="Update the reference count") + args = parser.parse_args() + + if not args.initialize: + DB = database.Database() + else: + if os.path.isfile(database.Database.PATH): + os.unlink(database.Database.PATH) + DB = database.Database() + + DB.enter_step('main') + if args.prune: + DB.prune(before=args.prune_before, base_only=args.prune_base) + if args.references: + DB.update_references() + + DB.save() From d65107f849b7c8d6d572aa91994de899f84e138d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Tue, 17 Dec 2019 14:09:06 +0100 Subject: [PATCH 30/40] Save dupplicates too Maybe I won't publish them but this will help me for tracking trackers. 
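
In other words: entries whose parent zone or network already has an active match used to be
silently dropped, and are now stored with a dupplicate flag instead, so exports can still skip
them while the chain information is kept. A rough sketch of the idea, heavily simplified from
the real DomainTreeNode/IpTreeNode walk in database.py (illustrative only, not the diff that
follows; Node and set_domain here are made-up stand-ins for the real classes):

    # Sketch only: simplified dupplicate flagging, not the actual implementation.
    import typing

    class Match:
        def __init__(self) -> None:
            self.updated: int = 0
            self.dupplicate: bool = False

        def active(self) -> bool:
            return self.updated > 0

    class Node:
        def __init__(self) -> None:
            self.match = Match()
            self.children: typing.Dict[str, 'Node'] = dict()

    def set_domain(root: Node, parts: typing.List[str], updated: int) -> None:
        dic = root
        dupplicate = False
        for part in parts:
            if part not in dic.children:
                dic.children[part] = Node()
            dic = dic.children[part]
            if dic.match.active():
                dupplicate = True  # previously: return (the entry was dropped here)
        dic.match.updated = updated
        dic.match.dupplicate = dupplicate  # kept, but exports may filter it out

This way a no-dupplicates switch at export time reproduces the old behaviour, while the
count statistics can still see how many entries were shadowed by an existing rule.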
--- database.py | 29 ++++++---- export.py | 8 ++- export_lists.sh | 4 +- feed_asn.py | 5 +- feed_dns.old.py | 147 ------------------------------------------------ feed_dns.py | 4 +- 6 files changed, 33 insertions(+), 164 deletions(-) delete mode 100755 feed_dns.old.py diff --git a/database.py b/database.py index bff6638..6e4ca3a 100644 --- a/database.py +++ b/database.py @@ -79,6 +79,7 @@ class Match(): def __init__(self) -> None: self.source: typing.Optional[Path] = None self.updated: int = 0 + self.dupplicate: bool = False # Cache self.level: int = 0 @@ -148,7 +149,7 @@ class Profiler(): class Database(Profiler): - VERSION = 17 + VERSION = 18 PATH = "blocking.p" def initialize(self) -> None: @@ -411,6 +412,7 @@ class Database(Profiler): def export(self, first_party_only: bool = False, end_chain_only: bool = False, + no_dupplicates: bool = False, explain: bool = False, ) -> typing.Iterable[str]: @@ -423,6 +425,8 @@ class Database(Profiler): return if end_chain_only and match.references > 0: return + if no_dupplicates and match.dupplicate: + return if explain: yield self.explain(path) else: @@ -448,18 +452,19 @@ class Database(Profiler): yield from self.exec_each(list_rules_cb) def count_records(self, - first_party_only: bool = False, - rules_only: bool = False, - ) -> str: + first_party_only: bool = False, + rules_only: bool = False, + no_dupplicates: bool = False, + ) -> str: memo: typing.Dict[str, int] = dict() def count_records_cb(path: Path, match: Match) -> None: if first_party_only and not match.first_party: return - # if isinstance(path, ZonePath) \ - # or (isinstance(path, Ip4Path) and path.prefixlen < 32): if rules_only and match.level > 1: return + if no_dupplicates and match.dupplicate: + return try: memo[path.__class__.__name__] += 1 except KeyError: @@ -518,6 +523,7 @@ class Database(Profiler): updated: int, source: Path, source_match: Match = None, + dupplicate: bool = False, ) -> None: # source_match is in parameters because most of the time # its parent function needs it too, @@ -536,6 +542,7 @@ class Database(Profiler): match.first_party = source_match.first_party match.source = source source_match.references += 1 + match.dupplicate = dupplicate def _set_domain(self, hostname: bool, @@ -549,13 +556,13 @@ class Database(Profiler): is_first_party = source_match.first_party self.enter_step('set_domain_brws') dic = self.domtree + dupplicate = False for part in domain.parts: if part not in dic.children: dic.children[part] = DomainTreeNode() dic = dic.children[part] if dic.match_zone.active(is_first_party): - # Refuse to add domain whose zone is already matching - return + dupplicate = True if hostname: match = dic.match_hostname else: @@ -565,6 +572,7 @@ class Database(Profiler): updated, source, source_match=source_match, + dupplicate=dupplicate, ) def set_hostname(self, @@ -603,6 +611,7 @@ class Database(Profiler): is_first_party = source_match.first_party self.enter_step('set_ip4_brws') dic = self.ip4tree + dupplicate = False for i in range(31, 31-ip4.prefixlen, -1): bit = (ip4.value >> i) & 0b1 next_dic = dic.one if bit else dic.zero @@ -614,13 +623,13 @@ class Database(Profiler): dic.zero = next_dic dic = next_dic if dic.active(is_first_party): - # Refuse to add ip4* whose network is already matching - return + dupplicate = True self._set_match( dic, updated, source, source_match=source_match, + dupplicate=dupplicate, ) def set_ip4address(self, diff --git a/export.py b/export.py index 91f7193..8befd77 100755 --- a/export.py +++ b/export.py @@ -25,6 +25,9 @@ if __name__ == 
'__main__': parser.add_argument( '-r', '--rules', action='store_true', help="TODO") + parser.add_argument( + '-d', '--no-dupplicates', action='store_true', + help="TODO") parser.add_argument( '-c', '--count', action='store_true', help="TODO") @@ -35,7 +38,9 @@ if __name__ == '__main__': if args.count: print(DB.count_records( first_party_only=args.first_party, - rules_only=args.rules)) + rules_only=args.rules, + no_dupplicates=args.no_dupplicates, + )) else: if args.rules: for line in DB.list_rules(): @@ -43,6 +48,7 @@ if __name__ == '__main__': for domain in DB.export( first_party_only=args.first_party, end_chain_only=args.end_chain, + no_dupplicates=args.no_dupplicates, explain=args.explain, ): print(domain, file=args.output) diff --git a/export_lists.sh b/export_lists.sh index 20a34cb..7ef8156 100755 --- a/export_lists.sh +++ b/export_lists.sh @@ -6,9 +6,9 @@ function log() { log "Exporting lists…" ./export.py --first-party --output dist/firstparty-trackers.txt -./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt +./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt ./export.py --output dist/multiparty-trackers.txt -./export.py --end-chain --output dist/multiparty-only-trackers.txt +./export.py --end-chain --output --no-dupplicates dist/multiparty-only-trackers.txt log "Generating statistics…" ./export.py --count --first-party > temp/count_recs_firstparty.txt diff --git a/feed_asn.py b/feed_asn.py index aa28dfe..6acfba7 100755 --- a/feed_asn.py +++ b/feed_asn.py @@ -48,7 +48,8 @@ if __name__ == '__main__': assert isinstance(match, database.AsnNode) asn_str = database.Database.unpack_asn(path) DB.enter_step('asn_get_name') - match.name = get_name(asn_str) + name = get_name(asn_str) + match.name = name DB.enter_step('asn_get_ranges') for prefix in get_ranges(asn_str): parsed_prefix: IPNetwork = ipaddress.ip_network(prefix) @@ -58,7 +59,7 @@ if __name__ == '__main__': source=path, updated=int(time.time()) ) - log.info('Added %s from %s (%s)', prefix, asn_str, path) + log.info('Added %s from %s (%s)', prefix, path, name) elif parsed_prefix.version == 6: log.warning('Unimplemented prefix version: %s', prefix) else: diff --git a/feed_dns.old.py b/feed_dns.old.py deleted file mode 100755 index b106968..0000000 --- a/feed_dns.old.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import database -import logging -import sys -import typing -import enum - -RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR') -Record = typing.Tuple[RecordType, int, str, str] - -# select, write -FUNCTION_MAP: typing.Any = { - RecordType.A: ( - database.Database.get_ip4, - database.Database.set_hostname, - ), - RecordType.CNAME: ( - database.Database.get_domain, - database.Database.set_hostname, - ), - RecordType.PTR: ( - database.Database.get_domain, - database.Database.set_ip4address, - ), -} - - -class Parser(): - def __init__(self, buf: typing.Any) -> None: - self.buf = buf - self.log = logging.getLogger('parser') - self.db = database.Database() - - def end(self) -> None: - self.db.save() - - def register(self, - rtype: RecordType, - updated: int, - name: str, - value: str - ) -> None: - - self.db.enter_step('register') - select, write = FUNCTION_MAP[rtype] - for source in select(self.db, value): - # write(self.db, name, updated, source=source) - write(self.db, name, updated) - - def consume(self) -> None: - raise NotImplementedError - - -class Rapid7Parser(Parser): - TYPES = { - 'a': RecordType.A, - 'aaaa': 
RecordType.AAAA, - 'cname': RecordType.CNAME, - 'ptr': RecordType.PTR, - } - - def consume(self) -> None: - data = dict() - for line in self.buf: - self.db.enter_step('parse_rapid7') - split = line.split('"') - - for k in range(1, 14, 4): - key = split[k] - val = split[k+2] - data[key] = val - - self.register( - Rapid7Parser.TYPES[data['type']], - int(data['timestamp']), - data['name'], - data['value'] - ) - - -class DnsMassParser(Parser): - # dnsmass --output Snrql - # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4 - TYPES = { - 'A': (RecordType.A, -1, None), - 'AAAA': (RecordType.AAAA, -1, None), - 'CNAME': (RecordType.CNAME, -1, -1), - } - - def consume(self) -> None: - self.db.enter_step('parse_dnsmass') - timestamp = 0 - header = True - for line in self.buf: - line = line[:-1] - if not line: - header = True - continue - - split = line.split(' ') - try: - if header: - timestamp = int(split[1]) - header = False - else: - dtype, name_offset, value_offset = \ - DnsMassParser.TYPES[split[1]] - self.register( - dtype, - timestamp, - split[0][:name_offset], - split[2][:value_offset], - ) - self.db.enter_step('parse_dnsmass') - except KeyError: - continue - - -PARSERS = { - 'rapid7': Rapid7Parser, - 'dnsmass': DnsMassParser, -} - -if __name__ == '__main__': - - # Parsing arguments - log = logging.getLogger('feed_dns') - args_parser = argparse.ArgumentParser( - description="TODO") - args_parser.add_argument( - 'parser', - choices=PARSERS.keys(), - help="TODO") - args_parser.add_argument( - '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, - help="TODO") - args = args_parser.parse_args() - - parser = PARSERS[args.parser](args.input) - try: - parser.consume() - except KeyboardInterrupt: - pass - parser.end() - diff --git a/feed_dns.py b/feed_dns.py index 43df1fd..58529fd 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -181,10 +181,10 @@ if __name__ == '__main__': '-j', '--workers', type=int, default=4, help="TODO") args_parser.add_argument( - '-b', '--block-size', type=int, default=100, + '-b', '--block-size', type=int, default=1024, help="TODO") args_parser.add_argument( - '-q', '--queue-size', type=int, default=10, + '-q', '--queue-size', type=int, default=128, help="TODO") args = args_parser.parse_args() From e882e09b376891bc80568895e39655e362750813 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Tue, 17 Dec 2019 14:27:22 +0100 Subject: [PATCH 31/40] Added outdated documentation warning in README --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 55fdc45..f27b6f6 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,8 @@ That's where this scripts comes in, to generate a list of such subdomains. ## How does this script work +> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this. + It takes an input a list of websites with trackers included. So far, this list is manually-generated from the list of clients of such first-party trackers (latter we should use a general list of websites to be more exhaustive). @@ -38,6 +40,8 @@ It finally outputs the matching ones. ## Requirements +> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this. + Just to build the list, you can find an already-built list in the releases. 
- Bash @@ -54,6 +58,8 @@ Just to build the list, you can find an already-built list in the releases. ## Usage +> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this. + This is only if you want to build the list yourself. If you just want to use the list, the latest build is available here: It was build using additional sources not included in this repository for privacy reasons. From b43cb1725cc982b9268b1935b5f5d00254fc47ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Tue, 17 Dec 2019 15:02:42 +0100 Subject: [PATCH 32/40] Autosave Not needed but since the import may take multiple hour I get frustrated if this gets interrupted for some reason. --- feed_dns.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/feed_dns.py b/feed_dns.py index 58529fd..0d9dd96 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -6,7 +6,7 @@ import logging import sys import typing import multiprocessing -import enum +import time Record = typing.Tuple[typing.Callable, typing.Callable, int, str, str] @@ -30,14 +30,19 @@ FUNCTION_MAP: typing.Any = { class Writer(multiprocessing.Process): def __init__(self, recs_queue: multiprocessing.Queue, - index: int = 0): + autosave_interval: int = 0): super(Writer, self).__init__() self.log = logging.getLogger(f'wr') self.recs_queue = recs_queue + self.autosave_interval = autosave_interval def run(self) -> None: self.db = database.Database() self.db.log = logging.getLogger(f'wr') + if self.autosave_interval > 0: + next_save = time.time() + self.autosave_interval + else: + next_save = 0 self.db.enter_step('block_wait') block: typing.List[Record] @@ -55,6 +60,12 @@ class Writer(multiprocessing.Process): except ValueError: self.log.exception("Cannot execute: %s", record) + if next_save > 0 and time.time() > next_save: + self.log.info("Saving database...") + self.db.save() + self.log.info("Done!") + next_save = time.time() + self.autosave_interval + self.db.enter_step('block_wait') self.db.enter_step('end') @@ -186,12 +197,15 @@ if __name__ == '__main__': args_parser.add_argument( '-q', '--queue-size', type=int, default=128, help="TODO") + args_parser.add_argument( + '-a', '--autosave-interval', type=int, default=900, + help="TODO seconds") args = args_parser.parse_args() recs_queue: multiprocessing.Queue = multiprocessing.Queue( maxsize=args.queue_size) - writer = Writer(recs_queue) + writer = Writer(recs_queue, autosave_interval=args.autosave_interval) writer.start() parser = PARSERS[args.parser](args.input, recs_queue, args.block_size) From 747fe46ad0a13fc255fa3a4e114f1e5c1d8a9e7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Tue, 17 Dec 2019 15:04:19 +0100 Subject: [PATCH 33/40] Script to automatically download from Rapid7 datasets --- import_rapid7.sh | 26 ++++++++++++++++++++++++++ new_workflow.sh | 22 ---------------------- 2 files changed, 26 insertions(+), 22 deletions(-) create mode 100755 import_rapid7.sh delete mode 100755 new_workflow.sh diff --git a/import_rapid7.sh b/import_rapid7.sh new file mode 100755 index 0000000..c8eacd1 --- /dev/null +++ b/import_rapid7.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +function log() { + echo -e "\033[33m$@\033[0m" +} + +function feed_rapid7_fdns { # dataset + dataset=$1 + line=$(curl -s https://opendata.rapid7.com/sonar.fdns_v2/ | grep "href=\".\+-fdns_$dataset.json.gz\"") + 
link="https://opendata.rapid7.com$(echo "$line" | cut -d'"' -f2)" + log "Reading $(echo "$dataset" | awk '{print toupper($0)}') records from $link" + curl -L "$link" | gunzip | ./feed_dns.py rapid7 +} + +function feed_rapid7_rdns { # dataset + dataset=$1 + line=$(curl -s https://opendata.rapid7.com/sonar.rdns_v2/ | grep "href=\".\+-rdns.json.gz\"") + link="https://opendata.rapid7.com$(echo "$line" | cut -d'"' -f2)" + log "Reading PTR records from $link" + curl -L "$link" | gunzip | ./feed_dns.py rapid7 +} + +feed_rapid7_rdns +feed_rapid7_fdns a +# feed_rapid7_fdns aaaa +feed_rapid7_fdns cname diff --git a/new_workflow.sh b/new_workflow.sh deleted file mode 100755 index c98cd46..0000000 --- a/new_workflow.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -function log() { - echo -e "\033[33m$@\033[0m" -} - -./fetch_resources.sh -./import_rules.sh - -# TODO Fetch 'em -log "Reading PTR records…" -pv ptr.json.gz | gunzip | ./feed_dns.py -log "Reading A records…" -pv a.json.gz | gunzip | ./feed_dns.py -log "Reading CNAME records…" -pv cname.json.gz | gunzip | ./feed_dns.py - -log "Pruning old data…" -./database.py --prune - -./filter_subdomains.sh - From dce35cb29974994b77a304c6a90a8eda9be91ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Tue, 17 Dec 2019 19:53:05 +0100 Subject: [PATCH 34/40] Harder verficiation before adding entries to DB --- database.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++ export_lists.sh | 2 +- fetch_resources.sh | 5 +++- 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/database.py b/database.py index 6e4ca3a..cddc326 100644 --- a/database.py +++ b/database.py @@ -10,6 +10,8 @@ import logging import coloredlogs import pickle +TLD_LIST: typing.Set[str] = set() + coloredlogs.install( level='DEBUG', fmt='%(asctime)s %(name)s %(levelname)s %(message)s' @@ -200,6 +202,27 @@ class Database(Profiler): self.log = logging.getLogger('db') self.load() + @staticmethod + def populate_tld_list() -> None: + with open('temp/all_tld.list', 'r') as tld_fdesc: + for tld in tld_fdesc: + tld = tld.strip() + TLD_LIST.add(tld) + + @staticmethod + def validate_domain(path: str) -> bool: + if len(path) > 255: + return False + splits = path.split('.') + if not TLD_LIST: + Database.populate_tld_list() + if splits[0] not in TLD_LIST: + return False + for split in splits: + if not 1 <= len(split) <= 63: + return False + return True + @staticmethod def pack_domain(domain: str) -> DomainPath: return DomainPath(domain.split('.')[::-1]) @@ -219,6 +242,19 @@ class Database(Profiler): def unpack_asn(asn: AsnPath) -> str: return f'AS{asn.asn}' + @staticmethod + def validate_ip4address(path: str) -> bool: + splits = path.split('.') + if len(splits) != 4: + return False + for split in splits: + try: + if not 0 <= int(split) <= 255: + return False + except ValueError: + return False + return True + @staticmethod def pack_ip4address(address: str) -> Ip4Path: addr = 0 @@ -237,6 +273,21 @@ class Database(Profiler): addr >>= 8 return '.'.join(map(str, octets)) + @staticmethod + def validate_ip4network(path: str) -> bool: + # A bit generous but ok for our usage + splits = path.split('/') + if len(splits) != 2: + return False + if not Database.validate_ip4address(splits[0]): + return False + try: + if not 0 <= int(splits[1]) <= 32: + return False + except ValueError: + return False + return True + @staticmethod def pack_ip4network(network: str) -> Ip4Path: address, prefixlen_str = network.split('/') @@ -549,6 +600,9 @@ class 
Database(Profiler): domain_str: str, updated: int, source: Path) -> None: + self.enter_step('set_domain_val') + if not Database.validate_domain(domain_str): + raise ValueError(f"Invalid domain: {domain_str}") self.enter_step('set_domain_pack') domain = self.pack_domain(domain_str) self.enter_step('set_domain_fp') @@ -636,6 +690,9 @@ class Database(Profiler): ip4address_str: str, *args: typing.Any, **kwargs: typing.Any ) -> None: + self.enter_step('set_ip4add_val') + if not Database.validate_ip4address(ip4address_str): + raise ValueError(f"Invalid ip4address: {ip4address_str}") self.enter_step('set_ip4add_pack') ip4 = self.pack_ip4address(ip4address_str) self._set_ip4(ip4, *args, **kwargs) @@ -644,6 +701,9 @@ class Database(Profiler): ip4network_str: str, *args: typing.Any, **kwargs: typing.Any ) -> None: + self.enter_step('set_ip4net_val') + if not Database.validate_ip4network(ip4network_str): + raise ValueError(f"Invalid ip4network: {ip4network_str}") self.enter_step('set_ip4net_pack') ip4 = self.pack_ip4network(ip4network_str) self._set_ip4(ip4, *args, **kwargs) diff --git a/export_lists.sh b/export_lists.sh index 7ef8156..1070865 100755 --- a/export_lists.sh +++ b/export_lists.sh @@ -8,7 +8,7 @@ log "Exporting lists…" ./export.py --first-party --output dist/firstparty-trackers.txt ./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt ./export.py --output dist/multiparty-trackers.txt -./export.py --end-chain --output --no-dupplicates dist/multiparty-only-trackers.txt +./export.py --end-chain --no-dupplicates --output dist/multiparty-only-trackers.txt log "Generating statistics…" ./export.py --count --first-party > temp/count_recs_firstparty.txt diff --git a/fetch_resources.sh b/fetch_resources.sh index 00d131f..f4c95b0 100755 --- a/fetch_resources.sh +++ b/fetch_resources.sh @@ -30,6 +30,10 @@ dl https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hos # dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV.txt rules_hosts/smart-tv.cache.txt # dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/AmazonFireTV.txt rules_hosts/amazon-fire-tv.cache.txt +log "Retrieving TLD list…" +dl http://data.iana.org/TLD/tlds-alpha-by-domain.txt temp/all_tld.temp.list +grep -v '^#' temp/all_tld.temp.list | awk '{print tolower($0)}' > temp/all_tld.list + log "Retrieving nameservers…" rm -f nameservers touch nameservers @@ -51,4 +55,3 @@ then else mv temp/cisco-umbrella_popularity.fresh.list subdomains/cisco-umbrella_popularity.cache.list fi -dl https://www.orwell1984.today/cname/eulerian.net.txt subdomains/orwell-eulerian-cname-list.cache.list From aca5023c3f9d70d215e0d5fe043ea32cf3743ecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Wed, 18 Dec 2019 01:03:08 +0100 Subject: [PATCH 35/40] Fixed scripting around --- .gitignore | 2 - database.py | 59 +++++++-------- export.py | 29 ++++--- export_lists.sh | 147 ++++++++++++++++++++--------------- feed_dns.py | 12 +-- fetch_resources.sh | 7 +- filter_subdomains.py | 160 --------------------------------------- import_rules.sh | 4 +- nameservers/.gitignore | 2 + nameservers/popular.list | 24 ++++++ resolve_subdomains.sh | 17 +++-- validate_list.py | 35 +++++++++ 12 files changed, 212 insertions(+), 286 deletions(-) delete mode 100755 filter_subdomains.py create mode 100644 nameservers/.gitignore create mode 100644 nameservers/popular.list create mode 100755 validate_list.py diff --git a/.gitignore 
b/.gitignore index c72635d..e6abf3c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,2 @@ *.log *.p -nameservers -nameservers.head diff --git a/database.py b/database.py index cddc326..0a62ad1 100644 --- a/database.py +++ b/database.py @@ -216,7 +216,7 @@ class Database(Profiler): splits = path.split('.') if not TLD_LIST: Database.populate_tld_list() - if splits[0] not in TLD_LIST: + if splits[-1] not in TLD_LIST: return False for split in splits: if not 1 <= len(split) <= 63: @@ -460,62 +460,56 @@ class Database(Profiler): string += f' ← {self.explain(match.source)}' return string - def export(self, - first_party_only: bool = False, - end_chain_only: bool = False, - no_dupplicates: bool = False, - explain: bool = False, - ) -> typing.Iterable[str]: + def list_records(self, + first_party_only: bool = False, + end_chain_only: bool = False, + no_dupplicates: bool = False, + rules_only: bool = False, + hostnames_only: bool = False, + explain: bool = False, + ) -> typing.Iterable[str]: def export_cb(path: Path, match: Match ) -> typing.Iterable[str]: - assert isinstance(path, DomainPath) - if not isinstance(path, HostnamePath): - return if first_party_only and not match.first_party: return if end_chain_only and match.references > 0: return if no_dupplicates and match.dupplicate: return + if rules_only and match.level > 1: + return + if hostnames_only and not isinstance(path, HostnamePath): + return + if explain: yield self.explain(path) else: - yield self.unpack_domain(path) + yield str(path) - yield from self.exec_each_domain(export_cb) - - def list_rules(self, - first_party_only: bool = False, - ) -> typing.Iterable[str]: - - def list_rules_cb(path: Path, match: Match - ) -> typing.Iterable[str]: - if first_party_only and not match.first_party: - return - if isinstance(path, ZonePath) \ - or (isinstance(path, Ip4Path) and path.prefixlen < 32): - # if match.level == 1: - # It should be the latter condition but it is more - # useful when using the former - yield self.explain(path) - - yield from self.exec_each(list_rules_cb) + yield from self.exec_each(export_cb) def count_records(self, first_party_only: bool = False, - rules_only: bool = False, + end_chain_only: bool = False, no_dupplicates: bool = False, + rules_only: bool = False, + hostnames_only: bool = False, ) -> str: memo: typing.Dict[str, int] = dict() def count_records_cb(path: Path, match: Match) -> None: if first_party_only and not match.first_party: return - if rules_only and match.level > 1: + if end_chain_only and match.references > 0: return if no_dupplicates and match.dupplicate: return + if rules_only and match.level > 1: + return + if hostnames_only and not isinstance(path, HostnamePath): + return + try: memo[path.__class__.__name__] += 1 except KeyError: @@ -523,9 +517,10 @@ class Database(Profiler): for _ in self.exec_each(count_records_cb): pass + split: typing.List[str] = list() for key, value in sorted(memo.items(), key=lambda s: s[0]): - split.append(f'{key[:-4]}: {value}') + split.append(f'{key[:-4].lower()}s: {value}') return ', '.join(split) def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]: diff --git a/export.py b/export.py index 8befd77..8dcf2c5 100755 --- a/export.py +++ b/export.py @@ -19,15 +19,18 @@ if __name__ == '__main__': parser.add_argument( '-e', '--end-chain', action='store_true', help="TODO") - parser.add_argument( - '-x', '--explain', action='store_true', - help="TODO") parser.add_argument( '-r', '--rules', action='store_true', help="TODO") + parser.add_argument( + '-b', 
'--base-rules', action='store_true', + help="TODO implies rules") parser.add_argument( '-d', '--no-dupplicates', action='store_true', help="TODO") + parser.add_argument( + '-x', '--explain', action='store_true', + help="TODO") parser.add_argument( '-c', '--count', action='store_true', help="TODO") @@ -36,19 +39,21 @@ if __name__ == '__main__': DB = database.Database() if args.count: + assert not args.explain print(DB.count_records( - first_party_only=args.first_party, - rules_only=args.rules, - no_dupplicates=args.no_dupplicates, - )) - else: - if args.rules: - for line in DB.list_rules(): - print(line) - for domain in DB.export( first_party_only=args.first_party, end_chain_only=args.end_chain, no_dupplicates=args.no_dupplicates, + rules_only=args.base_rules, + hostnames_only=not (args.rules or args.base_rules), + )) + else: + for domain in DB.list_records( + first_party_only=args.first_party, + end_chain_only=args.end_chain, + no_dupplicates=args.no_dupplicates, + rules_only=args.base_rules, + hostnames_only=not (args.rules or args.base_rules), explain=args.explain, ): print(domain, file=args.output) diff --git a/export_lists.sh b/export_lists.sh index 1070865..b9853ed 100755 --- a/export_lists.sh +++ b/export_lists.sh @@ -4,69 +4,94 @@ function log() { echo -e "\033[33m$@\033[0m" } -log "Exporting lists…" -./export.py --first-party --output dist/firstparty-trackers.txt -./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt -./export.py --output dist/multiparty-trackers.txt -./export.py --end-chain --no-dupplicates --output dist/multiparty-only-trackers.txt +log "Calculating statistics…" +gen_date=$(date -Isec) +gen_software=$(git describe --tags) +number_websites=$(wc -l < temp/all_websites.list) +number_subdomains=$(wc -l < temp/all_subdomains.list) +number_dns=$(grep '^$' temp/all_resolved.txt | wc -l) -log "Generating statistics…" -./export.py --count --first-party > temp/count_recs_firstparty.txt -./export.py --count > temp/count_recs_multiparty.txt -./export.py --rules --count --first-party > temp/count_rules_firstparty.txt -./export.py --rules --count > temp/count_rules_multiparty.txt +for partyness in {first,multi} +do + if [ $partyness = "first" ] + then + partyness_flags="--first-party" + else + partyness_flags="" + fi -log "Sorting lists…" -sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt -sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt -sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt -sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt + echo "Statistics for ${partyness}-party trackers" + echo "Input rules: $(./export.py --count --base-rules $partyness_flags)" + echo "Subsequent rules: $(./export.py --count --rules $partyness_flags)" + echo "Subsequent rules (no dupplicate): $(./export.py --count --rules --no-dupplicates $partyness_flags)" + echo "Output hostnames: $(./export.py --count $partyness_flags)" + echo "Output hostnames (no dupplicate): $(./export.py --count --no-dupplicates $partyness_flags)" + echo "Output hostnames (end-chain only): $(./export.py --count --end-chain $partyness_flags)" + echo "Output hostnames (no dupplicate, end-chain only): $(./export.py --count --no-dupplicates --end-chain $partyness_flags)" + echo -log "Generating hosts lists…" -function generate_hosts { - basename="$1" - description="$2" - description2="$3" + for trackerness in {trackers,only-trackers} + do + if [ $trackerness = "trackers" ] + then + 
trackerness_flags="" + else + trackerness_flags="--end-chain --no-dupplicates" + fi + file_list="dist/${partyness}party-${trackerness}.txt" + file_host="dist/${partyness}party-${trackerness}-hosts.txt" - ( - echo "# First-party trackers host list" - echo "# $description" - echo "# $description2" - echo "#" - echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker" - echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien" - echo "#" - echo "# In case of false positives/negatives, or any other question," - echo "# contact me the way you like: https://geoffrey.frogeye.fr" - echo "#" - echo "# Latest version:" - echo "# - First-party trackers : https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt" - echo "# - … excluding redirected: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt" - echo "# - First and third party : https://hostfiles.frogeye.fr/multiparty-trackers-hosts.txt" - echo "# - … excluding redirected: https://hostfiles.frogeye.fr/multiparty-only-trackers-hosts.txt" - echo '# (you can remove `-hosts` to get the raw list)' - echo "#" - echo "# Generation date: $(date -Isec)" - echo "# Generation software: eulaurarien $(git describe --tags)" - echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" - echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" - echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO - echo "#" - echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)" - echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)" - echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" - echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" - echo "#" - echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)" - echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)" - echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" - echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" - echo - sed 's|^|0.0.0.0 |' "dist/$basename.txt" - ) > "dist/$basename-hosts.txt" -} + log "Generating lists for variant ${partyness}-party ${trackerness}…" -generate_hosts "firstparty-trackers" "Generated from a curated list of first-party trackers" "" -generate_hosts "firstparty-only-trackers" "Generated from a curated list of first-party trackers" "Only contain the first chain of redirection." -generate_hosts "multiparty-trackers" "Generated from known third-party trackers." "Also contains trackers used as third-party." -generate_hosts "multiparty-only-trackers" "Generated from known third-party trackers." "Do not contain trackers used in third-party. Use in combination with third-party lists." 
+ # Real export heeere + ./export.py $partyness_flags $trackerness_flags > $file_list + # Sometimes a bit heavy to have the DB open and sort the output + # so this is done in two steps + sort -u $file_list -o $file_list + + rules_input=$(./export.py --count --base-rules $partyness_flags) + rules_found=$(./export.py --count --rules $partyness_flags) + rules_output=$(./export.py --count $partyness_flags $trackerness_flags) + + function link() { # link partyness, link trackerness + url="https://hostfiles.frogeye.fr/${partyness}party-${trackerness}-hosts.txt" + if [ "$1" = "$partyness" ] && [ "$2" = "$trackerness" ] + then + url="$url (this one)" + fi + echo $url + } + + ( + echo "# First-party trackers host list" + echo "# Variant: ${partyness}-party ${trackerness}" + echo "#" + echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker" + echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien" + echo "#" + echo "# In case of false positives/negatives, or any other question," + echo "# contact me the way you like: https://geoffrey.frogeye.fr" + echo "#" + echo "# Latest versions:" + echo "# - First-party trackers : $(link first trackers)" + echo "# - … excluding redirected: $(link first only-trackers)" + echo "# - First and third party : $(link multi trackers)" + echo "# - … excluding redirected: $(link multi only-trackers)" + echo '# (you can remove `-hosts` to get the raw list)' + echo "#" + echo "# Generation date: $gen_date" + echo "# Generation software: eulaurarien $gen_software" + echo "# Number of source websites: $number_websites" + echo "# Number of source subdomains: $number_subdomains" + echo "# Number of source DNS records: ~2E9 + $number_dns" + echo "#" + echo "# Input rules: $rules_input" + echo "# Subsequent rules: $rules_found" + echo "# Output rules: $rules_output" + echo "#" + echo + sed 's|^|0.0.0.0 |' "$file_list" + ) > "$file_host" + + done +done diff --git a/feed_dns.py b/feed_dns.py index 0d9dd96..f923831 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -130,8 +130,8 @@ class Rapid7Parser(Parser): self.register(record) -class DnsMassParser(Parser): - # dnsmass --output Snrql +class MassDnsParser(Parser): + # massdns --output Snrql # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4 TYPES = { 'A': (FUNCTION_MAP['a'][0], FUNCTION_MAP['a'][1], -1, None), @@ -140,7 +140,7 @@ class DnsMassParser(Parser): } def consume(self) -> None: - self.prof.enter_step('parse_dnsmass') + self.prof.enter_step('parse_massdns') timestamp = 0 header = True for line in self.buf: @@ -156,7 +156,7 @@ class DnsMassParser(Parser): header = False else: select, write, name_offset, value_offset = \ - DnsMassParser.TYPES[split[1]] + MassDnsParser.TYPES[split[1]] record = ( select, write, @@ -165,14 +165,14 @@ class DnsMassParser(Parser): split[2][:value_offset], ) self.register(record) - self.prof.enter_step('parse_dnsmass') + self.prof.enter_step('parse_massdns') except KeyError: continue PARSERS = { 'rapid7': Rapid7Parser, - 'dnsmass': DnsMassParser, + 'massdns': MassDnsParser, } if __name__ == '__main__': diff --git a/fetch_resources.sh b/fetch_resources.sh index f4c95b0..d659fbc 100755 --- a/fetch_resources.sh +++ b/fetch_resources.sh @@ -35,12 +35,7 @@ dl http://data.iana.org/TLD/tlds-alpha-by-domain.txt temp/all_tld.temp.list grep -v '^#' temp/all_tld.temp.list | awk '{print tolower($0)}' > temp/all_tld.list log "Retrieving nameservers…" -rm -f nameservers -touch nameservers -[ -f nameservers.head ] && cat nameservers.head >> 
nameservers -dl https://public-dns.info/nameservers.txt nameservers.temp -sort -R nameservers.temp >> nameservers -rm nameservers.temp +dl https://public-dns.info/nameservers.txt nameservers/public-dns.list log "Retrieving top subdomains…" dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip diff --git a/filter_subdomains.py b/filter_subdomains.py deleted file mode 100755 index 601a031..0000000 --- a/filter_subdomains.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python3 -# pylint: disable=C0103 - -""" -From a list of subdomains, output only -the ones resolving to a first-party tracker. -""" - -import argparse -import sys -import progressbar -import csv -import typing -import ipaddress - -# DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']] -DomainRule = typing.Union[bool, typing.Dict] -# IpRule = typing.Union[bool, typing.Dict[int, 'DomainRule']] -IpRule = typing.Union[bool, typing.Dict] - -RULES_DICT: DomainRule = dict() -RULES_IP_DICT: IpRule = dict() - - -def get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]: - for char in address.packed: - for i in range(7, -1, -1): - yield (char >> i) & 0b1 - - -def subdomain_matching(subdomain: str) -> bool: - parts = subdomain.split('.') - parts.reverse() - dic = RULES_DICT - for part in parts: - if isinstance(dic, bool) or part not in dic: - break - dic = dic[part] - if isinstance(dic, bool): - return dic - return False - - -def ip_matching(ip_str: str) -> bool: - ip = ipaddress.ip_address(ip_str) - dic = RULES_IP_DICT - i = 0 - for bit in get_bits(ip): - i += 1 - if isinstance(dic, bool) or bit not in dic: - break - dic = dic[bit] - if isinstance(dic, bool): - return dic - return False - - -def get_matching(chain: typing.List[str], no_explicit: bool = False - ) -> typing.Iterable[str]: - if len(chain) <= 1: - return - initial = chain[0] - cname_destinations = chain[1:-1] - a_destination = chain[-1] - initial_matching = subdomain_matching(initial) - if no_explicit and initial_matching: - return - cname_matching = any(map(subdomain_matching, cname_destinations)) - if cname_matching or initial_matching or ip_matching(a_destination): - yield initial - - -def register_rule(subdomain: str) -> None: - # Make a tree with domain parts - parts = subdomain.split('.') - parts.reverse() - dic = RULES_DICT - last_part = len(parts) - 1 - for p, part in enumerate(parts): - if isinstance(dic, bool): - return - if p == last_part: - dic[part] = True - else: - dic.setdefault(part, dict()) - dic = dic[part] - - -def register_rule_ip(network: str) -> None: - net = ipaddress.ip_network(network) - ip = net.network_address - dic = RULES_IP_DICT - last_bit = net.prefixlen - 1 - for b, bit in enumerate(get_bits(ip)): - if isinstance(dic, bool): - return - if b == last_bit: - dic[bit] = True - else: - dic.setdefault(bit, dict()) - dic = dic[bit] - - -if __name__ == '__main__': - - # Parsing arguments - parser = argparse.ArgumentParser( - description="Filter first-party trackers from a list of subdomains") - parser.add_argument( - '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, - help="Input file with DNS chains") - parser.add_argument( - '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, - help="Outptut file with one tracking subdomain per line") - parser.add_argument( - '-n', '--no-explicit', action='store_true', - help="Don't output domains already blocked with rules without CNAME") - parser.add_argument( - '-r', '--rules', type=argparse.FileType('r'), - help="List of domains 
domains to block (with their subdomains)") - parser.add_argument( - '-p', '--rules-ip', type=argparse.FileType('r'), - help="List of IPs ranges to block") - args = parser.parse_args() - - # Progress bar - widgets = [ - progressbar.Percentage(), - ' ', progressbar.SimpleProgress(), - ' ', progressbar.Bar(), - ' ', progressbar.Timer(), - ' ', progressbar.AdaptiveTransferSpeed(unit='req'), - ' ', progressbar.AdaptiveETA(), - ] - progress = progressbar.ProgressBar(widgets=widgets) - - # Reading rules - if args.rules: - for rule in args.rules: - register_rule(rule.strip()) - if args.rules_ip: - for rule in args.rules_ip: - register_rule_ip(rule.strip()) - - # Approximating line count - if args.input.seekable(): - lines = 0 - for line in args.input: - lines += 1 - progress.max_value = lines - args.input.seek(0) - - # Reading domains to filter - reader = csv.reader(args.input) - progress.start() - for chain in reader: - for match in get_matching(chain, no_explicit=args.no_explicit): - print(match, file=args.output) - progress.update(progress.value + 1) - progress.finish() diff --git a/import_rules.sh b/import_rules.sh index 14c8c78..cbcfbd8 100755 --- a/import_rules.sh +++ b/import_rules.sh @@ -18,5 +18,5 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as ./feed_asn.py -log "Pruning old rules…" -./db.py --prune --prune-before "$BEFORE" --prune-base +# log "Pruning old rules…" +# ./db.py --prune --prune-before "$BEFORE" --prune-base diff --git a/nameservers/.gitignore b/nameservers/.gitignore new file mode 100644 index 0000000..dbd03bc --- /dev/null +++ b/nameservers/.gitignore @@ -0,0 +1,2 @@ +*.custom.list +*.cache.list diff --git a/nameservers/popular.list b/nameservers/popular.list new file mode 100644 index 0000000..c35d391 --- /dev/null +++ b/nameservers/popular.list @@ -0,0 +1,24 @@ +8.8.8.8 +8.8.4.4 +2001:4860:4860:0:0:0:0:8888 +2001:4860:4860:0:0:0:0:8844 +208.67.222.222 +208.67.220.220 +2620:119:35::35 +2620:119:53::53 +4.2.2.1 +4.2.2.2 +8.26.56.26 +8.20.247.20 +84.200.69.80 +84.200.70.40 +2001:1608:10:25:0:0:1c04:b12f +2001:1608:10:25:0:0:9249:d69b +9.9.9.10 +149.112.112.10 +2620:fe::10 +2620:fe::fe:10 +1.1.1.1 +1.0.0.1 +2606:4700:4700::1111 +2606:4700:4700::1001 diff --git a/resolve_subdomains.sh b/resolve_subdomains.sh index e37ddeb..7a91337 100755 --- a/resolve_subdomains.sh +++ b/resolve_subdomains.sh @@ -4,9 +4,16 @@ function log() { echo -e "\033[33m$@\033[0m" } -log "Compiling locally known subdomain…" -# Sort by last character to utilize the DNS server caching mechanism -pv subdomains/*.list | sed 's/\r$//' | rev | sort -u | rev > temp/all_subdomains.list -log "Resolving locally known subdomain…" -pv temp/all_subdomains.list | ./resolve_subdomains.py --output temp/all_resolved.csv +log "Compiling nameservers…" +pv nameservers/*.list | ./validate_list.py --ip4 | sort -u > temp/all_nameservers_ip4.list +log "Compiling subdomain…" +# Sort by last character to utilize the DNS server caching mechanism +# (not as efficient with massdns but it's almost free so why not) +pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list + +log "Resolving subdomain…" +massdns --output Snrql --retry REFUSED,SERVFAIL --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list + +log "Importing into database…" +pv temp/all_resolved.txt | ./feed_dns.py massdns diff --git a/validate_list.py b/validate_list.py new file mode 100755 index 0000000..62301c2 --- /dev/null +++ b/validate_list.py 
@@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# pylint: disable=C0103 + +""" +Filter out invalid domain names +""" + +import database +import argparse +import sys + +if __name__ == '__main__': + + # Parsing arguments + parser = argparse.ArgumentParser( + description="Filter out invalid domain names.") + parser.add_argument( + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, + help="TODO") + parser.add_argument( + '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, + help="TODO") + parser.add_argument( + '-d', '--domain', action='store_true', + help="Can be domain") + parser.add_argument( + '-4', '--ip4', action='store_true', + help="Can be IP4") + args = parser.parse_args() + + for line in args.input: + line = line.strip() + if (args.domain and database.Database.validate_domain(line)) or \ + (args.ip4 and database.Database.validate_ip4address(line)): + print(line, file=args.output) From 06b745890c8d26e7ae5dc7f5e09312e200a672d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Wed, 18 Dec 2019 17:03:05 +0100 Subject: [PATCH 36/40] Added other first-party trackers --- rules/first-party.list | 9 ++++++++- tests/first-party.csv | 3 +++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/rules/first-party.list b/rules/first-party.list index b7c393e..54246cd 100644 --- a/rules/first-party.list +++ b/rules/first-party.list @@ -18,7 +18,14 @@ omtrdc.net online-metrix.net # Webtrekk wt-eu02.net +webtrekk.net # Otto Group oghub.io -# ??? +# Intent.com partner.intentmedia.net +# Wizaly +wizaly.com +# Commanders Act +tagcommander.com +# Affex Marketing +affex.org diff --git a/tests/first-party.csv b/tests/first-party.csv index 5084bcb..92ff458 100644 --- a/tests/first-party.csv +++ b/tests/first-party.csv @@ -5,3 +5,6 @@ https://www.discover.com/,,content.discover.com,ThreatMetrix https://www.mytoys.de/,,web.mytoys.de,Webtrekk https://www.baur.de/,,tp.baur.de,Otto Group https://www.liligo.com/,,compare.liligo.com,??? 
+https://www.boulanger.com/,,tag.boulanger.fr,TagCommander +https://www.airfrance.fr/FR/,,tk.airfrance.fr,Wizaly +https://www.vsgamers.es/,,marketing.net.vsgamers.es,Affex From 4a2205479669667b2bc17274551fc862612737ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Wed, 18 Dec 2019 21:23:49 +0100 Subject: [PATCH 37/40] Added optional cache for faster IP matching --- database.py | 39 +++++++++++++++++++++++++++++++++++++-- feed_dns.py | 11 +++++++++-- feed_rules.py | 14 +++++++++----- fetch_resources.sh | 2 +- import_rapid7.sh | 12 ++++++------ rules/first-party.list | 2 +- 6 files changed, 63 insertions(+), 17 deletions(-) diff --git a/database.py b/database.py index 0a62ad1..c37369f 100644 --- a/database.py +++ b/database.py @@ -9,6 +9,8 @@ import time import logging import coloredlogs import pickle +import numpy +import math TLD_LIST: typing.Set[str] = set() @@ -201,6 +203,33 @@ class Database(Profiler): Profiler.__init__(self) self.log = logging.getLogger('db') self.load() + self.ip4cache_shift: int = 32 + self.ip4cache = numpy.ones(1) + + def _set_ip4cache(self, path: Path, _: Match) -> None: + assert isinstance(path, Ip4Path) + self.enter_step('set_ip4cache') + mini = path.value >> self.ip4cache_shift + maxi = (path.value + 2**(32-path.prefixlen)) >> self.ip4cache_shift + if mini == maxi: + self.ip4cache[mini] = True + else: + self.ip4cache[mini:maxi] = True + + def fill_ip4cache(self, max_size: int = 512*1024**2) -> None: + """ + Size in bytes + """ + if max_size > 2**32/8: + self.log.warning("Allocating more than 512 MiB of RAM for " + "the Ip4 cache is not necessary.") + max_cache_width = int(math.log2(max(1, max_size*8))) + cache_width = min(2**32, max_cache_width) + self.ip4cache_shift = 32-cache_width + cache_size = 2**cache_width + self.ip4cache = numpy.zeros(cache_size, dtype=numpy.bool) + for _ in self.exec_each_ip4(self._set_ip4cache): + pass @staticmethod def populate_tld_list() -> None: @@ -404,8 +433,9 @@ class Database(Profiler): pref = _par.prefixlen + 1 dic = _dic.zero if dic: - addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-pref))) - assert addr0 == _par.value + # addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-pref))) + # assert addr0 == _par.value + addr0 = _par.value yield from self.exec_each_ip4( callback, _dic=dic, @@ -415,6 +445,7 @@ class Database(Profiler): dic = _dic.one if dic: addr1 = _par.value | (1 << (32-pref)) + # assert addr1 != _par.value yield from self.exec_each_ip4( callback, _dic=dic, @@ -548,6 +579,9 @@ class Database(Profiler): def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]: self.enter_step('get_ip4_pack') ip4 = self.pack_ip4address(ip4_str) + self.enter_step('get_ip4_cache') + if not self.ip4cache[ip4.value >> self.ip4cache_shift]: + return self.enter_step('get_ip4_brws') dic = self.ip4tree for i in range(31, 31-ip4.prefixlen, -1): @@ -680,6 +714,7 @@ class Database(Profiler): source_match=source_match, dupplicate=dupplicate, ) + self._set_ip4cache(ip4, dic) def set_ip4address(self, ip4address_str: str, diff --git a/feed_dns.py b/feed_dns.py index f923831..03b9429 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -30,15 +30,19 @@ FUNCTION_MAP: typing.Any = { class Writer(multiprocessing.Process): def __init__(self, recs_queue: multiprocessing.Queue, - autosave_interval: int = 0): + autosave_interval: int = 0, + ip4_cache: int = 0, + ): super(Writer, self).__init__() self.log = logging.getLogger(f'wr') self.recs_queue = recs_queue self.autosave_interval = autosave_interval + self.ip4_cache = 
ip4_cache def run(self) -> None: self.db = database.Database() self.db.log = logging.getLogger(f'wr') + self.db.fill_ip4cache(max_size=self.ip4_cache) if self.autosave_interval > 0: next_save = time.time() + self.autosave_interval else: @@ -200,12 +204,15 @@ if __name__ == '__main__': args_parser.add_argument( '-a', '--autosave-interval', type=int, default=900, help="TODO seconds") + args_parser.add_argument( + '-4', '--ip4-cache', type=int, default=0, + help="TODO bytes max 512 MiB") args = args_parser.parse_args() recs_queue: multiprocessing.Queue = multiprocessing.Queue( maxsize=args.queue_size) - writer = Writer(recs_queue, autosave_interval=args.autosave_interval) + writer = Writer(recs_queue, autosave_interval=args.autosave_interval, ip4_cache=args.ip4_cache) writer.start() parser = PARSERS[args.parser](args.input, recs_queue, args.block_size) diff --git a/feed_rules.py b/feed_rules.py index 2b5596e..0889900 100755 --- a/feed_rules.py +++ b/feed_rules.py @@ -39,10 +39,14 @@ if __name__ == '__main__': source = database.RuleMultiPath() for rule in args.input: - fun(DB, - rule.strip(), - source=source, - updated=int(time.time()), - ) + rule = rule.strip() + try: + fun(DB, + rule, + source=source, + updated=int(time.time()), + ) + except ValueError: + DB.log.error(f"Could not add rule: {rule}") DB.save() diff --git a/fetch_resources.sh b/fetch_resources.sh index d659fbc..cb66ff7 100755 --- a/fetch_resources.sh +++ b/fetch_resources.sh @@ -35,7 +35,7 @@ dl http://data.iana.org/TLD/tlds-alpha-by-domain.txt temp/all_tld.temp.list grep -v '^#' temp/all_tld.temp.list | awk '{print tolower($0)}' > temp/all_tld.list log "Retrieving nameservers…" -dl https://public-dns.info/nameservers.txt nameservers/public-dns.list +dl https://public-dns.info/nameservers.txt nameservers/public-dns.cache.list log "Retrieving top subdomains…" dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip diff --git a/import_rapid7.sh b/import_rapid7.sh index c8eacd1..993bfe7 100755 --- a/import_rapid7.sh +++ b/import_rapid7.sh @@ -9,7 +9,7 @@ function feed_rapid7_fdns { # dataset line=$(curl -s https://opendata.rapid7.com/sonar.fdns_v2/ | grep "href=\".\+-fdns_$dataset.json.gz\"") link="https://opendata.rapid7.com$(echo "$line" | cut -d'"' -f2)" log "Reading $(echo "$dataset" | awk '{print toupper($0)}') records from $link" - curl -L "$link" | gunzip | ./feed_dns.py rapid7 + curl -L "$link" | gunzip } function feed_rapid7_rdns { # dataset @@ -17,10 +17,10 @@ function feed_rapid7_rdns { # dataset line=$(curl -s https://opendata.rapid7.com/sonar.rdns_v2/ | grep "href=\".\+-rdns.json.gz\"") link="https://opendata.rapid7.com$(echo "$line" | cut -d'"' -f2)" log "Reading PTR records from $link" - curl -L "$link" | gunzip | ./feed_dns.py rapid7 + curl -L "$link" | gunzip } -feed_rapid7_rdns -feed_rapid7_fdns a -# feed_rapid7_fdns aaaa -feed_rapid7_fdns cname +feed_rapid7_rdns | ./feed_dns.py rapid7 +feed_rapid7_fdns a | ./feed_dns.py rapid7 --ip4-cache 536870912 +# feed_rapid7_fdns aaaa | ./feed_dns.py rapid7 --ip6-cache 536870912 +feed_rapid7_fdns cname | ./feed_dns.py rapid7 diff --git a/rules/first-party.list b/rules/first-party.list index 54246cd..3092397 100644 --- a/rules/first-party.list +++ b/rules/first-party.list @@ -27,5 +27,5 @@ partner.intentmedia.net wizaly.com # Commanders Act tagcommander.com -# Affex Marketing +# Ingenious Technologies affex.org From c81be4825c7f73cf54fb8756e681a25211dbe552 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Wed, 18 Dec 2019 22:46:00 +0100 Subject: [PATCH 38/40] Automated tests Very rudimentary but should do the trick Closes #4 --- run_tests.py | 34 ++++++++++++++++++++++++++++++++++ tests/false-positives.csv | 1 - 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100755 run_tests.py diff --git a/run_tests.py b/run_tests.py new file mode 100755 index 0000000..548b6eb --- /dev/null +++ b/run_tests.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +import database +import os +import logging +import csv + +TESTS_DIR = 'tests' + +if __name__ == '__main__': + + DB = database.Database() + log = logging.getLogger('tests') + + for filename in os.listdir(TESTS_DIR): + log.info("") + log.info("Running tests from %s", filename) + path = os.path.join(TESTS_DIR, filename) + with open(path, 'rt') as fdesc: + reader = csv.DictReader(fdesc) + for test in reader: + log.info("Testing %s (%s)", test['url'], test['comment']) + + for white in test['white'].split(':'): + if not white: + continue + if any(DB.get_domain(white)): + log.error("False positive: %s", white) + + for black in test['black'].split(':'): + if not black: + continue + if not any(DB.get_domain(black)): + log.error("False negative: %s", black) diff --git a/tests/false-positives.csv b/tests/false-positives.csv index c20639a..664b630 100644 --- a/tests/false-positives.csv +++ b/tests/false-positives.csv @@ -1,6 +1,5 @@ url,white,black,comment https://support.apple.com,support.apple.com,,EdgeKey / AkamaiEdge https://www.pinterest.fr/,i.pinimg.com,,Cedexis -https://www.pinterest.fr/,i.pinimg.com,,Cedexis https://www.tumblr.com/,66.media.tumblr.com,,ChiCDN https://www.skype.com/fr/,www.skype.com,,TrafficManager From 53b14c6ffae2a1472b6338b01c458b5d0e633ffe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Thu, 19 Dec 2019 08:05:05 +0100 Subject: [PATCH 39/40] Removed TODO placeholders in commands description It's better than nothing but not by that much --- db.py | 8 +++++--- export.py | 23 ++++++++++++++--------- feed_asn.py | 2 +- feed_dns.py | 29 +++++++++++++++++------------ feed_rules.py | 8 +++++--- import_rapid7.sh | 2 +- validate_list.py | 8 ++++---- 7 files changed, 47 insertions(+), 33 deletions(-) diff --git a/db.py b/db.py index 4ecec6b..91d00c5 100755 --- a/db.py +++ b/db.py @@ -18,14 +18,16 @@ if __name__ == '__main__': help="Remove old entries from database") parser.add_argument( '-b', '--prune-base', action='store_true', - help="TODO") + help="With --prune, only prune base rules " + "(the ones added by ./feed_rules.py)") parser.add_argument( '-s', '--prune-before', type=int, default=(int(time.time()) - 60*60*24*31*6), - help="TODO") + help="With --prune, only rules updated before " + "this UNIX timestamp will be deleted") parser.add_argument( '-r', '--references', action='store_true', - help="Update the reference count") + help="DEBUG: Update the reference count") args = parser.parse_args() if not args.initialize: diff --git a/export.py b/export.py index 8dcf2c5..c5eefb2 100755 --- a/export.py +++ b/export.py @@ -9,31 +9,36 @@ if __name__ == '__main__': # Parsing arguments parser = argparse.ArgumentParser( - description="TODO") + description="Export the hostnames rules stored " + "in the Database as plain text") parser.add_argument( '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, - help="TODO") + help="Output file, one rule per line") parser.add_argument( '-f', '--first-party', 
action='store_true', - help="TODO") + help="Only output rules issued from first-party sources") parser.add_argument( '-e', '--end-chain', action='store_true', - help="TODO") + help="Only output rules that are not referenced by any other") parser.add_argument( '-r', '--rules', action='store_true', - help="TODO") + help="Output all kinds of rules, not just hostnames") parser.add_argument( '-b', '--base-rules', action='store_true', - help="TODO implies rules") + help="Output base rules " + "(the ones added by ./feed_rules.py) " + "(implies --rules)") parser.add_argument( '-d', '--no-dupplicates', action='store_true', - help="TODO") + help="Do not output rules that already match a zone/network rule " + "(e.g. dummy.example.com when there's a zone example.com rule)") parser.add_argument( '-x', '--explain', action='store_true', - help="TODO") + help="Show the chain of rules leading to one " + "(and the number of references they have)") parser.add_argument( '-c', '--count', action='store_true', - help="TODO") + help="Show the number of rules per type instead of listing them") args = parser.parse_args() DB = database.Database() diff --git a/feed_asn.py b/feed_asn.py index 6acfba7..25a35e2 100755 --- a/feed_asn.py +++ b/feed_asn.py @@ -36,7 +36,7 @@ if __name__ == '__main__': # Parsing arguments parser = argparse.ArgumentParser( - description="TODO") + description="Add the IP ranges associated to the AS in the database") args = parser.parse_args() DB = database.Database() diff --git a/feed_dns.py b/feed_dns.py index 03b9429..74fe1dd 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -184,35 +184,40 @@ if __name__ == '__main__': # Parsing arguments log = logging.getLogger('feed_dns') args_parser = argparse.ArgumentParser( - description="TODO") + description="Read DNS records and import " + "tracking-relevant data into the database") args_parser.add_argument( 'parser', choices=PARSERS.keys(), - help="TODO") + help="Input format") args_parser.add_argument( '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, - help="TODO") - args_parser.add_argument( - '-j', '--workers', type=int, default=4, - help="TODO") + help="Input file") args_parser.add_argument( '-b', '--block-size', type=int, default=1024, - help="TODO") + help="Performance tuning value") args_parser.add_argument( '-q', '--queue-size', type=int, default=128, - help="TODO") + help="Performance tuning value") args_parser.add_argument( '-a', '--autosave-interval', type=int, default=900, - help="TODO seconds") + help="Interval to which the database will save in seconds. " + "0 to disable.") args_parser.add_argument( '-4', '--ip4-cache', type=int, default=0, - help="TODO bytes max 512 MiB") + help="RAM cache for faster IPv4 lookup. " + "Maximum useful value: 512 MiB (536870912). 
" + "Warning: Depending on the rules, this might already " + "be a memory-heavy process, even without the cache.") args = args_parser.parse_args() recs_queue: multiprocessing.Queue = multiprocessing.Queue( - maxsize=args.queue_size) + maxsize=args.queue_size) - writer = Writer(recs_queue, autosave_interval=args.autosave_interval, ip4_cache=args.ip4_cache) + writer = Writer(recs_queue, + autosave_interval=args.autosave_interval, + ip4_cache=args.ip4_cache + ) writer.start() parser = PARSERS[args.parser](args.input, recs_queue, args.block_size) diff --git a/feed_rules.py b/feed_rules.py index 0889900..9d0365f 100755 --- a/feed_rules.py +++ b/feed_rules.py @@ -7,22 +7,24 @@ import time FUNCTION_MAP = { 'zone': database.Database.set_zone, - 'ip4network': database.Database.set_ip4network, + 'hostname': database.Database.set_hostname, 'asn': database.Database.set_asn, + 'ip4network': database.Database.set_ip4network, + 'ip4address': database.Database.set_ip4address, } if __name__ == '__main__': # Parsing arguments parser = argparse.ArgumentParser( - description="TODO") + description="Import base rules to the database") parser.add_argument( 'type', choices=FUNCTION_MAP.keys(), help="Type of rule inputed") parser.add_argument( '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, - help="List of domains domains to block (with their subdomains)") + help="File with one rule per line") parser.add_argument( '-f', '--first-party', action='store_true', help="The input only comes from verified first-party sources") diff --git a/import_rapid7.sh b/import_rapid7.sh index 993bfe7..4b5714f 100755 --- a/import_rapid7.sh +++ b/import_rapid7.sh @@ -12,7 +12,7 @@ function feed_rapid7_fdns { # dataset curl -L "$link" | gunzip } -function feed_rapid7_rdns { # dataset +function feed_rapid7_rdns { dataset=$1 line=$(curl -s https://opendata.rapid7.com/sonar.rdns_v2/ | grep "href=\".\+-rdns.json.gz\"") link="https://opendata.rapid7.com$(echo "$line" | cut -d'"' -f2)" diff --git a/validate_list.py b/validate_list.py index 62301c2..23e46d7 100755 --- a/validate_list.py +++ b/validate_list.py @@ -13,16 +13,16 @@ if __name__ == '__main__': # Parsing arguments parser = argparse.ArgumentParser( - description="Filter out invalid domain names.") + description="Filter out invalid domain name/ip addresses from a list.") parser.add_argument( '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, - help="TODO") + help="Input file, one element per line") parser.add_argument( '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, - help="TODO") + help="Output file, one element per line") parser.add_argument( '-d', '--domain', action='store_true', - help="Can be domain") + help="Can be domain name") parser.add_argument( '-4', '--ip4', action='store_true', help="Can be IP4") From 38cf532854e8a82860dcf9f75b6946a41c76ca36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 20 Dec 2019 17:15:39 +0100 Subject: [PATCH 40/40] Updated README Split in two actually (program and list). Closes #3 Also, Closes #1 Because I forgot to do it earlier. 
---
 README.md | 169 +++++++++++++++++++++++++++------------------
 dist/README.md | 74 ++++++++++++++++++++
 export_lists.sh | 7 +-
 fetch_resources.sh | 12 ----
 4 files changed, 180 insertions(+), 82 deletions(-)
 create mode 100644 dist/README.md
diff --git a/README.md b/README.md
index f27b6f6..7229f30 100644
--- a/README.md
+++ b/README.md
@@ -1,98 +1,133 @@
 # eulaurarien
-Generates a host list of first-party trackers for ad-blocking.
+This program is able to generate a list of every hostname that is a DNS redirection to a list of DNS zones and IP networks.
-The latest list is available here:
+It is primarily used to generate [Geoffrey Frogeye's block list of first-party trackers](https://git.frogeye.fr/geoffrey/eulaurarien/src/branch/master/dist/README.md) (learn about first-party trackers by following this link).
-**DISCLAIMER:** I'm by no way an expert on this subject so my vocabulary or other stuff might be wrong. Use at your own risk.
+If you want to contribute but don't want to create an account on this forge, contact me the way you like:
-## What's a first-party tracker?
+## How does this work
-Traditionally, websites load trackers scripts directly.
-For example, `website1.com` and `website2.com` both load `https://trackercompany.com/trackerscript.js` to track their users.
-In order to block those, one can simply block the host `trackercompany.com`.
+This program takes as input:
-However, to circumvent this easy block, tracker companies made the website using them load trackers from `somethingirelevant.website1.com`.
-The latter being a DNS redirection to `website1.trackercompany.com`, directly pointing to a server serving the tracking script.
-Those are the first-party trackers.
+- Lists of hostnames to match
+- Lists of DNS zones to match (a domain and its subdomains)
+- Lists of IP addresses / IP networks to match
+- Lists of Autonomous System numbers to match
+- An enormous quantity of DNS records
-Blocking `trackercompany.com` doesn't work any more, and blocking `*.trackercompany.com` isn't really possible since:
+It will be able to output the hostnames that are DNS redirections to any item in the lists provided.
-1. Most ad-blocker don't support wildcards
-2. It's a DNS redirection, meaning that most ad-blockers will only see `somethingirelevant.website1.com`
+DNS records can either come from [Rapid7 Open Data Sets](https://opendata.rapid7.com/sonar.fdns_v2/) or can be locally resolved from a list of subdomains using [MassDNS](https://github.com/blechschmidt/massdns).
-So the only solution is to block every `somethingirelevant.website1.com`-like subdomains known, which is a lot.
-That's where this scripts comes in, to generate a list of such subdomains.
-
-## How does this script work
-
-> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.
-
-It takes an input a list of websites with trackers included.
-So far, this list is manually-generated from the list of clients of such first-party trackers
-(latter we should use a general list of websites to be more exhaustive).
-It open each ones of those websites (just the homepage) in a web browser, and record the domains of the network requests the page makes.
-
-Additionaly, or alternatively, you can feed the script some browsing history and get domains from there.
-
-It then find the DNS redirections of those domains, and compare with regexes of known tracking domains.
-It finally outputs the matching ones.
-
-## Requirements
-
-> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.
-
-Just to build the list, you can find an already-built list in the releases.
-
-- Bash
-- [Python 3.4+](https://www.python.org/)
-- [progressbar2](https://pypi.org/project/progressbar2/)
-- dnspython
-- [A Python wrapper for re2](https://pypi.org/project/google-re2/) (optional, just speeds things up)
-
-(if you don't want to collect the subdomains, you can skip the following)
-
-- Firefox
-- Selenium
-- seleniumwire
+Those subdomains can either be provided as is, come from [Cisco Umbrella Popularity List](http://s3-us-west-1.amazonaws.com/umbrella-static/index.html), from your browsing history, or from analyzing the traffic a web browser makes when opening a URL (the program provides utilities to do all that).
 ## Usage
-> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.
+Remember you can get an already-generated and up-to-date list of first-party trackers from [here](https://git.frogeye.fr/geoffrey/eulaurarien/src/branch/master/dist/README.md).
-This is only if you want to build the list yourself.
-If you just want to use the list, the latest build is available here:
-It was build using additional sources not included in this repository for privacy reasons.
+The following is for people who want to build their own list.
-### Add personal sources
+### Requirements
-The list of websites provided in this script is by no mean exhaustive,
-so adding your own browsing history will help create a better list.
+Depending on the sources you'll be using to generate the list, you'll need to install some of the following:
+
+- [Bash](https://www.gnu.org/software/bash/bash.html)
+- [Coreutils](https://www.gnu.org/software/coreutils/)
+- [curl](https://curl.haxx.se)
+- [pv](http://www.ivarch.com/programs/pv.shtml)
+- [Python 3.4+](https://www.python.org/)
+- [coloredlogs](https://pypi.org/project/coloredlogs/) (sorry I can't help myself)
+- [massdns](https://github.com/blechschmidt/massdns) in your `$PATH` (only if you have subdomains as a source)
+- [Firefox](https://www.mozilla.org/firefox/) (only if you have websites as a source)
+- [selenium (Python bindings)](https://pypi.python.org/pypi/selenium) (only if you have websites as a source)
+- [selenium-wire](https://pypi.org/project/selenium-wire/) (only if you have websites as a source)
+
+### Create a new database
+
+The so-called database (in the form of `blocking.p`) is a file storing all the matching entities (ASN, IPs, hostnames, zones…) and every entity leading to it.
+For now there's no way to remove data from it, so here's the command to recreate it: `./db.py --initialize`.
+
+### Gather external sources
+
+External sources are not stored in this repository.
+You'll need to fetch them by running `./fetch_resources.sh`.
+Those include:
+
+- Third-party tracker lists
+- TLD lists (used to test the validity of hostnames)
+- List of public DNS resolvers (for DNS resolving from subdomains)
+- Top 1M subdomains
+
+### Import rules into the database
+
+You need to put the lists of rules for matching in the different subfolders:
+
+- `rules`: Lists of DNS zones
+- `rules_ip`: Lists of IP networks (for IP addresses append `/32`)
+- `rules_asn`: Lists of Autonomous System numbers (IP ranges will be deduced from them)
+- `rules_adblock`: Lists of DNS zones, but in the form of AdBlock lists (only the ones concerning domains will be extracted)
+- `rules_hosts`: Lists of DNS zones, but in the form of hosts lists
+
+See the provided examples for syntax.
+
+In each folder:
+
+- `first-party.ext` are the only files considered for the first-party variant of the list
+- `*.cache.ext` are from external sources, and thus might be deleted / overwritten
+- `*.custom.ext` are for sources that you don't want committed
+
+Then, run `./import_rules.sh`.
+
+### Add subdomains
+
+If you plan to resolve DNS records yourself (as the DNS records datasets are not exhaustive),
+the top 1M subdomains provided might not be enough.
+
+You can add them into the `subdomains` folder.
+It follows the same specificities as the rules folder for `*.cache.ext` and `*.custom.ext` files.
+
+#### Add personal sources
+
+Adding your own browsing history will help create a better-suited subdomain list.
 Here's reference command for possible sources:
 - **Pi-hole**: `sqlite3 /etc/pihole-FTL.db "select distinct domain from queries" > /path/to/eulaurarien/subdomains/my-pihole.custom.list`
 - **Firefox**: `cp ~/.mozilla/firefox/.default/places.sqlite temp; sqlite3 temp "select distinct rev_host from moz_places" | rev | sed 's|^\.||' > /path/to/eulaurarien/subdomains/my-firefox.custom.list; rm temp`
-### Collect subdomains from websites
-Just run `collect_subdomain.sh`.
+You can add the website URLs into the `websites` folder.
+It follows the same specificities as the rules folder for `*.cache.ext` and `*.custom.ext` files.
+
+Then, run `collect_subdomain.sh`.
 This is a long step, and might be memory-intensive from time to time.
-This step is optional if you already added personal sources.
-Alternatively, you can get just download the list of subdomains used to generate the official block list here: (put it in the `subdomains` folder).
+> **Note:** For first-party tracking, a list of subdomains issued from the websites in the repository is available here:
-### Extract tracking domains
+### Resolve DNS records
-Make sure your system is configured with a DNS server without limitation.
-Then, run `filter_subdomain.sh`.
-The files you need will be in the folder `dist`.
+Once you've added subdomains, you'll need to resolve them to get their DNS records.
+The program will use a list of public nameservers to do that, but you can add your own in the `nameservers` directory.
-## Contributing
+Then, run `./resolve_subdomains.sh`.
+Note that this is a network-intensive process, not in terms of bandwidth, but in terms of packet count.
-### Adding websites
+> Some VPS providers might detect this as a DDoS attack and cut the network access.
+> Some Wi-Fi connections can be rendered unusable for other uses, some routers might cease to work.
+> Since massdns does not yet support rate limiting, my best bet was a Raspberry Pi with a slow ethernet link (Raspberry Pi < 4).
-Just add the URL to the relevant list: `websites/.list`.
+The DNS records will automatically be imported into the database.
+If you want to re-import the records without re-doing the resolving, just run the last line of the `./resolve_subdomains.sh` script.
-### Adding first-party trackers regex
+### Import DNS records from Rapid7
+
+Just run `./import_rapid7.sh`.
+This will download about 35 GiB of data, but only the matching records will be stored (about a few MiB for the tracking rules).
+Note that the download speed will most likely be limited by the database operation throughput (fast RAM will help).
+
+### Export the lists
+
+For the tracking list, use `./export_lists.sh`; the output will be in the `dist` folder (please change the links before distributing them).
+For other purposes, tinker with the `./export.py` program.
-Just add them to `regexes.py`.
diff --git a/dist/README.md b/dist/README.md
new file mode 100644
index 0000000..31db01f
--- /dev/null
+++ b/dist/README.md
@@ -0,0 +1,74 @@
+# Geoffrey Frogeye's block list of first-party trackers
+
+## What's a first-party tracker?
+
+A tracker is a script put on many websites to gather information about the visitor.
+They can be used for multiple reasons: statistics, risk management, marketing, ad serving…
+In any case, they are a threat to Internet users' privacy and many may want to block them.
+
+Traditionally, trackers are served from a third party.
+For example, `website1.com` and `website2.com` both load their tracking script from `https://trackercompany.com/trackerscript.js`.
+In order to block those, one can simply block the hostname `trackercompany.com`, which is what most ad blockers do.
+
+However, to circumvent this block, tracker companies made the websites using them load trackers from `somestring.website1.com`.
+The latter is a DNS redirection to `website1.trackercompany.com`, pointing directly to an IP address belonging to the tracking company.
+Those are called first-party trackers.
+
+In order to block those trackers, ad blockers would need to block every subdomain pointing to anything under `trackercompany.com` or to their network.
+Unfortunately, most don't support those blocking methods as they are not DNS-aware, e.g. they only see `somestring.website1.com`.
+
+This list is an inventory of every `somestring.website1.com` found, to allow non-DNS-aware ad blockers to still block first-party trackers.
+
+## List variants
+
+### First-party trackers (recommended)
+
+- Hosts file:
+- Raw list:
+
+This list contains every hostname redirecting to [a hand-picked list of first-party trackers](https://git.frogeye.fr/geoffrey/eulaurarien/src/branch/master/rules/first-party.list).
+It should be safe from false positives.
+Don't be afraid of the size of the list, as this is due to the nature of first-party trackers: a single tracker generates at least one hostname per client (typically two).
+
+### First-party only trackers
+
+- Hosts file:
+- Raw list:
+
+This is the same list as above, albeit not containing the hostnames under the tracking companies' domains.
+This reduces the size of the list, but it doesn't prevent third-party tracking.
+Use in conjunction with other block lists.
+
+### Multi-party trackers
+
+- Hosts file:
+- Raw list:
+
+As first-party trackers usually evolve from third-party trackers, this list contains every hostname redirecting to trackers found in existing lists of third-party trackers (see next section).
+Since the latter were not designed with first-party trackers in mind, they are likely to contain false positives.
+On the other hand, they might protect against first-party trackers that we're not aware of / have not yet confirmed.
+
+#### Source of third-party trackers
+
+- [EasyPrivacy](https://easylist.to/easylist/easyprivacy.txt)
+
+(yes there's only one for now. A lot of existing ones cause a lot of false positives)
+
+### Multi-party only trackers
+
+- Hosts file:
+- Raw list:
+
+This is the same list as above, albeit not containing the hostnames under the tracking companies' domains.
+This reduces the size of the list, but it doesn't prevent third-party tracking.
+Use in conjunction with other block lists, especially the ones used to generate this list in the previous section.
+
+## Meta
+
+In case of false positives/negatives, or any other question, contact me the way you like:
+
+The software used to generate this list is available here:
+
+Some of the first-party trackers included in this list have been found by:
+- [Aeris](https://imirhil.fr/)
+- NextDNS and [their blocklist](https://github.com/nextdns/cname-cloaking-blocklist)'s contributors
diff --git a/export_lists.sh b/export_lists.sh
index b9853ed..5120562 100755
--- a/export_lists.sh
+++ b/export_lists.sh
@@ -54,7 +54,7 @@ do
 rules_output=$(./export.py --count $partyness_flags $trackerness_flags)
 function link() { # link partyness, link trackerness
- url="https://hostfiles.frogeye.fr/${partyness}party-${trackerness}-hosts.txt"
+ url="https://hostfiles.frogeye.fr/${1}party-${2}-hosts.txt"
 if [ "$1" = "$partyness" ] && [ "$2" = "$trackerness" ]
 then
 url="$url (this one)"
@@ -66,17 +66,18 @@ do
 echo "# First-party trackers host list"
 echo "# Variant: ${partyness}-party ${trackerness}"
 echo "#"
- echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
+ echo "# About first-party trackers: TODO"
 echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
 echo "#"
 echo "# In case of false positives/negatives, or any other question,"
 echo "# contact me the way you like: https://geoffrey.frogeye.fr"
 echo "#"
- echo "# Latest versions:"
+ echo "# Latest versions and variants:"
 echo "# - First-party trackers : $(link first trackers)"
 echo "# - … excluding redirected: $(link first only-trackers)"
 echo "# - First and third party : $(link multi trackers)"
 echo "# - … excluding redirected: $(link multi only-trackers)"
+ echo '# (variant information: TODO)'
 echo '# (you can remove `-hosts` to get the raw list)'
 echo "#"
 echo "# Generation date: $gen_date"
diff --git a/fetch_resources.sh b/fetch_resources.sh
index cb66ff7..393d8e1 100755
--- a/fetch_resources.sh
+++ b/fetch_resources.sh
@@ -17,18 +17,6 @@ function dl() {
 log "Retrieving rules…"
 rm -f rules*/*.cache.*
 dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt
-# From firebog.net Tracking & Telemetry Lists
-# dl https://v.firebog.net/hosts/Prigent-Ads.txt rules/prigent-ads.cache.list
-# dl https://gitlab.com/quidsup/notrack-blocklists/raw/master/notrack-blocklist.txt rules/notrack-blocklist.cache.list
-# False positives: https://github.com/WaLLy3K/wally3k.github.io/issues/73 -> 69.media.tumblr.com chicdn.net
-dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt
-dl https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt rules_hosts/spy.cache.txt
-# dl https://raw.githubusercontent.com/Kees1958/WS3_annual_most_used_survey_blocklist/master/w3tech_hostfile.txt rules/w3tech.cache.list
-# False positives: agreements.apple.com -> edgekey.net
-# dl https://www.github.developerdan.com/hosts/lists/ads-and-tracking-extended.txt rules_hosts/ads-and-tracking-extended.cache.txt # Lots of false-positives
-# dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/android-tracking.txt rules_hosts/android-tracking.cache.txt
-# dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV.txt rules_hosts/smart-tv.cache.txt
-# dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/AmazonFireTV.txt rules_hosts/amazon-fire-tv.cache.txt
 log "Retrieving TLD list…"
 dl http://data.iana.org/TLD/tlds-alpha-by-domain.txt temp/all_tld.temp.list
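
For quick reference, the list-building workflow described in the README above boils down to running the repository's scripts in roughly the following order. This is only a sketch: every step is optional depending on which sources you actually use, and while all script names come from this patch series, the exact invocation for your setup may differ.

    ./db.py --initialize       # create an empty blocking.p database
    ./fetch_resources.sh       # download third-party rules, TLD list, resolvers, top subdomains
    ./import_rules.sh          # load zone, IP network and ASN rules into the database
    ./resolve_subdomains.sh    # resolve collected subdomains with massdns and import the records
    ./import_rapid7.sh         # and/or import the Rapid7 datasets (~35 GiB download)
    ./run_tests.py             # check the database against the known false positives/negatives
    ./export_lists.sh          # write the host lists into dist/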