Workflow: POO and individual tables per types

Mostly for performance reasons.
First one to implement threading later.
Second one to speed up the dichotomy,
but it doesn't seem that much better so far.
This commit is contained in:
Geoffrey Frogeye 2019-12-13 00:11:21 +01:00
parent 1484733a90
commit 57416b6e2c
Signed by: geoffrey
GPG key ID: D8A7ECA00A8CD3DD
10 changed files with 525 additions and 360 deletions

View file

@ -1,5 +0,0 @@
# Build the native acceleration library (libaccel.so) used by database.py.
CC = clang

libaccel.so: accel.o
	$(CC) -shared -Wl,-soname,libaccel.so -o libaccel.so accel.o

accel.o: accel.c
	$(CC) -c -fPIC -O3 accel.c -o accel.o

37
accel.c
View file

@ -1,37 +0,0 @@
#include <stdlib.h>
/* Convert a dotted-quad IPv4 string into its 32-bit binary text form.
 *
 * value: NUL-terminated candidate address, e.g. "192.168.0.1".
 * flat:  output buffer of at least 32 wchar_t; receives the address as
 *        '0'/'1' characters, most significant bit first. No terminator
 *        is written.
 *
 * Returns 0 on success, 1 if value is not a well-formed IPv4 address
 * (wrong number of octets, empty octet, octet > 255, or stray character).
 */
int ip4_flat(char* value, wchar_t* flat)
{
    unsigned char value_index = 0;
    unsigned char octet_index = 0;
    unsigned short octet_value = 0; /* wider than a char so >255 is detectable */
    unsigned char digit_count = 0;  /* digits seen in the current octet */
    char flat_index;
    unsigned char value_chara;
    do {
        value_chara = value[value_index];
        if (value_chara >= '0' && value_chara <= '9') {
            octet_value = octet_value * 10 + (value_chara - '0');
            digit_count++;
            /* Reject overlong octets ("0000") and values above 255 */
            if (digit_count > 3 || octet_value > 255) {
                return 1;
            }
        } else if (value_chara == '.') {
            /* Reject a fifth octet ("1.2.3.4.5", which would otherwise
             * write past flat[31]) and empty octets ("1..2.3") */
            if (octet_index >= 3 || digit_count == 0) {
                return 1;
            }
            for (flat_index = (octet_index+1)*8-1; flat_index >= octet_index*8; flat_index--) {
                flat[flat_index] = '0' + (octet_value & 1);
                octet_value >>= 1;
            }
            octet_index++;
            octet_value = 0;
            digit_count = 0;
        } else if (value_chara == '\0') {
            if (octet_index != 3 || digit_count == 0) {
                return 1;
            }
            for (flat_index = 31; flat_index >= 24; flat_index--) {
                flat[flat_index] = '0' + (octet_value & 1);
                octet_value >>= 1;
            }
            return 0;
        } else {
            return 1;
        }
        value_index++;
    } while (1); /* only exits through the '\0' or error branches above */
    return 1; /* unreachable; kept to satisfy compilers */
}

View file

@ -1,256 +1,385 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import sqlite3
import os
import argparse
import typing
import ipaddress
import enum
import time
import ctypes
""" """
Utility functions to interact with the database. Utility functions to interact with the database.
""" """
# TODO Rule level and source priority import sqlite3
import typing
import time
import os
import logging
import argparse
import coloredlogs
import ipaddress
import ctypes
VERSION = 2 coloredlogs.install(
PATH = f"blocking.db" level='DEBUG',
CONN = None fmt='%(asctime)s %(name)s %(levelname)s %(message)s'
C = None # Cursor )
TIME_DICT: typing.Dict[str, float] = dict()
TIME_LAST = time.perf_counter() DbValue = typing.Union[None, int, float, str, bytes]
TIME_STEP = 'start'
ACCEL = ctypes.cdll.LoadLibrary('./libaccel.so')
ACCEL_IP4_BUF = ctypes.create_unicode_buffer('Z'*32, 32)
def time_step(step: str) -> None: class Database():
global TIME_LAST VERSION = 3
global TIME_STEP PATH = "blocking.db"
now = time.perf_counter()
TIME_DICT.setdefault(TIME_STEP, 0.0)
TIME_DICT[TIME_STEP] += now - TIME_LAST
TIME_STEP = step
TIME_LAST = time.perf_counter()
def open(self) -> None:
self.conn = sqlite3.connect(self.PATH)
self.cursor = self.conn.cursor()
self.execute("PRAGMA foreign_keys = ON")
# self.conn.create_function("prepare_ip4address", 1,
# Database.prepare_ip4address,
# deterministic=True)
def time_print() -> None: def execute(self, cmd: str, args: typing.Union[
time_step('postprint') typing.Tuple[DbValue, ...],
total = sum(TIME_DICT.values()) typing.Dict[str, DbValue]] = None) -> None:
for key, secs in sorted(TIME_DICT.items(), key=lambda t: t[1]): self.cursor.execute(cmd, args or tuple())
print(f"{key:<20}: {secs/total:7.2%} = {secs:.6f} s")
print(f"{'total':<20}: {1:7.2%} = {total:.6f} s")
def get_meta(self, key: str) -> typing.Optional[int]:
class RowType(enum.Enum): try:
AS = 1 self.execute("SELECT value FROM meta WHERE key=?", (key,))
DomainTree = 2 except sqlite3.OperationalError:
Domain = 3 return None
IPv4Network = 4 for ver, in self.cursor:
IPv6Network = 6 return ver
def open_db() -> None:
time_step('open_db')
global CONN
global C
CONN = sqlite3.connect(PATH)
C = CONN.cursor()
# C.execute("PRAGMA foreign_keys = ON");
initialized = False
try:
C.execute("SELECT value FROM meta WHERE key='version'")
version_ex = C.fetchone()
if version_ex:
if version_ex[0] == VERSION:
initialized = True
else:
print(f"Database version {version_ex[0]} found,"
"it will be deleted.")
except sqlite3.OperationalError:
pass
if not initialized:
time_step('init_db')
print(f"Creating database version {VERSION}.")
CONN.close()
os.unlink(PATH)
CONN = sqlite3.connect(PATH)
C = CONN.cursor()
with open("database_schema.sql", 'r') as db_schema:
C.executescript(db_schema.read())
C.execute("INSERT INTO meta VALUES ('version', ?)", (VERSION,))
CONN.commit()
time_step('other')
def close_db() -> None:
assert CONN
time_step('close_db_commit')
CONN.commit()
time_step('close_db')
CONN.close()
time_step('other')
time_print()
def refresh() -> None:
assert C
C.execute('UPDATE blocking SET updated = 0')
# TODO PERF Use a meta value instead
RULE_SUBDOMAIN_COMMAND = \
'INSERT INTO blocking (key, type, updated, firstpart, level) ' \
f'VALUES (?, {RowType.DomainTree.value}, 1, ?, 0) ' \
'ON CONFLICT(key)' \
f'DO UPDATE SET source=null, type={RowType.DomainTree.value}, ' \
'updated=1, firstparty=?, level=0'
def feed_rule_subdomains(subdomain: str, first_party: bool = False) -> None:
assert C
subdomain = subdomain[::-1]
C.execute(RULE_SUBDOMAIN_COMMAND,
(subdomain, int(first_party), int(first_party)))
# Since regex type takes precedence over domain type,
# and firstparty takes precedence over multiparty,
# we can afford to replace the whole row without checking
# the row without checking previous values and making sure
# firstparty subdomains are updated last
def ip_get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]:
for char in address.packed:
for i in range(7, -1, -1):
yield (char >> i) & 0b1
def ip_flat(address: ipaddress.IPv4Address) -> str:
return ''.join(map(str, ip_get_bits(address)))
def ip4_flat(address: bytes) -> typing.Optional[str]:
carg = ctypes.c_char_p(address)
ret = ACCEL.ip4_flat(carg, ACCEL_IP4_BUF)
if ret != 0:
return None return None
return ACCEL_IP4_BUF.value
def set_meta(self, key: str, val: int) -> None:
self.execute("INSERT INTO meta VALUES (?, ?) "
"ON CONFLICT (key) DO "
"UPDATE set value=?",
(key, val, val))
RULE_IP4NETWORK_COMMAND = \ def close(self) -> None:
'INSERT INTO blocking (key, type, updated, firstparty, level) ' \ self.enter_step('close_commit')
f'VALUES (?, {RowType.IPv4Network.value}, 1, ?, 0) ' \ self.conn.commit()
'ON CONFLICT(key)' \ self.enter_step('close')
f'DO UPDATE SET source=null, type={RowType.IPv4Network.value}, ' \ self.conn.close()
'updated=1, firstparty=?, level=0' self.profile()
def initialize(self) -> None:
self.enter_step('initialize')
self.close()
os.unlink(self.PATH)
self.open()
self.log.info("Creating database version %d.", self.VERSION)
with open("database_schema.sql", 'r') as db_schema:
self.cursor.executescript(db_schema.read())
self.set_meta('version', self.VERSION)
self.conn.commit()
def feed_rule_ip4network(network: ipaddress.IPv4Network, def __init__(self) -> None:
first_party: bool = False) -> None: self.log = logging.getLogger('db')
assert C self.time_last = time.perf_counter()
flat = ip_flat(network.network_address)[:network.prefixlen] self.time_step = 'init'
C.execute(RULE_IP4NETWORK_COMMAND, self.time_dict: typing.Dict[str, float] = dict()
(flat, int(first_party), int(first_party))) self.step_dict: typing.Dict[str, int] = dict()
self.accel_ip4_buf = ctypes.create_unicode_buffer('Z'*32, 32)
self.open()
version = self.get_meta('version')
if version != self.VERSION:
if version is not None:
self.log.warning(
"Outdated database version: %d found, will be rebuilt.",
version)
self.initialize()
FEED_A_COMMAND_FETCH = \ updated = self.get_meta('updated')
'SELECT key, firstparty FROM blocking ' \ if updated is None:
'WHERE key<=? ' \ self.execute('SELECT max(updated) FROM rules')
'AND instr(?, key) > 0 ' \ data = self.cursor.fetchone()
f'AND type={RowType.IPv4Network.value} ' \ updated, = data
'ORDER BY key DESC ' self.updated = updated or 1
# UPSERT are not issued often relative to FETCH, def enter_step(self, name: str) -> None:
# merging the both might be counterproductive now = time.perf_counter()
try:
self.time_dict[self.time_step] += now - self.time_last
self.step_dict[self.time_step] += 1
except KeyError:
self.time_dict[self.time_step] = now - self.time_last
self.step_dict[self.time_step] = 1
self.time_step = name
self.time_last = time.perf_counter()
FEED_A_COMMAND_UPSERT = \ def profile(self) -> None:
'INSERT INTO blocking (key, source, type, updated, firstparty) ' \ self.enter_step('profile')
f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \ total = sum(self.time_dict.values())
'ON CONFLICT(key)' \ for key, secs in sorted(self.time_dict.items(), key=lambda t: t[1]):
f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \ times = self.step_dict[key]
'updated=1, firstparty=? ' \ self.log.debug(f"{key:<20}: {times:9d} × {secs/times:5.3e} "
'WHERE updated=0 OR firstparty<?' f"= {secs:9.2f} s ({secs/total:7.2%}) ")
self.log.debug(f"{'total':<20}: "
f"{total:9.2f} s ({1:7.2%})")
def prepare_hostname(self, hostname: str) -> str:
return hostname[::-1] + '.'
def feed_a(name: bytes, value_ip: bytes) -> None: def prepare_zone(self, zone: str) -> str:
assert C return self.prepare_hostname(zone)
assert CONN
time_step('a_flat')
value_dec = ip4_flat(value_ip)
if value_dec is None:
# Malformed IPs
time_step('a_malformed')
return
time_step('a_fetch')
C.execute(FEED_A_COMMAND_FETCH, (value_dec, value_dec))
base = C.fetchone()
time_step('a_fetch_confirm')
name = name[::-1]
for b_key, b_firstparty in C:
time_step('a_upsert')
C.execute(FEED_A_COMMAND_UPSERT,
(name, b_key, b_firstparty, # Insert
b_key, b_firstparty, b_firstparty) # Update
)
time_step('a_fetch_confirm')
time_step('a_end')
@staticmethod
def prepare_ip4address(address: str) -> int:
total = 0
for i, octet in enumerate(address.split('.')):
total += int(octet) << (3-i)*8
return total
# return '{:02x}{:02x}{:02x}{:02x}'.format(
# *[int(c) for c in address.split('.')])
# return base64.b16encode(packed).decode()
# return '{:08b}{:08b}{:08b}{:08b}'.format(
# *[int(c) for c in address.split('.')])
# carg = ctypes.c_wchar_p(address)
# ret = ACCEL.ip4_flat(carg, self.accel_ip4_buf)
# if ret != 0:
# raise ValueError
# return self.accel_ip4_buf.value
# packed = ipaddress.ip_address(address).packed
# return packed
FEED_CNAME_COMMAND_FETCH = \ def prepare_ip4network(self, network: str) -> typing.Tuple[int, int]:
'SELECT key, type, firstparty FROM blocking ' \ # def prepare_ip4network(network: str) -> str:
'WHERE key<=? ' \ net = ipaddress.ip_network(network)
f'AND (type={RowType.DomainTree.value} OR type={RowType.Domain.value}) ' \ mini = self.prepare_ip4address(net.network_address.exploded)
'ORDER BY key DESC ' \ maxi = self.prepare_ip4address(net.broadcast_address.exploded)
'LIMIT 1' # mini = net.network_address.packed
# Optimisations that renders the index unused # maxi = net.broadcast_address.packed
# (and thus counterproductive until fixed): return mini, maxi
# return Database.prepare_ip4address(net.network_address.exploded)[:net.prefixlen]
# 'AND instr(?, key) > 0 ' \ def expire(self) -> None:
self.enter_step('expire')
self.updated += 1
self.set_meta('updated', self.updated)
# f'WHERE ((type={RowType.DomainTree.value} AND key<=?) OR ' \ def update_references(self) -> None:
# f'(type={RowType.Domain.value} AND key=?)) ' \ self.enter_step('update_refs')
self.execute('UPDATE rules AS r SET refs='
'(SELECT count(*) FROM rules '
'WHERE source=r.id)')
# Might be fixable by using multiple SELECT and a JOIN def prune(self) -> None:
# In the meantime the confirm is very light so it's ok self.enter_step('prune')
self.execute('DELETE FROM rules WHERE updated<?', (self.updated,))
FEED_CNAME_COMMAND_UPSERT = \ def export(self, first_party_only: bool = False,
'INSERT INTO blocking (key, source, type, updated, firstparty) ' \ end_chain_only: bool = False) -> typing.Iterable[str]:
f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \ command = 'SELECT val FROM rules ' \
'ON CONFLICT(key)' \ 'INNER JOIN hostname ON rules.id = hostname.entry'
f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \ restrictions: typing.List[str] = list()
'updated=1, firstparty=? ' \ if first_party_only:
'WHERE updated=0 OR firstparty<?' restrictions.append('rules.first_party = 1')
if end_chain_only:
restrictions.append('rules.refs = 0')
if restrictions:
command += ' WHERE ' + ' AND '.join(restrictions)
self.execute(command)
for val, in self.cursor:
yield val[:-1][::-1]
def get_domain(self, domain: str) -> typing.Iterable[int]:
def feed_cname(name: bytes, value: bytes) -> None: self.enter_step('get_domain_prepare')
assert C domain_prep = self.prepare_hostname(domain)
assert CONN self.enter_step('get_domain_select')
time_step('cname_decode') self.execute(
value = value[::-1] 'SELECT null, entry FROM hostname '
value_dec = value.decode() 'WHERE val=:d '
time_step('cname_fetch') 'UNION '
C.execute(FEED_CNAME_COMMAND_FETCH, (value_dec,)) 'SELECT * FROM ('
time_step('cname_fetch_confirm') 'SELECT val, entry FROM zone '
for b_key, b_type, b_firstparty in C: 'WHERE val<=:d '
matching = b_key == value_dec[:len(b_key)] and ( 'ORDER BY val DESC LIMIT 1'
len(value_dec) == len(b_key) ')',
or ( {'d': domain_prep}
b_type == RowType.DomainTree.value )
and value_dec[len(b_key)] == '.' for val, entry in self.cursor:
) self.enter_step('get_domain_confirm')
if not (val is None or domain_prep.startswith(val)):
continue
self.enter_step('get_domain_yield')
yield entry
def get_ip4(self, address: str) -> typing.Iterable[int]:
self.enter_step('get_ip4_prepare')
try:
address_prep = self.prepare_ip4address(address)
except (ValueError, IndexError):
self.log.error("Invalid ip4address: %s", address)
return
self.enter_step('get_ip4_select')
self.execute(
'SELECT entry FROM ip4address '
# 'SELECT null, entry FROM ip4address '
'WHERE val=:a '
'UNION '
# 'SELECT * FROM ('
# 'SELECT val, entry FROM ip4network '
# 'WHERE val<=:a '
# 'AND instr(:a, val) > 0 '
# 'ORDER BY val DESC'
# ')'
'SELECT entry FROM ip4network '
'WHERE :a BETWEEN mini AND maxi ',
{'a': address_prep}
)
for val, entry in self.cursor:
# self.enter_step('get_ip4_confirm')
# if not (val is None or val.startswith(address_prep)):
# # PERF startswith but from the end
# continue
self.enter_step('get_ip4_yield')
yield entry
def _set_generic(self,
table: str,
select_query: str,
insert_query: str,
prep: typing.Dict[str, DbValue],
is_first_party: bool = False,
source: int = None,
) -> None:
# Since this isn't the bulk of the processing,
# here abstraction > performaces
# Fields based on the source
if source is None:
first_party = int(is_first_party)
level = 0
else:
self.enter_step(f'set_{table}_source')
self.execute(
'SELECT first_party, level FROM rules '
'WHERE id=?',
(source,)
)
first_party, level = self.cursor.fetchone()
level += 1
self.enter_step(f'set_{table}_select')
self.execute(select_query, prep)
rules_prep = {
"source": source,
"updated": self.updated,
"first_party": first_party,
"level": level,
}
# If the entry already exists
for entry, in self.cursor: # only one
self.enter_step(f'set_{table}_update')
rules_prep['entry'] = entry
self.execute(
'UPDATE rules SET '
'source=:source, updated=:updated, '
'first_party=:first_party, level=:level '
'WHERE id=:entry AND (updated<:updated OR '
'first_party<:first_party OR level<:level)',
rules_prep
)
# Only update if any of the following:
# - the entry is outdataed
# - the entry was not a first_party but this is
# - this is closer to the original rule
return
# If it does not exist
if source is not None:
self.enter_step(f'set_{table}_incsrc')
self.execute('UPDATE rules SET refs = refs + 1 WHERE id=?',
(source,))
self.enter_step(f'set_{table}_insert')
self.execute(
'INSERT INTO rules '
'(source, updated, first_party, refs, level) '
'VALUES (:source, :updated, :first_party, 0, :level) ',
rules_prep
)
self.execute('SELECT id FROM rules WHERE rowid=?',
(self.cursor.lastrowid,))
for entry, in self.cursor: # only one
prep['entry'] = entry
self.execute(insert_query, prep)
return
assert False
def set_hostname(self, hostname: str,
*args: typing.Any, **kwargs: typing.Any) -> None:
self.enter_step('set_hostname_prepare')
prep: typing.Dict[str, DbValue] = {
'val': self.prepare_hostname(hostname),
}
self._set_generic(
'hostname',
'SELECT entry FROM hostname WHERE val=:val',
'INSERT INTO hostname (val, entry) '
'VALUES (:val, :entry)',
prep,
*args, **kwargs
)
def set_ip4address(self, ip4address: str,
*args: typing.Any, **kwargs: typing.Any) -> None:
self.enter_step('set_ip4add_prepare')
try:
ip4address_prep = self.prepare_ip4address(ip4address)
except (ValueError, IndexError):
self.log.error("Invalid ip4address: %s", ip4address)
return
prep: typing.Dict[str, DbValue] = {
'val': ip4address_prep,
}
self._set_generic(
'ip4add',
'SELECT entry FROM ip4address WHERE val=:val',
'INSERT INTO ip4address (val, entry) '
'VALUES (:val, :entry)',
prep,
*args, **kwargs
)
def set_zone(self, zone: str,
*args: typing.Any, **kwargs: typing.Any) -> None:
self.enter_step('set_zone_prepare')
prep: typing.Dict[str, DbValue] = {
'val': self.prepare_zone(zone),
}
self._set_generic(
'zone',
'SELECT entry FROM zone WHERE val=:val',
'INSERT INTO zone (val, entry) '
'VALUES (:val, :entry)',
prep,
*args, **kwargs
)
def set_ip4network(self, ip4network: str,
*args: typing.Any, **kwargs: typing.Any) -> None:
self.enter_step('set_ip4net_prepare')
try:
ip4network_prep = self.prepare_ip4network(ip4network)
except (ValueError, IndexError):
self.log.error("Invalid ip4network: %s", ip4network)
return
prep: typing.Dict[str, DbValue] = {
'mini': ip4network_prep[0],
'maxi': ip4network_prep[1],
}
self._set_generic(
'ip4net',
'SELECT entry FROM ip4network WHERE mini=:mini AND maxi=:maxi',
'INSERT INTO ip4network (mini, maxi, entry) '
'VALUES (:mini, :maxi, :entry)',
prep,
*args, **kwargs
) )
if not matching:
continue
name = name[::-1]
time_step('cname_upsert')
C.execute(FEED_CNAME_COMMAND_UPSERT,
(name, b_key, b_firstparty, # Insert
b_key, b_firstparty, b_firstparty) # Update
)
time_step('cname_fetch_confirm')
time_step('cname_end')
if __name__ == '__main__': if __name__ == '__main__':
@ -259,13 +388,28 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Database operations") description="Database operations")
parser.add_argument( parser.add_argument(
'-r', '--refresh', action='store_true', '-i', '--initialize', action='store_true',
help="Reconstruct the whole database")
parser.add_argument(
'-p', '--prune', action='store_true',
help="Remove old entries from database")
parser.add_argument(
'-e', '--expire', action='store_true',
help="Set the whole database as an old source") help="Set the whole database as an old source")
parser.add_argument(
'-r', '--references', action='store_true',
help="Update the reference count")
args = parser.parse_args() args = parser.parse_args()
open_db() DB = Database()
if args.refresh: if args.initialize:
refresh() DB.initialize()
if args.prune:
DB.prune()
if args.expire:
DB.expire()
if args.references and not args.prune:
DB.update_references()
close_db() DB.close()

View file

@ -1,21 +1,49 @@
-- Remember to increment DB_VERSION -- Remember to increment DB_VERSION
-- in database.py on changes to this file -- in database.py on changes to this file
CREATE TABLE blocking ( CREATE TABLE rules (
key TEXT PRIMARY KEY, -- Contains the reversed domain name or IP in binary form id INTEGER PRIMARY KEY AUTOINCREMENT,
source TEXT, -- The rule this one is based on source INTEGER, -- The rule this one is based on
type INTEGER, -- Type of the field: 1: AS, 2: domain tree, 3: domain, 4: IPv4 network, 6: IPv6 network
updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes) updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes)
firstparty INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) first_party INTEGER, -- 1: this blocks a first party for sure, 0: maybe
refs INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) (used for -only lists) refs INTEGER, -- Number of entries issued from this one
level INTEGER, -- Level of recursion to the original rule (used for source priority) level INTEGER, -- Level of recursion to the root source rule (used for source priority)
FOREIGN KEY (source) REFERENCES blocking(key) ON DELETE CASCADE FOREIGN KEY (source) REFERENCES rules(id) ON DELETE CASCADE
); );
CREATE INDEX "blocking_type_key" ON "blocking" (
"type", CREATE TABLE asn (
"key" DESC val INTEGER PRIMARY KEY,
entry INTEGER,
FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE
); );
CREATE TABLE hostname (
val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for consistency with zone)
entry INTEGER,
FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE
);
CREATE TABLE zone (
val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for easier matching)
entry INTEGER,
FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE
);
CREATE TABLE ip4address (
val INTEGER PRIMARY KEY,
entry INTEGER,
FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE
);
CREATE TABLE ip4network (
-- val TEXT PRIMARY KEY,
mini INTEGER,
maxi INTEGER,
entry INTEGER,
FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE
);
CREATE INDEX ip4network_minmax ON ip4network (mini, maxi);
-- Store various things -- Store various things
CREATE TABLE meta ( CREATE TABLE meta (
key TEXT PRIMARY KEY, key TEXT PRIMARY KEY,

30
export.py Executable file
View file

@ -0,0 +1,30 @@
#!/usr/bin/env python3
import database
import argparse
import sys
if __name__ == '__main__':

    # Parse command-line arguments.
    # The TODO placeholder help strings are replaced with real descriptions
    # grounded in Database.export()'s parameters.
    parser = argparse.ArgumentParser(
        description="Export the blocked hostnames from the database")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="File to write the hostnames to (default: stdout)")
    parser.add_argument(
        '-f', '--first-party', action='store_true',
        help="Only export hostnames issued from first-party rules")
    parser.add_argument(
        '-e', '--end-chain', action='store_true',
        help="Only export hostnames at the end of a chain "
             "(rules with no references)")
    args = parser.parse_args()

    DB = database.Database()

    # Database.export yields one hostname per matching rule.
    for domain in DB.export(first_party_only=args.first_party,
                            end_chain_only=args.end_chain):
        print(domain, file=args.output)

    DB.close()

View file

@ -3,42 +3,56 @@
import database import database
import argparse import argparse
import sys import sys
import logging
FUNCTION_MAP = {
b'a': database.feed_a,
b'cname': database.feed_cname,
}
if __name__ == '__main__': if __name__ == '__main__':
# Parsing arguments # Parsing arguments
log = logging.getLogger('feed_dns')
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="TODO") description="TODO")
parser.add_argument( parser.add_argument(
'-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer, # '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer,
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
help="TODO") help="TODO")
args = parser.parse_args() args = parser.parse_args()
database.open_db() DB = database.Database()
try: try:
database.time_step('iowait') DB.enter_step('iowait')
line: bytes # line: bytes
line: str
for line in args.input: for line in args.input:
database.time_step('feed_json_parse') DB.enter_step('feed_json_parse')
split = line.split(b'"') # split = line.split(b'"')
name = split[7] split = line.split('"')
dtype = split[11] try:
value = split[15] name = split[7]
dtype = split[11]
value = split[15]
except IndexError:
log.error("Invalid JSON: %s", line)
continue
# DB.enter_step('feed_json_assert')
# data = json.loads(line) # data = json.loads(line)
# assert dtype == data['type'] # assert dtype == data['type']
# assert name == data['name'] # assert name == data['name']
# assert value == data['value'] # assert value == data['value']
database.time_step('feed_switch')
FUNCTION_MAP[dtype](name, value) DB.enter_step('feed_switch')
database.time_step('iowait') if dtype == 'a':
for rule in DB.get_ip4(value):
DB.set_hostname(name, source=rule)
elif dtype == 'cname':
for rule in DB.get_domain(value):
DB.set_hostname(name, source=rule)
elif dtype == 'ptr':
for rule in DB.get_domain(value):
DB.set_ip4address(name, source=rule)
DB.enter_step('iowait')
except KeyboardInterrupt: except KeyboardInterrupt:
print("Interupted.") log.warning("Interupted.")
pass pass
database.close_db() DB.close()

View file

@ -13,7 +13,7 @@ if __name__ == '__main__':
description="TODO") description="TODO")
parser.add_argument( parser.add_argument(
'type', 'type',
choices={'subdomains', 'ip4network'}, choices={'zone', 'ip4network'},
help="Type of rule inputed") help="Type of rule inputed")
parser.add_argument( parser.add_argument(
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin, '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
@ -23,18 +23,16 @@ if __name__ == '__main__':
help="The input only comes from verified first-party sources") help="The input only comes from verified first-party sources")
args = parser.parse_args() args = parser.parse_args()
database.open_db() DB = database.Database()
if args.type == 'subdomains': FUNCTION_MAP = {
for rule in args.input: 'zone': DB.set_zone,
database.feed_rule_subdomains( 'ip4network': DB.set_ip4network,
rule.strip(), first_party=args.first_party) }
elif args.type == 'ip4network':
for rule in args.input:
network = ipaddress.ip_network(rule.strip())
database.feed_rule_ip4network(
network, first_party=args.first_party)
else:
assert False
database.close_db() fun = FUNCTION_MAP[args.type]
for rule in args.input:
fun(rule.strip(), is_first_party=args.first_party)
DB.close()

View file

@ -4,37 +4,14 @@ function log() {
echo -e "\033[33m$@\033[0m" echo -e "\033[33m$@\033[0m"
} }
if [ ! -f temp/all_resolved.csv ] log "Updating references…"
then ./database.py --references
echo "Run ./resolve_subdomains.sh first!"
exit 1
fi
# Gather all the rules for filtering log "Exporting lists…"
log "Compiling rules…" ./export.py --first-party | sort -u > dist/firstparty-trackers.txt
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt ./export.py --first-party --end-chain | sort -u > dist/firstparty-only-trackers.txt
./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list ./export.py | sort -u > dist/multiparty-trackers.txt
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list ./export.py --end-chain | sort -u > dist/multiparty-only-trackers.txt
cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_multi.list
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_first.list
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_multi.txt
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_first.txt
log "Filtering first-party tracking domains…"
./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
log "Filtering first-party curated tracking domains…"
./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt
log "Filtering multi-party tracking domains…"
./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list
sort -u temp/multiparty-trackers.list > dist/multiparty-trackers.txt
log "Filtering multi-party curated tracking domains…"
./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list
sort -u temp/multiparty-only-trackers.list > dist/multiparty-only-trackers.txt
# Format the blocklist so it can be used as a hostlist # Format the blocklist so it can be used as a hostlist
function generate_hosts { function generate_hosts {
@ -61,14 +38,14 @@ function generate_hosts {
echo "#" echo "#"
echo "# Generation date: $(date -Isec)" echo "# Generation date: $(date -Isec)"
echo "# Generation software: eulaurarien $(git describe --tags)" echo "# Generation software: eulaurarien $(git describe --tags)"
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" echo "# Number of source websites: TODO"
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" echo "# Number of source subdomains: TODO"
echo "#" echo "#"
echo "# Number of known first-party trackers: $(wc -l temp/all_rules_first.list | cut -d' ' -f1)" echo "# Number of known first-party trackers: TODO"
echo "# Number of first-party subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" echo "# Number of first-party subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
echo "#" echo "#"
echo "# Number of known multi-party trackers: $(wc -l temp/all_rules_multi.list | cut -d' ' -f1)" echo "# Number of known multi-party trackers: TODO"
echo "# Number of multi-party subdomains: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" echo "# Number of multi-party subdomains: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
echo echo

14
import_rules.sh Executable file
View file

@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Import all blocking rules (adblock lists, hosts files, domain lists and
# IPv4 networks) into the database via feed_rules.py.

# Print a message in yellow so progress stands out from command output.
function log() {
	echo -e "\033[33m$@\033[0m"
}

log "Importing rules…"
# Adblock-format rules: drop '!' comments and '[Adblock' section headers,
# convert to a plain domain list, then feed as zones.
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
# Hosts-format rules: drop comments/blank lines and keep the hostname column.
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
# Plain domain lists.
cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone
# IPv4 network rules.
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
# NOTE: first-party sources are fed last so they are updated last
# (first-party takes precedence over multi-party).
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party

View file

@ -5,18 +5,20 @@ function log() {
} }
log "Preparing database…" log "Preparing database…"
./database.py --refresh ./database.py --expire
log "Compiling rules…" ./import_rules.sh
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py subdomains
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py subdomains # TODO Fetch 'em
cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains log "Reading PTR records…"
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network pv ptr.json.gz | gunzip | ./feed_dns.py
# NOTE: Ensure first-party sources are last log "Reading A records…"
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains --first-party pv a.json.gz | gunzip | ./feed_dns.py
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party log "Reading CNAME records…"
pv cname.json.gz | gunzip | ./feed_dns.py
log "Pruning old data…"
./database.py --prune
./filter_subdomains.sh
# log "Reading A records…"
# pv a.json.gz | gunzip | ./feed_dns.py
# log "Reading CNAME records…"
# pv cname.json.gz | gunzip | ./feed_dns.py