Browse Source

Workflow: POO and individual tables per types

Mostly for performances reasons.
First one to implement threading later.
Second one to speed up the dichotomy,
but it doesn't seem that much better so far.
newworkflow_threaded v2.0-beta
Geoffrey Frogeye 2 years ago
parent
commit
57416b6e2c
Signed by: geoffrey GPG Key ID: D8A7ECA00A8CD3DD
  1. 5
      Makefile
  2. 37
      accel.c
  3. 632
      database.py
  4. 50
      database_schema.sql
  5. 30
      export.py
  6. 52
      feed_dns.py
  7. 30
      feed_rules.py
  8. 45
      filter_subdomains.sh
  9. 14
      import_rules.sh
  10. 28
      new_workflow.sh

5
Makefile

@ -1,5 +0,0 @@
libaccel.so: accel.o
clang -shared -Wl,-soname,libaccel.so -o libaccel.so accel.o
accel.o: accel.c
clang -c -fPIC -O3 accel.c -o accel.o

37
accel.c

@ -1,37 +0,0 @@
#include <stdlib.h>
int ip4_flat(char* value, wchar_t* flat)
{
unsigned char value_index = 0;
unsigned char octet_index = 0;
unsigned char octet_value = 0;
char flat_index;
unsigned char value_chara;
do {
value_chara = value[value_index];
if (value_chara >= '0' && value_chara <= '9') {
octet_value *= 10;
octet_value += value_chara - '0';
} else if (value_chara == '.') {
for (flat_index = (octet_index+1)*8-1; flat_index >= octet_index*8; flat_index--) {
flat[flat_index] = '0' + (octet_value & 1);
octet_value >>= 1;
}
octet_index++;
octet_value = 0;
} else if (value_chara == '\0') {
if (octet_index != 3) {
return 1;
}
for (flat_index = 31; flat_index >= 24; flat_index--) {
flat[flat_index] = '0' + (octet_value & 1);
octet_value >>= 1;
}
return 0;
} else {
return 1;
}
value_index++;
} while (1); // This ugly thing save one comparison
return 1;
}

632
database.py

@ -1,256 +1,385 @@
#!/usr/bin/env python3
"""
Utility functions to interact with the database.
"""
import sqlite3
import typing
import time
import os
import logging
import argparse
import typing
import coloredlogs
import ipaddress
import enum
import time
import ctypes
"""
Utility functions to interact with the database.
"""
coloredlogs.install(
level='DEBUG',
fmt='%(asctime)s %(name)s %(levelname)s %(message)s'
)
DbValue = typing.Union[None, int, float, str, bytes]
class Database():
VERSION = 3
PATH = "blocking.db"
def open(self) -> None:
self.conn = sqlite3.connect(self.PATH)
self.cursor = self.conn.cursor()
self.execute("PRAGMA foreign_keys = ON")
# self.conn.create_function("prepare_ip4address", 1,
# Database.prepare_ip4address,
# deterministic=True)
def execute(self, cmd: str, args: typing.Union[
typing.Tuple[DbValue, ...],
typing.Dict[str, DbValue]] = None) -> None:
self.cursor.execute(cmd, args or tuple())
def get_meta(self, key: str) -> typing.Optional[int]:
try:
self.execute("SELECT value FROM meta WHERE key=?", (key,))
except sqlite3.OperationalError:
return None
for ver, in self.cursor:
return ver
return None
# TODO Rule level and source priority
VERSION = 2
PATH = f"blocking.db"
CONN = None
C = None # Cursor
TIME_DICT: typing.Dict[str, float] = dict()
TIME_LAST = time.perf_counter()
TIME_STEP = 'start'
ACCEL = ctypes.cdll.LoadLibrary('./libaccel.so')
ACCEL_IP4_BUF = ctypes.create_unicode_buffer('Z'*32, 32)
def time_step(step: str) -> None:
global TIME_LAST
global TIME_STEP
now = time.perf_counter()
TIME_DICT.setdefault(TIME_STEP, 0.0)
TIME_DICT[TIME_STEP] += now - TIME_LAST
TIME_STEP = step
TIME_LAST = time.perf_counter()
def time_print() -> None:
time_step('postprint')
total = sum(TIME_DICT.values())
for key, secs in sorted(TIME_DICT.items(), key=lambda t: t[1]):
print(f"{key:<20}: {secs/total:7.2%} = {secs:.6f} s")
print(f"{'total':<20}: {1:7.2%} = {total:.6f} s")
class RowType(enum.Enum):
AS = 1
DomainTree = 2
Domain = 3
IPv4Network = 4
IPv6Network = 6
def open_db() -> None:
time_step('open_db')
global CONN
global C
CONN = sqlite3.connect(PATH)
C = CONN.cursor()
# C.execute("PRAGMA foreign_keys = ON");
initialized = False
try:
C.execute("SELECT value FROM meta WHERE key='version'")
version_ex = C.fetchone()
if version_ex:
if version_ex[0] == VERSION:
initialized = True
else:
print(f"Database version {version_ex[0]} found,"
"it will be deleted.")
except sqlite3.OperationalError:
pass
if not initialized:
time_step('init_db')
print(f"Creating database version {VERSION}.")
CONN.close()
os.unlink(PATH)
CONN = sqlite3.connect(PATH)
C = CONN.cursor()
def set_meta(self, key: str, val: int) -> None:
self.execute("INSERT INTO meta VALUES (?, ?) "
"ON CONFLICT (key) DO "
"UPDATE set value=?",
(key, val, val))
def close(self) -> None:
self.enter_step('close_commit')
self.conn.commit()
self.enter_step('close')
self.conn.close()
self.profile()
def initialize(self) -> None:
self.enter_step('initialize')
self.close()
os.unlink(self.PATH)
self.open()
self.log.info("Creating database version %d.", self.VERSION)
with open("database_schema.sql", 'r') as db_schema:
C.executescript(db_schema.read())
C.execute("INSERT INTO meta VALUES ('version', ?)", (VERSION,))
CONN.commit()
time_step('other')
def close_db() -> None:
assert CONN
time_step('close_db_commit')
CONN.commit()
time_step('close_db')
CONN.close()
time_step('other')
time_print()
def refresh() -> None:
assert C
C.execute('UPDATE blocking SET updated = 0')
# TODO PERF Use a meta value instead
RULE_SUBDOMAIN_COMMAND = \
'INSERT INTO blocking (key, type, updated, firstpart, level) ' \
f'VALUES (?, {RowType.DomainTree.value}, 1, ?, 0) ' \
'ON CONFLICT(key)' \
f'DO UPDATE SET source=null, type={RowType.DomainTree.value}, ' \
'updated=1, firstparty=?, level=0'
def feed_rule_subdomains(subdomain: str, first_party: bool = False) -> None:
assert C
subdomain = subdomain[::-1]
C.execute(RULE_SUBDOMAIN_COMMAND,
(subdomain, int(first_party), int(first_party)))
# Since regex type takes precedence over domain type,
# and firstparty takes precedence over multiparty,
# we can afford to replace the whole row without checking
# the row without checking previous values and making sure
# firstparty subdomains are updated last
def ip_get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]:
for char in address.packed:
for i in range(7, -1, -1):
yield (char >> i) & 0b1
self.cursor.executescript(db_schema.read())
self.set_meta('version', self.VERSION)
self.conn.commit()
def __init__(self) -> None:
self.log = logging.getLogger('db')
self.time_last = time.perf_counter()
self.time_step = 'init'
self.time_dict: typing.Dict[str, float] = dict()
self.step_dict: typing.Dict[str, int] = dict()
self.accel_ip4_buf = ctypes.create_unicode_buffer('Z'*32, 32)
self.open()
version = self.get_meta('version')
if version != self.VERSION:
if version is not None:
self.log.warning(
"Outdated database version: %d found, will be rebuilt.",
version)
self.initialize()
updated = self.get_meta('updated')
if updated is None:
self.execute('SELECT max(updated) FROM rules')
data = self.cursor.fetchone()
updated, = data
self.updated = updated or 1
def enter_step(self, name: str) -> None:
now = time.perf_counter()
try:
self.time_dict[self.time_step] += now - self.time_last
self.step_dict[self.time_step] += 1
except KeyError:
self.time_dict[self.time_step] = now - self.time_last
self.step_dict[self.time_step] = 1
self.time_step = name
self.time_last = time.perf_counter()
def profile(self) -> None:
self.enter_step('profile')
total = sum(self.time_dict.values())
for key, secs in sorted(self.time_dict.items(), key=lambda t: t[1]):
times = self.step_dict[key]
self.log.debug(f"{key:<20}: {times:9d} × {secs/times:5.3e} "
f"= {secs:9.2f} s ({secs/total:7.2%}) ")
self.log.debug(f"{'total':<20}: "
f"{total:9.2f} s ({1:7.2%})")
def prepare_hostname(self, hostname: str) -> str:
return hostname[::-1] + '.'
def prepare_zone(self, zone: str) -> str:
return self.prepare_hostname(zone)
@staticmethod
def prepare_ip4address(address: str) -> int:
total = 0
for i, octet in enumerate(address.split('.')):
total += int(octet) << (3-i)*8
return total
# return '{:02x}{:02x}{:02x}{:02x}'.format(
# *[int(c) for c in address.split('.')])
# return base64.b16encode(packed).decode()
# return '{:08b}{:08b}{:08b}{:08b}'.format(
# *[int(c) for c in address.split('.')])
# carg = ctypes.c_wchar_p(address)
# ret = ACCEL.ip4_flat(carg, self.accel_ip4_buf)
# if ret != 0:
# raise ValueError
# return self.accel_ip4_buf.value
# packed = ipaddress.ip_address(address).packed
# return packed
def prepare_ip4network(self, network: str) -> typing.Tuple[int, int]:
# def prepare_ip4network(network: str) -> str:
net = ipaddress.ip_network(network)
mini = self.prepare_ip4address(net.network_address.exploded)
maxi = self.prepare_ip4address(net.broadcast_address.exploded)
# mini = net.network_address.packed
# maxi = net.broadcast_address.packed
return mini, maxi
# return Database.prepare_ip4address(net.network_address.exploded)[:net.prefixlen]
def expire(self) -> None:
self.enter_step('expire')
self.updated += 1
self.set_meta('updated', self.updated)
def update_references(self) -> None:
self.enter_step('update_refs')
self.execute('UPDATE rules AS r SET refs='
'(SELECT count(*) FROM rules '
'WHERE source=r.id)')
def prune(self) -> None:
self.enter_step('prune')
self.execute('DELETE FROM rules WHERE updated<?', (self.updated,))
def export(self, first_party_only: bool = False,
end_chain_only: bool = False) -> typing.Iterable[str]:
command = 'SELECT val FROM rules ' \
'INNER JOIN hostname ON rules.id = hostname.entry'
restrictions: typing.List[str] = list()
if first_party_only:
restrictions.append('rules.first_party = 1')
if end_chain_only:
restrictions.append('rules.refs = 0')
if restrictions:
command += ' WHERE ' + ' AND '.join(restrictions)
self.execute(command)
for val, in self.cursor:
yield val[:-1][::-1]
def get_domain(self, domain: str) -> typing.Iterable[int]:
self.enter_step('get_domain_prepare')
domain_prep = self.prepare_hostname(domain)
self.enter_step('get_domain_select')
self.execute(
'SELECT null, entry FROM hostname '
'WHERE val=:d '
'UNION '
'SELECT * FROM ('
'SELECT val, entry FROM zone '
'WHERE val<=:d '
'ORDER BY val DESC LIMIT 1'
')',
{'d': domain_prep}
)
for val, entry in self.cursor:
self.enter_step('get_domain_confirm')
if not (val is None or domain_prep.startswith(val)):
continue
self.enter_step('get_domain_yield')
yield entry
def get_ip4(self, address: str) -> typing.Iterable[int]:
self.enter_step('get_ip4_prepare')
try:
address_prep = self.prepare_ip4address(address)
except (ValueError, IndexError):
self.log.error("Invalid ip4address: %s", address)
return
self.enter_step('get_ip4_select')
self.execute(
'SELECT entry FROM ip4address '
# 'SELECT null, entry FROM ip4address '
'WHERE val=:a '
'UNION '
# 'SELECT * FROM ('
# 'SELECT val, entry FROM ip4network '
# 'WHERE val<=:a '
# 'AND instr(:a, val) > 0 '
# 'ORDER BY val DESC'
# ')'
'SELECT entry FROM ip4network '
'WHERE :a BETWEEN mini AND maxi ',
{'a': address_prep}
)
for val, entry in self.cursor:
# self.enter_step('get_ip4_confirm')
# if not (val is None or val.startswith(address_prep)):
# # PERF startswith but from the end
# continue
self.enter_step('get_ip4_yield')
yield entry
def _set_generic(self,
table: str,
select_query: str,
insert_query: str,
prep: typing.Dict[str, DbValue],
is_first_party: bool = False,
source: int = None,
) -> None:
# Since this isn't the bulk of the processing,
# here abstraction > performaces
# Fields based on the source
if source is None:
first_party = int(is_first_party)
level = 0
else:
self.enter_step(f'set_{table}_source')
self.execute(
'SELECT first_party, level FROM rules '
'WHERE id=?',
(source,)
)
first_party, level = self.cursor.fetchone()
level += 1
self.enter_step(f'set_{table}_select')
self.execute(select_query, prep)
rules_prep = {
"source": source,
"updated": self.updated,
"first_party": first_party,
"level": level,
}
# If the entry already exists
for entry, in self.cursor: # only one
self.enter_step(f'set_{table}_update')
rules_prep['entry'] = entry
self.execute(
'UPDATE rules SET '
'source=:source, updated=:updated, '
'first_party=:first_party, level=:level '
'WHERE id=:entry AND (updated<:updated OR '
'first_party<:first_party OR level<:level)',
rules_prep
)
# Only update if any of the following:
# - the entry is outdataed
# - the entry was not a first_party but this is
# - this is closer to the original rule
return
# If it does not exist
if source is not None:
self.enter_step(f'set_{table}_incsrc')
self.execute('UPDATE rules SET refs = refs + 1 WHERE id=?',
(source,))
self.enter_step(f'set_{table}_insert')
self.execute(
'INSERT INTO rules '
'(source, updated, first_party, refs, level) '
'VALUES (:source, :updated, :first_party, 0, :level) ',
rules_prep
)
self.execute('SELECT id FROM rules WHERE rowid=?',
(self.cursor.lastrowid,))
for entry, in self.cursor: # only one
prep['entry'] = entry
self.execute(insert_query, prep)
return
assert False
def set_hostname(self, hostname: str,
*args: typing.Any, **kwargs: typing.Any) -> None:
self.enter_step('set_hostname_prepare')
prep: typing.Dict[str, DbValue] = {
'val': self.prepare_hostname(hostname),
}
self._set_generic(
'hostname',
'SELECT entry FROM hostname WHERE val=:val',
'INSERT INTO hostname (val, entry) '
'VALUES (:val, :entry)',
prep,
*args, **kwargs
)
def ip_flat(address: ipaddress.IPv4Address) -> str:
return ''.join(map(str, ip_get_bits(address)))
def set_ip4address(self, ip4address: str,
*args: typing.Any, **kwargs: typing.Any) -> None:
self.enter_step('set_ip4add_prepare')
try:
ip4address_prep = self.prepare_ip4address(ip4address)
except (ValueError, IndexError):
self.log.error("Invalid ip4address: %s", ip4address)
return
prep: typing.Dict[str, DbValue] = {
'val': ip4address_prep,
}
self._set_generic(
'ip4add',
'SELECT entry FROM ip4address WHERE val=:val',
'INSERT INTO ip4address (val, entry) '
'VALUES (:val, :entry)',
prep,
*args, **kwargs
)
def set_zone(self, zone: str,
*args: typing.Any, **kwargs: typing.Any) -> None:
self.enter_step('set_zone_prepare')
prep: typing.Dict[str, DbValue] = {
'val': self.prepare_zone(zone),
}
self._set_generic(
'zone',
'SELECT entry FROM zone WHERE val=:val',
'INSERT INTO zone (val, entry) '
'VALUES (:val, :entry)',
prep,
*args, **kwargs
)
def ip4_flat(address: bytes) -> typing.Optional[str]:
carg = ctypes.c_char_p(address)
ret = ACCEL.ip4_flat(carg, ACCEL_IP4_BUF)
if ret != 0:
return None
return ACCEL_IP4_BUF.value
RULE_IP4NETWORK_COMMAND = \
'INSERT INTO blocking (key, type, updated, firstparty, level) ' \
f'VALUES (?, {RowType.IPv4Network.value}, 1, ?, 0) ' \
'ON CONFLICT(key)' \
f'DO UPDATE SET source=null, type={RowType.IPv4Network.value}, ' \
'updated=1, firstparty=?, level=0'
def feed_rule_ip4network(network: ipaddress.IPv4Network,
first_party: bool = False) -> None:
assert C
flat = ip_flat(network.network_address)[:network.prefixlen]
C.execute(RULE_IP4NETWORK_COMMAND,
(flat, int(first_party), int(first_party)))
FEED_A_COMMAND_FETCH = \
'SELECT key, firstparty FROM blocking ' \
'WHERE key<=? ' \
'AND instr(?, key) > 0 ' \
f'AND type={RowType.IPv4Network.value} ' \
'ORDER BY key DESC '
# UPSERT are not issued often relative to FETCH,
# merging the both might be counterproductive
FEED_A_COMMAND_UPSERT = \
'INSERT INTO blocking (key, source, type, updated, firstparty) ' \
f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \
'ON CONFLICT(key)' \
f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \
'updated=1, firstparty=? ' \
'WHERE updated=0 OR firstparty<?'
def feed_a(name: bytes, value_ip: bytes) -> None:
assert C
assert CONN
time_step('a_flat')
value_dec = ip4_flat(value_ip)
if value_dec is None:
# Malformed IPs
time_step('a_malformed')
return
time_step('a_fetch')
C.execute(FEED_A_COMMAND_FETCH, (value_dec, value_dec))
base = C.fetchone()
time_step('a_fetch_confirm')
name = name[::-1]
for b_key, b_firstparty in C:
time_step('a_upsert')
C.execute(FEED_A_COMMAND_UPSERT,
(name, b_key, b_firstparty, # Insert
b_key, b_firstparty, b_firstparty) # Update
)
time_step('a_fetch_confirm')
time_step('a_end')
FEED_CNAME_COMMAND_FETCH = \
'SELECT key, type, firstparty FROM blocking ' \
'WHERE key<=? ' \
f'AND (type={RowType.DomainTree.value} OR type={RowType.Domain.value}) ' \
'ORDER BY key DESC ' \
'LIMIT 1'
# Optimisations that renders the index unused
# (and thus counterproductive until fixed):
# 'AND instr(?, key) > 0 ' \
# f'WHERE ((type={RowType.DomainTree.value} AND key<=?) OR ' \
# f'(type={RowType.Domain.value} AND key=?)) ' \
# Might be fixable by using multiple SELECT and a JOIN
# In the meantime the confirm is very light so it's ok
FEED_CNAME_COMMAND_UPSERT = \
'INSERT INTO blocking (key, source, type, updated, firstparty) ' \
f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \
'ON CONFLICT(key)' \
f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \
'updated=1, firstparty=? ' \
'WHERE updated=0 OR firstparty<?'
def feed_cname(name: bytes, value: bytes) -> None:
assert C
assert CONN
time_step('cname_decode')
value = value[::-1]
value_dec = value.decode()
time_step('cname_fetch')
C.execute(FEED_CNAME_COMMAND_FETCH, (value_dec,))
time_step('cname_fetch_confirm')
for b_key, b_type, b_firstparty in C:
matching = b_key == value_dec[:len(b_key)] and (
len(value_dec) == len(b_key)
or (
b_type == RowType.DomainTree.value
and value_dec[len(b_key)] == '.'
)
def set_ip4network(self, ip4network: str,
*args: typing.Any, **kwargs: typing.Any) -> None:
self.enter_step('set_ip4net_prepare')
try:
ip4network_prep = self.prepare_ip4network(ip4network)
except (ValueError, IndexError):
self.log.error("Invalid ip4network: %s", ip4network)
return
prep: typing.Dict[str, DbValue] = {
'mini': ip4network_prep[0],
'maxi': ip4network_prep[1],
}
self._set_generic(
'ip4net',
'SELECT entry FROM ip4network WHERE mini=:mini AND maxi=:maxi',
'INSERT INTO ip4network (mini, maxi, entry) '
'VALUES (:mini, :maxi, :entry)',
prep,
*args, **kwargs
)
if not matching:
continue
name = name[::-1]
time_step('cname_upsert')
C.execute(FEED_CNAME_COMMAND_UPSERT,
(name, b_key, b_firstparty, # Insert
b_key, b_firstparty, b_firstparty) # Update
)
time_step('cname_fetch_confirm')
time_step('cname_end')
if __name__ == '__main__':
@ -259,13 +388,28 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Database operations")
parser.add_argument(
'-r', '--refresh', action='store_true',
'-i', '--initialize', action='store_true',
help="Reconstruct the whole database")
parser.add_argument(
'-p', '--prune', action='store_true',
help="Remove old entries from database")
parser.add_argument(
'-e', '--expire', action='store_true',
help="Set the whole database as an old source")
parser.add_argument(
'-r', '--references', action='store_true',
help="Update the reference count")
args = parser.parse_args()
open_db()
DB = Database()
if args.refresh:
refresh()
if args.initialize:
DB.initialize()
if args.prune:
DB.prune()
if args.expire:
DB.expire()
if args.references and not args.prune:
DB.update_references()
close_db()
DB.close()

50
database_schema.sql

@ -1,20 +1,48 @@
-- Remember to increment DB_VERSION
-- in database.py on changes to this file
CREATE TABLE blocking (
key TEXT PRIMARY KEY, -- Contains the reversed domain name or IP in binary form
source TEXT, -- The rule this one is based on
type INTEGER, -- Type of the field: 1: AS, 2: domain tree, 3: domain, 4: IPv4 network, 6: IPv6 network
CREATE TABLE rules (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source INTEGER, -- The rule this one is based on
updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes)
firstparty INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party)
refs INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party) (used for -only lists)
level INTEGER, -- Level of recursion to the original rule (used for source priority)
FOREIGN KEY (source) REFERENCES blocking(key) ON DELETE CASCADE
first_party INTEGER, -- 1: this blocks a first party for sure, 0: maybe
refs INTEGER, -- Number of entries issued from this one
level INTEGER, -- Level of recursion to the root source rule (used for source priority)
FOREIGN KEY (source) REFERENCES rules(id) ON DELETE CASCADE
);
CREATE INDEX "blocking_type_key" ON "blocking" (
"type",
"key" DESC
CREATE TABLE asn (
val INTEGER PRIMARY KEY,
entry INTEGER,
FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE
);
CREATE TABLE hostname (
val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for consistency with zone)
entry INTEGER,
FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE
);
CREATE TABLE zone (
val TEXT PRIMARY KEY, -- rev'd, ends with a dot (for easier matching)
entry INTEGER,
FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE
);
CREATE TABLE ip4address (
val INTEGER PRIMARY KEY,
entry INTEGER,
FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE
);
CREATE TABLE ip4network (
-- val TEXT PRIMARY KEY,
mini INTEGER,
maxi INTEGER,
entry INTEGER,
FOREIGN KEY (entry) REFERENCES rules(id) ON DELETE CASCADE
);
CREATE INDEX ip4network_minmax ON ip4network (mini, maxi);
-- Store various things
CREATE TABLE meta (

30
export.py

@ -0,0 +1,30 @@
#!/usr/bin/env python3
import database
import argparse
import sys
if __name__ == '__main__':
# Parsing arguments
parser = argparse.ArgumentParser(
description="TODO")
parser.add_argument(
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
help="TODO")
parser.add_argument(
'-f', '--first-party', action='store_true',
help="TODO")
parser.add_argument(
'-e', '--end-chain', action='store_true',
help="TODO")
args = parser.parse_args()
DB = database.Database()
for domain in DB.export(first_party_only=args.first_party,
end_chain_only=args.end_chain):
print(domain, file=args.output)
DB.close()

52
feed_dns.py

@ -3,42 +3,56 @@
import database
import argparse
import sys
FUNCTION_MAP = {
b'a': database.feed_a,
b'cname': database.feed_cname,
}
import logging
if __name__ == '__main__':
# Parsing arguments
log = logging.getLogger('feed_dns')
parser = argparse.ArgumentParser(
description="TODO")
parser.add_argument(
'-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer,
# '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer,
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
help="TODO")
args = parser.parse_args()
database.open_db()
DB = database.Database()
try:
database.time_step('iowait')
line: bytes
DB.enter_step('iowait')
# line: bytes
line: str
for line in args.input:
database.time_step('feed_json_parse')
split = line.split(b'"')
name = split[7]
dtype = split[11]
value = split[15]
DB.enter_step('feed_json_parse')
# split = line.split(b'"')
split = line.split('"')
try:
name = split[7]
dtype = split[11]
value = split[15]
except IndexError:
log.error("Invalid JSON: %s", line)
continue
# DB.enter_step('feed_json_assert')
# data = json.loads(line)
# assert dtype == data['type']
# assert name == data['name']
# assert value == data['value']
database.time_step('feed_switch')
FUNCTION_MAP[dtype](name, value)
database.time_step('iowait')
DB.enter_step('feed_switch')
if dtype == 'a':
for rule in DB.get_ip4(value):
DB.set_hostname(name, source=rule)
elif dtype == 'cname':
for rule in DB.get_domain(value):
DB.set_hostname(name, source=rule)
elif dtype == 'ptr':
for rule in DB.get_domain(value):
DB.set_ip4address(name, source=rule)
DB.enter_step('iowait')
except KeyboardInterrupt:
print("Interupted.")
log.warning("Interupted.")
pass
database.close_db()
DB.close()

30
feed_rules.py

@ -13,7 +13,7 @@ if __name__ == '__main__':
description="TODO")
parser.add_argument(
'type',
choices={'subdomains', 'ip4network'},
choices={'zone', 'ip4network'},
help="Type of rule inputed")
parser.add_argument(
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
@ -23,18 +23,16 @@ if __name__ == '__main__':
help="The input only comes from verified first-party sources")
args = parser.parse_args()
database.open_db()
if args.type == 'subdomains':
for rule in args.input:
database.feed_rule_subdomains(
rule.strip(), first_party=args.first_party)
elif args.type == 'ip4network':
for rule in args.input:
network = ipaddress.ip_network(rule.strip())
database.feed_rule_ip4network(
network, first_party=args.first_party)
else:
assert False
database.close_db()
DB = database.Database()
FUNCTION_MAP = {
'zone': DB.set_zone,
'ip4network': DB.set_ip4network,
}
fun = FUNCTION_MAP[args.type]
for rule in args.input:
fun(rule.strip(), is_first_party=args.first_party)
DB.close()

45
filter_subdomains.sh

@ -4,37 +4,14 @@ function log() {
echo -e "\033[33m$@\033[0m"
}
if [ ! -f temp/all_resolved.csv ]
then
echo "Run ./resolve_subdomains.sh first!"
exit 1
fi
log "Updating references…"
./database.py --references
# Gather all the rules for filtering
log "Compiling rules…"
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt
./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_multi.list
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_first.list
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_multi.txt
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_first.txt
log "Filtering first-party tracking domains…"
./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
log "Filtering first-party curated tracking domains…"
./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt
log "Filtering multi-party tracking domains…"
./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list
sort -u temp/multiparty-trackers.list > dist/multiparty-trackers.txt
log "Filtering multi-party curated tracking domains…"
./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list
sort -u temp/multiparty-only-trackers.list > dist/multiparty-only-trackers.txt
log "Exporting lists…"
./export.py --first-party | sort -u > dist/firstparty-trackers.txt
./export.py --first-party --end-chain | sort -u > dist/firstparty-only-trackers.txt
./export.py | sort -u > dist/multiparty-trackers.txt
./export.py --end-chain | sort -u > dist/multiparty-only-trackers.txt
# Format the blocklist so it can be used as a hostlist
function generate_hosts {
@ -61,14 +38,14 @@ function generate_hosts {
echo "#"
echo "# Generation date: $(date -Isec)"
echo "# Generation software: eulaurarien $(git describe --tags)"
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of source websites: TODO"
echo "# Number of source subdomains: TODO"
echo "#"
echo "# Number of known first-party trackers: $(wc -l temp/all_rules_first.list | cut -d' ' -f1)"
echo "# Number of known first-party trackers: TODO"
echo "# Number of first-party subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
echo "#"
echo "# Number of known multi-party trackers: $(wc -l temp/all_rules_multi.list | cut -d' ' -f1)"
echo "# Number of known multi-party trackers: TODO"
echo "# Number of multi-party subdomains: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
echo

14
import_rules.sh

@ -0,0 +1,14 @@
#!/usr/bin/env bash
function log() {
echo -e "\033[33m$@\033[0m"
}
log "Importing rules…"
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party

28
new_workflow.sh

@ -5,18 +5,20 @@ function log() {
}
log "Preparing database…"
./database.py --refresh
./database.py --expire
log "Compiling rules…"
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py subdomains
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py subdomains
cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
# NOTE: Ensure first-party sources are last
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains --first-party
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party
./import_rules.sh
# TODO Fetch 'em
log "Reading PTR records…"
pv ptr.json.gz | gunzip | ./feed_dns.py
log "Reading A records…"
pv a.json.gz | gunzip | ./feed_dns.py
log "Reading CNAME records…"
pv cname.json.gz | gunzip | ./feed_dns.py
log "Pruning old data…"
./database.py --prune
./filter_subdomains.sh
# log "Reading A records…"
# pv a.json.gz | gunzip | ./feed_dns.py
# log "Reading CNAME records…"
# pv cname.json.gz | gunzip | ./feed_dns.py
Loading…
Cancel
Save