Workflow: Base for new one

While I'm automating this you'll need to download the A set from
https://opendata.rapid7.com/sonar.fdns_v2/ to the file a.json.gz.
newworkflow_parseropti
Geoffrey Frogeye 2019-12-09 08:12:48 +01:00
parent 62e6c9005b
commit 7937496882
Signed by: geoffrey
GPG Key ID: D8A7ECA00A8CD3DD
6 changed files with 389 additions and 0 deletions

2
.gitignore vendored
View File

@@ -1,3 +1,5 @@
*.log
*.db
*.db-journal
nameservers
nameservers.head

260
database.py Executable file
View File

@@ -0,0 +1,260 @@
#!/usr/bin/env python3
import sqlite3
import os
import argparse
import typing
import ipaddress
import enum
import time
import pprint
"""
Utility functions to interact with the database.
"""

# Schema version; keep in sync with database_schema.sql
VERSION = 1
# SQLite database file path (was a pointless f-string with no placeholders)
PATH = "blocking.db"
CONN = None  # module-wide sqlite3 connection, set by open_db()
C = None  # Cursor on CONN, set by open_db()

# Cumulative seconds spent in each named step (see time_step/time_print)
TIME_DICT: typing.Dict[str, float] = dict()
TIME_LAST = time.perf_counter()  # perf_counter() at the last step switch
TIME_STEP = 'start'  # name of the step currently being timed
def time_step(step: str) -> None:
    """Close the currently-timed step and start timing *step*.

    The time elapsed since the previous call is accumulated into
    TIME_DICT under the old step's name.
    """
    global TIME_LAST
    global TIME_STEP
    elapsed = time.perf_counter() - TIME_LAST
    TIME_DICT[TIME_STEP] = TIME_DICT.get(TIME_STEP, 0.0) + elapsed
    TIME_STEP = step
    # Re-read the clock so this function's own overhead is not counted
    TIME_LAST = time.perf_counter()
def time_print() -> None:
    """Print a per-step timing summary, least expensive steps first."""
    time_step('postprint')
    total = sum(TIME_DICT.values())
    ranked = sorted(TIME_DICT.items(), key=lambda item: item[1])
    for name, seconds in ranked:
        print(f"{name:<20}: {seconds/total:7.2%} = {seconds:.6f} s")
    print(f"{'total':<20}: {1:7.2%} = {total:.6f} s")
class RowType(enum.Enum):
    """Discriminator stored in blocking.type (see database_schema.sql)."""
    AS = 1
    DomainTree = 2  # a domain and all of its subdomains
    Domain = 3  # one exact domain name
    IPv4Network = 4
    # NOTE(review): 5 is skipped; presumably chosen to match the IP
    # version number — confirm before adding a type 5.
    IPv6Network = 6
def open_db() -> None:
    """Open (and, if needed, re-create) the blocking database.

    Sets the module-wide CONN and C. If the file exists but its stored
    schema version differs from VERSION, the file is deleted and rebuilt
    from database_schema.sql.
    """
    time_step('open_db')
    global CONN
    global C
    CONN = sqlite3.connect(PATH)
    C = CONN.cursor()
    # C.execute("PRAGMA foreign_keys = ON");
    initialized = False
    try:
        C.execute("SELECT value FROM meta WHERE key='version'")
        version_ex = C.fetchone()
        if version_ex:
            if version_ex[0] == VERSION:
                initialized = True
            else:
                # Fix: the two literals used to concatenate without a
                # separating space ("found,it will be deleted.")
                print(f"Database version {version_ex[0]} found, "
                      "it will be deleted.")
    except sqlite3.OperationalError:
        # No meta table yet: treat the file as uninitialized
        pass
    if not initialized:
        time_step('init_db')
        print(f"Creating database version {VERSION}.")
        CONN.close()
        os.unlink(PATH)
        CONN = sqlite3.connect(PATH)
        C = CONN.cursor()
        with open("database_schema.sql", 'r') as db_schema:
            C.executescript(db_schema.read())
        C.execute("INSERT INTO meta VALUES ('version', ?)", (VERSION,))
        CONN.commit()
    time_step('other')
def close_db() -> None:
    """Commit outstanding changes, close the connection, print timings."""
    assert CONN
    time_step('close_db_commit')
    CONN.commit()
    time_step('close_db')
    CONN.close()
    time_step('other')
    time_print()
def refresh() -> None:
    """Mark every row as not yet seen by the current data import."""
    assert C
    C.execute('UPDATE blocking SET updated = 0')
    # TODO PERF Use a meta value instead
# Upsert a reversed-domain subtree rule. Placeholders: key,
# firstparty (insert); firstparty (update).
# NOTE: 'ON CONFLICT(key)' and 'DO' concatenate without a space;
# SQLite still tokenizes ")DO" as two tokens, so the SQL is valid.
RULE_SUBDOMAIN_COMMAND = \
    'INSERT INTO blocking (key, type, updated, firstparty) ' \
    f'VALUES (?, {RowType.DomainTree.value}, 1, ?) ' \
    'ON CONFLICT(key)' \
    f'DO UPDATE SET source=null, type={RowType.DomainTree.value}, ' \
    'updated=1, firstparty=?'
def feed_rule_subdomains(subdomain: str, first_party: bool = False) -> None:
    """Record a subdomain-tree blocking rule.

    The domain is stored reversed so key-prefix comparisons group
    subdomains next to their parent domain.
    """
    assert C
    reversed_key = subdomain[::-1]
    fp = int(first_party)
    C.execute(RULE_SUBDOMAIN_COMMAND, (reversed_key, fp, fp))
    # Since regex type takes precedence over domain type,
    # and firstparty takes precedence over multiparty,
    # we can afford to replace the whole row without checking
    # the row without checking previous values and making sure
    # firstparty subdomains are updated last
def ip_get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]:
    """Yield the 32 bits of *address*, most significant bit first."""
    for byte in address.packed:
        for shift in reversed(range(8)):
            yield (byte >> shift) & 1
def ip_flat(address: ipaddress.IPv4Address) -> str:
    """Return *address* as a 32-character string of '0'/'1' digits."""
    return ''.join(str(bit) for bit in ip_get_bits(address))
def ip4_flat(address: str) -> str:
    """Return a dotted-quad *address* as a 32-character '0'/'1' string.

    Raises ValueError (or IndexError) on malformed input — both are
    caught by feed_a. Unlike the previous version, octets outside
    0-255 and extra octets now raise instead of silently producing a
    key of the wrong length.
    """
    octets = [int(c) for c in address.split('.')]
    if len(octets) != 4 or not all(0 <= o <= 255 for o in octets):
        raise ValueError(f"invalid IPv4 address: {address!r}")
    return '{:08b}{:08b}{:08b}{:08b}'.format(*octets)
# Upsert an IPv4-network rule keyed by its bit-string prefix.
# Placeholders: key, firstparty (insert); firstparty (update).
# NOTE: ")DO" without a space is still valid SQL (')' ends the token).
RULE_IP4NETWORK_COMMAND = \
    'INSERT INTO blocking (key, type, updated, firstparty) ' \
    f'VALUES (?, {RowType.IPv4Network.value}, 1, ?) ' \
    'ON CONFLICT(key)' \
    f'DO UPDATE SET source=null, type={RowType.IPv4Network.value}, ' \
    'updated=1, firstparty=?'
def feed_rule_ip4network(network: ipaddress.IPv4Network,
                         first_party: bool = False) -> None:
    """Record an IPv4-network blocking rule keyed by its bit prefix."""
    assert C
    prefix_bits = ip_flat(network.network_address)[:network.prefixlen]
    fp = int(first_party)
    C.execute(RULE_IP4NETWORK_COMMAND, (prefix_bits, fp, fp))
# Find the rule candidate for a flattened IP: the greatest key <= the
# IP's bit string among updated IPv4Network rows. A network prefix of
# the IP always sorts <= the IP itself; the caller still confirms the
# match with startswith().
FEED_A_COMMAND_FETCH = \
    'SELECT key, firstparty FROM blocking ' \
    'WHERE key<=? ' \
    'AND updated=1 ' \
    f'AND type={RowType.IPv4Network.value} ' \
    'ORDER BY key DESC ' \
    'LIMIT 1'

# Upsert a blocked domain derived from an A record. Placeholders:
# key, source, firstparty (insert); source, firstparty, firstparty
# (update). A row already updated this import is only overwritten by
# a higher-priority (firstparty) source.
FEED_A_COMMAND_UPSERT = \
    'INSERT INTO blocking (key, source, type, updated, firstparty) ' \
    f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \
    'ON CONFLICT(key)' \
    f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \
    'updated=1, firstparty=? ' \
    'WHERE updated=0 OR firstparty<?'
def feed_a(name: str, value_ip: str) -> None:
    """Ingest one DNS A record: block *name* if its IP matches a rule."""
    assert C
    assert CONN
    time_step('a_flat')
    try:
        ip_key = ip4_flat(value_ip)
    except (ValueError, IndexError):
        # Malformed IPs
        return
    time_step('a_fetch')
    C.execute(FEED_A_COMMAND_FETCH, (ip_key,))
    candidate = C.fetchone()
    time_step('a_fetch_confirm')
    if candidate is None:
        return
    rule_key, rule_firstparty = candidate
    # The fetch only guarantees key <= ip_key; confirm it is a prefix
    if not ip_key.startswith(rule_key):
        return
    reversed_name = name[::-1]
    time_step('a_upsert')
    C.execute(FEED_A_COMMAND_UPSERT,
              (reversed_name, rule_key, rule_firstparty,  # Insert
               rule_key, rule_firstparty, rule_firstparty)  # Update
              )
    time_step('other')
# Find the rule candidate for a reversed CNAME target: the greatest
# key <= the target among updated DomainTree/Domain rows. The caller
# confirms the match (prefix + label boundary).
FEED_CNAME_COMMAND_FETCH = \
    'SELECT key, type, firstparty FROM blocking ' \
    'WHERE key<=? ' \
    f'AND (type={RowType.DomainTree.value} OR type={RowType.Domain.value}) ' \
    'AND updated=1 ' \
    'ORDER BY key DESC ' \
    'LIMIT 1'
# f'WHERE ((type={RowType.DomainTree.value} AND key<=?) OR ' \
# f'(type={RowType.Domain.value} AND key=?)) ' \
# This optimisation is counter productive

# Upsert a blocked domain derived from a CNAME record; placeholders
# and precedence semantics identical to FEED_A_COMMAND_UPSERT.
FEED_CNAME_COMMAND_UPSERT = \
    'INSERT INTO blocking (key, source, type, updated, firstparty) ' \
    f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \
    'ON CONFLICT(key)' \
    f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \
    'updated=1, firstparty=? ' \
    'WHERE updated=0 OR firstparty<?'
def feed_cname(name: str, value: str) -> None:
    """Ingest one DNS CNAME record: block *name* if its target matches."""
    assert C
    assert CONN
    target_key = value[::-1]
    time_step('cname_fetch')
    C.execute(FEED_CNAME_COMMAND_FETCH, (target_key,))
    candidate = C.fetchone()
    time_step('cname_fetch_confirm')
    if candidate is None:
        # Should only happen at an extremum of the database
        return
    rule_key, rule_type, rule_firstparty = candidate
    # The rule must be a prefix of the reversed target...
    if not target_key.startswith(rule_key):
        return
    # ...and either match exactly, or (for a DomainTree rule) end on a
    # label boundary so "evil.com" does not match "notevil.com".
    if len(target_key) != len(rule_key):
        if rule_type != RowType.DomainTree.value:
            return
        if target_key[len(rule_key)] != '.':
            return
    reversed_name = name[::-1]
    time_step('cname_upsert')
    C.execute(FEED_CNAME_COMMAND_UPSERT,
              (reversed_name, rule_key, rule_firstparty,  # Insert
               rule_key, rule_firstparty, rule_firstparty)  # Update
              )
    time_step('other')
if __name__ == '__main__':
    # Command-line entry point: maintenance operations on the database.
    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Database operations")
    parser.add_argument(
        '-r', '--refresh', action='store_true',
        help="Set the whole database as an old source")
    args = parser.parse_args()

    # Open (creating/upgrading if needed), optionally mark every row
    # stale, then commit and close.
    open_db()
    if args.refresh:
        refresh()
    close_db()

22
database_schema.sql Normal file
View File

@@ -0,0 +1,22 @@
-- Remember to increment VERSION
-- in database.py on changes to this file
-- (the previous comment said DB_VERSION; the constant is named VERSION)

-- Single table holding both the blocking rules and the entries
-- derived from them (source points at the originating rule).
CREATE TABLE blocking (
    key text PRIMARY KEY, -- Contains the reversed domain name or IP in binary form
    source TEXT, -- The rule this one is based on
    type INTEGER, -- Type of the field: 1: AS, 2: domain tree, 3: domain, 4: IPv4 network, 6: IPv6 network
    updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes)
    firstparty INTEGER, -- Which blocking list this row is issued from (1: first-party, 0: multi-party; database.py stores int(first_party))
    FOREIGN KEY (source) REFERENCES blocking(key) ON DELETE CASCADE
);

-- Covering index for the matching queries in database.py:
-- filter on type and updated, scan key in descending order.
CREATE INDEX "blocking_type_updated_key" ON "blocking" (
    "type",
    "updated",
    "key" DESC
);

-- Store various things (currently only the schema version)
CREATE TABLE meta (
    key text PRIMARY KEY,
    value integer
);

43
feed_dns.py Executable file
View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
import database
import argparse
import sys
# Dispatch table: DNS record type string -> ingestion function
FUNCTION_MAP = {
    'a': database.feed_a,
    'cname': database.feed_cname,
}
if __name__ == '__main__':
    # Entry point: stream DNS records (Rapid7 FDNS JSON lines) into
    # the blocking database.
    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Feed Rapid7 FDNS JSON records "
                    "into the blocking database")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="File with one JSON record per line (default: stdin)")
    args = parser.parse_args()

    database.open_db()
    try:
        database.time_step('iowait')
        for line in args.input:
            database.time_step('feed_json_parse')
            # PERF: split on '"' instead of json.loads; relies on the
            # fixed field order of the export (see the commented-out
            # assertions below for the intended schema).
            split = line.split('"')
            name = split[7]
            dtype = split[11]
            value = split[15]
            # data = json.loads(line)
            # assert dtype == data['type']
            # assert name == data['name']
            # assert value == data['value']
            database.time_step('feed_switch')
            FUNCTION_MAP[dtype](name, value)
            database.time_step('iowait')
    except KeyboardInterrupt:
        # Typo fix: message used to read "Interupted."
        print("Interrupted.")
    database.close_db()

40
feed_rules.py Executable file
View File

@@ -0,0 +1,40 @@
#!/usr/bin/env python3
import database
import argparse
import sys
import ipaddress
if __name__ == '__main__':
    # Entry point: load blocking rules (domain trees or IPv4 networks)
    # into the database.
    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Feed blocking rules into the database")
    parser.add_argument(
        'type',
        choices={'subdomains', 'ip4network'},
        help="Type of rule to input")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        # Fixed duplicated word: "List of domains domains to block"
        help="List of domains to block (with their subdomains)")
    parser.add_argument(
        '-f', '--first-party', action='store_true',
        help="The input only comes from verified first-party sources")
    args = parser.parse_args()

    database.open_db()
    if args.type == 'subdomains':
        for rule in args.input:
            database.feed_rule_subdomains(
                rule.strip(), first_party=args.first_party)
    elif args.type == 'ip4network':
        for rule in args.input:
            network = ipaddress.ip_network(rule.strip())
            database.feed_rule_ip4network(
                network, first_party=args.first_party)
    else:
        # Unreachable: argparse 'choices' already restricts the value
        assert False
    database.close_db()

22
new_workflow.sh Executable file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bash

# Rebuild the blocking database from the local rule lists, then
# (once uncommented) feed the downloaded DNS datasets into it.

# Print a progress message in yellow so it stands out from command output.
function log() {
	echo -e "\033[33m$@\033[0m"
}

log "Preparing database…"
./database.py --refresh

log "Compiling rules…"
# Each pipeline strips comments/blank lines before feeding the rules in.
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py subdomains
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py subdomains
cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
# NOTE: Ensure first-party sources are last
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains --first-party
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party

# log "Reading A records…"
# pv a.json.gz | gunzip | ./feed_dns.py
# log "Reading CNAME records…"
# pv cname.json.gz | gunzip | ./feed_dns.py