Workflow: Base for new one
While I'm automating this, you'll need to download the A set from https://opendata.rapid7.com/sonar.fdns_v2/ to the file a.json.gz.
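The download step itself is not part of this commit. As a purely illustrative sketch (the snapshot filename below is a placeholder, not a real file name; pick an actual one from the index page):

import urllib.request

# Hypothetical snapshot name: replace with an actual file listed at
# https://opendata.rapid7.com/sonar.fdns_v2/
SNAPSHOT = "<snapshot>-fdns_a.json.gz"
URL = "https://opendata.rapid7.com/sonar.fdns_v2/" + SNAPSHOT

# Saves the compressed dataset as a.json.gz next to the workflow scripts.
urllib.request.urlretrieve(URL, "a.json.gz")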
6 changed files with 389 additions and 0 deletions

- .gitignore (+2)
- database.py (+260)
- database_schema.sql (+22)
- feed_dns.py (+43)
- feed_rules.py (+40)
- new_workflow.sh (+22)
@@ -1,3 +1,5 @@ .gitignore
*.log
*.db
*.db-journal
nameservers
nameservers.head
@@ -0,0 +1,260 @@ database.py
#!/usr/bin/env python3

import sqlite3
import os
import argparse
import typing
import ipaddress
import enum
import time
import pprint

"""
Utility functions to interact with the database.
"""

VERSION = 1
PATH = "blocking.db"
CONN = None
C = None  # Cursor
TIME_DICT: typing.Dict[str, float] = dict()
TIME_LAST = time.perf_counter()
TIME_STEP = 'start'


def time_step(step: str) -> None:
    global TIME_LAST
    global TIME_STEP
    now = time.perf_counter()
    TIME_DICT.setdefault(TIME_STEP, 0.0)
    TIME_DICT[TIME_STEP] += now - TIME_LAST
    TIME_STEP = step
    TIME_LAST = time.perf_counter()


def time_print() -> None:
    time_step('postprint')
    total = sum(TIME_DICT.values())
    for key, secs in sorted(TIME_DICT.items(), key=lambda t: t[1]):
        print(f"{key:<20}: {secs/total:7.2%} = {secs:.6f} s")
    print(f"{'total':<20}: {1:7.2%} = {total:.6f} s")


class RowType(enum.Enum):
    AS = 1
    DomainTree = 2
    Domain = 3
    IPv4Network = 4
    IPv6Network = 6


def open_db() -> None:
    time_step('open_db')
    global CONN
    global C
    CONN = sqlite3.connect(PATH)
    C = CONN.cursor()
    # C.execute("PRAGMA foreign_keys = ON");
    initialized = False
    try:
        C.execute("SELECT value FROM meta WHERE key='version'")
        version_ex = C.fetchone()
        if version_ex:
            if version_ex[0] == VERSION:
                initialized = True
            else:
                print(f"Database version {version_ex[0]} found, "
                      "it will be deleted.")
    except sqlite3.OperationalError:
        pass
    if not initialized:
        time_step('init_db')
        print(f"Creating database version {VERSION}.")
        CONN.close()
        os.unlink(PATH)
        CONN = sqlite3.connect(PATH)
        C = CONN.cursor()
        with open("database_schema.sql", 'r') as db_schema:
            C.executescript(db_schema.read())
        C.execute("INSERT INTO meta VALUES ('version', ?)", (VERSION,))
        CONN.commit()
    time_step('other')


def close_db() -> None:
    assert CONN
    time_step('close_db_commit')
    CONN.commit()
    time_step('close_db')
    CONN.close()
    time_step('other')
    time_print()


def refresh() -> None:
    assert C
    C.execute('UPDATE blocking SET updated = 0')
    # TODO PERF Use a meta value instead


RULE_SUBDOMAIN_COMMAND = \
    'INSERT INTO blocking (key, type, updated, firstparty) ' \
    f'VALUES (?, {RowType.DomainTree.value}, 1, ?) ' \
    'ON CONFLICT(key) ' \
    f'DO UPDATE SET source=null, type={RowType.DomainTree.value}, ' \
    'updated=1, firstparty=?'


def feed_rule_subdomains(subdomain: str, first_party: bool = False) -> None:
    assert C
    subdomain = subdomain[::-1]
    C.execute(RULE_SUBDOMAIN_COMMAND,
              (subdomain, int(first_party), int(first_party)))
    # Since regex type takes precedence over domain type,
    # and firstparty takes precedence over multiparty,
    # we can afford to replace the whole row without checking
    # previous values, as long as firstparty subdomains are updated last.


def ip_get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]:
    for char in address.packed:
        for i in range(7, -1, -1):
            yield (char >> i) & 0b1


def ip_flat(address: ipaddress.IPv4Address) -> str:
    return ''.join(map(str, ip_get_bits(address)))


def ip4_flat(address: str) -> str:
    return '{:08b}{:08b}{:08b}{:08b}'.format(
        *[int(c) for c in address.split('.')])


RULE_IP4NETWORK_COMMAND = \
    'INSERT INTO blocking (key, type, updated, firstparty) ' \
    f'VALUES (?, {RowType.IPv4Network.value}, 1, ?) ' \
    'ON CONFLICT(key) ' \
    f'DO UPDATE SET source=null, type={RowType.IPv4Network.value}, ' \
    'updated=1, firstparty=?'


def feed_rule_ip4network(network: ipaddress.IPv4Network,
                         first_party: bool = False) -> None:
    assert C
    flat = ip_flat(network.network_address)[:network.prefixlen]
    C.execute(RULE_IP4NETWORK_COMMAND,
              (flat, int(first_party), int(first_party)))


FEED_A_COMMAND_FETCH = \
    'SELECT key, firstparty FROM blocking ' \
    'WHERE key<=? ' \
    'AND updated=1 ' \
    f'AND type={RowType.IPv4Network.value} ' \
    'ORDER BY key DESC ' \
    'LIMIT 1'

FEED_A_COMMAND_UPSERT = \
    'INSERT INTO blocking (key, source, type, updated, firstparty) ' \
    f'VALUES (?, ?, {RowType.Domain.value}, 1, ?) ' \
    'ON CONFLICT(key) ' \
    f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \
    'updated=1, firstparty=? ' \
    'WHERE updated=0 OR firstparty<?'


def feed_a(name: str, value_ip: str) -> None:
    assert C
    assert CONN
    time_step('a_flat')
    try:
        value = ip4_flat(value_ip)
    except (ValueError, IndexError):
        # Malformed IPs
        return
    time_step('a_fetch')
    C.execute(FEED_A_COMMAND_FETCH, (value,))
    base = C.fetchone()
    time_step('a_fetch_confirm')
    if not base:
        return
    b_key, b_firstparty = base
    if not value.startswith(b_key):
        return
    name = name[::-1]
    time_step('a_upsert')
    C.execute(FEED_A_COMMAND_UPSERT,
              (name, b_key, b_firstparty,  # Insert
               b_key, b_firstparty, b_firstparty)  # Update
              )
    time_step('other')


FEED_CNAME_COMMAND_FETCH = \
    'SELECT key, type, firstparty FROM blocking ' \
    'WHERE key<=? ' \
    f'AND (type={RowType.DomainTree.value} OR type={RowType.Domain.value}) ' \
    'AND updated=1 ' \
    'ORDER BY key DESC ' \
    'LIMIT 1'
# f'WHERE ((type={RowType.DomainTree.value} AND key<=?) OR ' \
# f'(type={RowType.Domain.value} AND key=?)) ' \
# This optimisation is counter-productive

FEED_CNAME_COMMAND_UPSERT = \
    'INSERT INTO blocking (key, source, type, updated, firstparty) ' \
    f'VALUES (?, ?, {RowType.Domain.value}, 1, ?) ' \
    'ON CONFLICT(key) ' \
    f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \
    'updated=1, firstparty=? ' \
    'WHERE updated=0 OR firstparty<?'


def feed_cname(name: str, value: str) -> None:
    assert C
    assert CONN
    value = value[::-1]
    time_step('cname_fetch')
    C.execute(FEED_CNAME_COMMAND_FETCH, (value,))
    base = C.fetchone()
    time_step('cname_fetch_confirm')
    if not base:
        # Should only happen at an extremum of the database
        return
    b_key, b_type, b_firstparty = base
    matching = b_key == value[:len(b_key)] and (
        len(value) == len(b_key)
        or (
            b_type == RowType.DomainTree.value
            and value[len(b_key)] == '.'
        )
    )
    if not matching:
        return
    name = name[::-1]
    time_step('cname_upsert')
    C.execute(FEED_CNAME_COMMAND_UPSERT,
              (name, b_key, b_firstparty,  # Insert
               b_key, b_firstparty, b_firstparty)  # Update
              )
    time_step('other')


if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Database operations")
    parser.add_argument(
        '-r', '--refresh', action='store_true',
        help="Set the whole database as an old source")
    args = parser.parse_args()

    open_db()

    if args.refresh:
        refresh()

    close_db()
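The fetch queries above rely on a detail that is easy to miss: every key is stored so that a matching rule is a string prefix of the looked-up value, which is what lets a single "key<=? ORDER BY key DESC LIMIT 1" query return the best candidate. The following standalone sketch (not part of the commit) illustrates the idea for IPv4 networks; the final startswith() check mirrors the confirmation step in feed_a().

import ipaddress

def ip_flat(address: ipaddress.IPv4Address) -> str:
    # Same bit-string encoding as database.ip_flat(): 32 characters, MSB first.
    return format(int(address), '032b')

# Network rules are stored as their first prefixlen bits only,
# exactly like feed_rule_ip4network() does.
rules = {
    ip_flat(ipaddress.IPv4Address('93.184.216.0'))[:24],  # 93.184.216.0/24
    ip_flat(ipaddress.IPv4Address('10.0.0.0'))[:8],       # 10.0.0.0/8
}

# An A record value is flattened the same way before lookup.
value = ip_flat(ipaddress.IPv4Address('93.184.216.34'))

# Emulates: SELECT key ... WHERE key<=? ORDER BY key DESC LIMIT 1
best = max((key for key in rules if key <= value), default=None)

# The best candidate can still be an unrelated neighbour in key order,
# hence the startswith() confirmation in feed_a().
assert best is not None and value.startswith(best)
print(f"{value} is covered by rule {best}")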
@@ -0,0 +1,22 @@ database_schema.sql
-- Remember to increment VERSION
-- in database.py on changes to this file

CREATE TABLE blocking (
    key TEXT PRIMARY KEY, -- Contains the reversed domain name or the IP in binary form
    source TEXT, -- The rule this one is based on
    type INTEGER, -- Type of the field: 1: AS, 2: domain tree, 3: domain, 4: IPv4 network, 6: IPv6 network
    updated INTEGER, -- Whether the row was updated during the last data import (0: no, 1: yes)
    firstparty INTEGER, -- Which blocking list this row comes from (0: multi-party, 1: first-party)
    FOREIGN KEY (source) REFERENCES blocking(key) ON DELETE CASCADE
);
CREATE INDEX "blocking_type_updated_key" ON "blocking" (
    "type",
    "updated",
    "key" DESC
);

-- Store various things
CREATE TABLE meta (
    key TEXT PRIMARY KEY,
    value INTEGER
);
@@ -0,0 +1,43 @@ feed_dns.py
#!/usr/bin/env python3

import database
import argparse
import sys

FUNCTION_MAP = {
    'a': database.feed_a,
    'cname': database.feed_cname,
}

if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="TODO")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="TODO")
    args = parser.parse_args()

    database.open_db()

    try:
        database.time_step('iowait')
        for line in args.input:
            database.time_step('feed_json_parse')
            split = line.split('"')
            name = split[7]
            dtype = split[11]
            value = split[15]
            # data = json.loads(line)
            # assert dtype == data['type']
            # assert name == data['name']
            # assert value == data['value']
            database.time_step('feed_switch')
            FUNCTION_MAP[dtype](name, value)
            database.time_step('iowait')
    except KeyboardInterrupt:
        print("Interrupted.")
        pass

    database.close_db()
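feed_dns.py deliberately avoids json.loads() on every record and instead slices each line on double quotes, which assumes the FDNS JSON lines always carry their fields in the same order (timestamp, name, type, value). A standalone check of that assumption, using a made-up sample line:

import json

# Illustrative line only; it mirrors the field order that indices 7, 11 and 15 assume.
line = ('{"timestamp":"1600000000","name":"www.example.com",'
        '"type":"cname","value":"example.com"}')

split = line.split('"')
name, dtype, value = split[7], split[11], split[15]

# The slower but order-independent alternative kept commented out in feed_dns.py.
data = json.loads(line)
assert (name, dtype, value) == (data['name'], data['type'], data['value'])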
@@ -0,0 +1,40 @@ feed_rules.py
#!/usr/bin/env python3

import database
import argparse
import sys
import ipaddress


if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="TODO")
    parser.add_argument(
        'type',
        choices={'subdomains', 'ip4network'},
        help="Type of rule to input")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="List of domains to block (with their subdomains)")
    parser.add_argument(
        '-f', '--first-party', action='store_true',
        help="The input only comes from verified first-party sources")
    args = parser.parse_args()

    database.open_db()

    if args.type == 'subdomains':
        for rule in args.input:
            database.feed_rule_subdomains(
                rule.strip(), first_party=args.first_party)
    elif args.type == 'ip4network':
        for rule in args.input:
            network = ipaddress.ip_network(rule.strip())
            database.feed_rule_ip4network(
                network, first_party=args.first_party)
    else:
        assert False

    database.close_db()
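Subdomain rules go through database.feed_rule_subdomains(), which reverses the domain before storing it. The point of the reversal is that a rule and every domain under it then share a common prefix, so domain trees can reuse the same ordered-key lookup as IP networks. A standalone illustration (not part of the commit) of the matching check done in feed_cname():

# Stored rule and queried name, both reversed as in the database.
rule = 'example.com'[::-1]          # 'moc.elpmaxe'
query = 'www.example.com'[::-1]     # 'moc.elpmaxe.www'

# Prefix match plus either an exact match or a '.' right after the rule,
# so e.g. 'badexample.com' is not caught by a rule for 'example.com'.
matching = query.startswith(rule) and (
    len(query) == len(rule) or query[len(rule)] == '.'
)
assert matching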
@@ -0,0 +1,22 @@ new_workflow.sh
#!/usr/bin/env bash

function log() {
    echo -e "\033[33m$@\033[0m"
}

log "Preparing database…"
./database.py --refresh

log "Compiling rules…"
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py subdomains
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py subdomains
cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
# NOTE: Ensure first-party sources are last
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains --first-party
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party

# log "Reading A records…"
# pv a.json.gz | gunzip | ./feed_dns.py
# log "Reading CNAME records…"
# pv cname.json.gz | gunzip | ./feed_dns.py