Workflow: Base for new one
While I'm automating this you'll need to download the A set from https://opendata.rapid7.com/sonar.fdns_v2/ to the file a.json.gz.
This commit is contained in:
parent
62e6c9005b
commit
7937496882
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,3 +1,5 @@
|
|||
*.log
|
||||
*.db
|
||||
*.db-journal
|
||||
nameservers
|
||||
nameservers.head
|
||||
|
|
260
database.py
Executable file
260
database.py
Executable file
|
@ -0,0 +1,260 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import sqlite3
|
||||
import os
|
||||
import argparse
|
||||
import typing
|
||||
import ipaddress
|
||||
import enum
|
||||
import time
|
||||
import pprint
|
||||
|
||||
"""
|
||||
Utility functions to interact with the database.
|
||||
"""
|
||||
|
||||
VERSION = 1
|
||||
PATH = f"blocking.db"
|
||||
CONN = None
|
||||
C = None # Cursor
|
||||
TIME_DICT: typing.Dict[str, float] = dict()
|
||||
TIME_LAST = time.perf_counter()
|
||||
TIME_STEP = 'start'
|
||||
|
||||
|
||||
def time_step(step: str) -> None:
|
||||
global TIME_LAST
|
||||
global TIME_STEP
|
||||
now = time.perf_counter()
|
||||
TIME_DICT.setdefault(TIME_STEP, 0.0)
|
||||
TIME_DICT[TIME_STEP] += now - TIME_LAST
|
||||
TIME_STEP = step
|
||||
TIME_LAST = time.perf_counter()
|
||||
|
||||
|
||||
def time_print() -> None:
|
||||
time_step('postprint')
|
||||
total = sum(TIME_DICT.values())
|
||||
for key, secs in sorted(TIME_DICT.items(), key=lambda t: t[1]):
|
||||
print(f"{key:<20}: {secs/total:7.2%} = {secs:.6f} s")
|
||||
print(f"{'total':<20}: {1:7.2%} = {total:.6f} s")
|
||||
|
||||
|
||||
class RowType(enum.Enum):
|
||||
AS = 1
|
||||
DomainTree = 2
|
||||
Domain = 3
|
||||
IPv4Network = 4
|
||||
IPv6Network = 6
|
||||
|
||||
|
||||
def open_db() -> None:
|
||||
time_step('open_db')
|
||||
global CONN
|
||||
global C
|
||||
CONN = sqlite3.connect(PATH)
|
||||
C = CONN.cursor()
|
||||
# C.execute("PRAGMA foreign_keys = ON");
|
||||
initialized = False
|
||||
try:
|
||||
C.execute("SELECT value FROM meta WHERE key='version'")
|
||||
version_ex = C.fetchone()
|
||||
if version_ex:
|
||||
if version_ex[0] == VERSION:
|
||||
initialized = True
|
||||
else:
|
||||
print(f"Database version {version_ex[0]} found,"
|
||||
"it will be deleted.")
|
||||
except sqlite3.OperationalError:
|
||||
pass
|
||||
if not initialized:
|
||||
time_step('init_db')
|
||||
print(f"Creating database version {VERSION}.")
|
||||
CONN.close()
|
||||
os.unlink(PATH)
|
||||
CONN = sqlite3.connect(PATH)
|
||||
C = CONN.cursor()
|
||||
with open("database_schema.sql", 'r') as db_schema:
|
||||
C.executescript(db_schema.read())
|
||||
C.execute("INSERT INTO meta VALUES ('version', ?)", (VERSION,))
|
||||
CONN.commit()
|
||||
time_step('other')
|
||||
|
||||
|
||||
def close_db() -> None:
|
||||
assert CONN
|
||||
time_step('close_db_commit')
|
||||
CONN.commit()
|
||||
time_step('close_db')
|
||||
CONN.close()
|
||||
time_step('other')
|
||||
time_print()
|
||||
|
||||
|
||||
def refresh() -> None:
|
||||
assert C
|
||||
C.execute('UPDATE blocking SET updated = 0')
|
||||
# TODO PERF Use a meta value instead
|
||||
|
||||
|
||||
RULE_SUBDOMAIN_COMMAND = \
|
||||
'INSERT INTO blocking (key, type, updated, firstparty) ' \
|
||||
f'VALUES (?, {RowType.DomainTree.value}, 1, ?) ' \
|
||||
'ON CONFLICT(key)' \
|
||||
f'DO UPDATE SET source=null, type={RowType.DomainTree.value}, ' \
|
||||
'updated=1, firstparty=?'
|
||||
|
||||
|
||||
def feed_rule_subdomains(subdomain: str, first_party: bool = False) -> None:
|
||||
assert C
|
||||
subdomain = subdomain[::-1]
|
||||
C.execute(RULE_SUBDOMAIN_COMMAND,
|
||||
(subdomain, int(first_party), int(first_party)))
|
||||
# Since regex type takes precedence over domain type,
|
||||
# and firstparty takes precedence over multiparty,
|
||||
# we can afford to replace the whole row without checking
|
||||
# the row without checking previous values and making sure
|
||||
# firstparty subdomains are updated last
|
||||
|
||||
|
||||
def ip_get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]:
|
||||
for char in address.packed:
|
||||
for i in range(7, -1, -1):
|
||||
yield (char >> i) & 0b1
|
||||
|
||||
|
||||
def ip_flat(address: ipaddress.IPv4Address) -> str:
|
||||
return ''.join(map(str, ip_get_bits(address)))
|
||||
|
||||
|
||||
def ip4_flat(address: str) -> str:
|
||||
return '{:08b}{:08b}{:08b}{:08b}'.format(
|
||||
*[int(c) for c in address.split('.')])
|
||||
|
||||
|
||||
RULE_IP4NETWORK_COMMAND = \
|
||||
'INSERT INTO blocking (key, type, updated, firstparty) ' \
|
||||
f'VALUES (?, {RowType.IPv4Network.value}, 1, ?) ' \
|
||||
'ON CONFLICT(key)' \
|
||||
f'DO UPDATE SET source=null, type={RowType.IPv4Network.value}, ' \
|
||||
'updated=1, firstparty=?'
|
||||
|
||||
|
||||
def feed_rule_ip4network(network: ipaddress.IPv4Network,
|
||||
first_party: bool = False) -> None:
|
||||
assert C
|
||||
flat = ip_flat(network.network_address)[:network.prefixlen]
|
||||
C.execute(RULE_IP4NETWORK_COMMAND,
|
||||
(flat, int(first_party), int(first_party)))
|
||||
|
||||
|
||||
FEED_A_COMMAND_FETCH = \
|
||||
'SELECT key, firstparty FROM blocking ' \
|
||||
'WHERE key<=? ' \
|
||||
'AND updated=1 ' \
|
||||
f'AND type={RowType.IPv4Network.value} ' \
|
||||
'ORDER BY key DESC ' \
|
||||
'LIMIT 1'
|
||||
|
||||
FEED_A_COMMAND_UPSERT = \
|
||||
'INSERT INTO blocking (key, source, type, updated, firstparty) ' \
|
||||
f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \
|
||||
'ON CONFLICT(key)' \
|
||||
f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \
|
||||
'updated=1, firstparty=? ' \
|
||||
'WHERE updated=0 OR firstparty<?'
|
||||
|
||||
|
||||
def feed_a(name: str, value_ip: str) -> None:
|
||||
assert C
|
||||
assert CONN
|
||||
time_step('a_flat')
|
||||
try:
|
||||
value = ip4_flat(value_ip)
|
||||
except (ValueError, IndexError):
|
||||
# Malformed IPs
|
||||
return
|
||||
time_step('a_fetch')
|
||||
C.execute(FEED_A_COMMAND_FETCH, (value,))
|
||||
base = C.fetchone()
|
||||
time_step('a_fetch_confirm')
|
||||
if not base:
|
||||
return
|
||||
b_key, b_firstparty = base
|
||||
if not value.startswith(b_key):
|
||||
return
|
||||
name = name[::-1]
|
||||
time_step('a_upsert')
|
||||
C.execute(FEED_A_COMMAND_UPSERT,
|
||||
(name, b_key, b_firstparty, # Insert
|
||||
b_key, b_firstparty, b_firstparty) # Update
|
||||
)
|
||||
time_step('other')
|
||||
|
||||
|
||||
FEED_CNAME_COMMAND_FETCH = \
|
||||
'SELECT key, type, firstparty FROM blocking ' \
|
||||
'WHERE key<=? ' \
|
||||
f'AND (type={RowType.DomainTree.value} OR type={RowType.Domain.value}) ' \
|
||||
'AND updated=1 ' \
|
||||
'ORDER BY key DESC ' \
|
||||
'LIMIT 1'
|
||||
# f'WHERE ((type={RowType.DomainTree.value} AND key<=?) OR ' \
|
||||
# f'(type={RowType.Domain.value} AND key=?)) ' \
|
||||
# This optimisation is counter productive
|
||||
|
||||
FEED_CNAME_COMMAND_UPSERT = \
|
||||
'INSERT INTO blocking (key, source, type, updated, firstparty) ' \
|
||||
f'VALUES (?, ?, {RowType.Domain.value}, 1, ?)' \
|
||||
'ON CONFLICT(key)' \
|
||||
f'DO UPDATE SET source=?, type={RowType.Domain.value}, ' \
|
||||
'updated=1, firstparty=? ' \
|
||||
'WHERE updated=0 OR firstparty<?'
|
||||
|
||||
|
||||
def feed_cname(name: str, value: str) -> None:
|
||||
assert C
|
||||
assert CONN
|
||||
value = value[::-1]
|
||||
time_step('cname_fetch')
|
||||
C.execute(FEED_CNAME_COMMAND_FETCH, (value,))
|
||||
base = C.fetchone()
|
||||
time_step('cname_fetch_confirm')
|
||||
if not base:
|
||||
# Should only happen at an extremum of the database
|
||||
return
|
||||
b_key, b_type, b_firstparty = base
|
||||
matching = b_key == value[:len(b_key)] and (
|
||||
len(value) == len(b_key)
|
||||
or (
|
||||
b_type == RowType.DomainTree.value
|
||||
and value[len(b_key)] == '.'
|
||||
)
|
||||
)
|
||||
if not matching:
|
||||
return
|
||||
name = name[::-1]
|
||||
time_step('cname_upsert')
|
||||
C.execute(FEED_CNAME_COMMAND_UPSERT,
|
||||
(name, b_key, b_firstparty, # Insert
|
||||
b_key, b_firstparty, b_firstparty) # Update
|
||||
)
|
||||
time_step('other')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
# Parsing arguments
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Database operations")
|
||||
parser.add_argument(
|
||||
'-r', '--refresh', action='store_true',
|
||||
help="Set the whole database as an old source")
|
||||
args = parser.parse_args()
|
||||
|
||||
open_db()
|
||||
|
||||
if args.refresh:
|
||||
refresh()
|
||||
|
||||
close_db()
|
22
database_schema.sql
Normal file
22
database_schema.sql
Normal file
|
@ -0,0 +1,22 @@
|
|||
-- Remember to increment DB_VERSION
|
||||
-- in database.py on changes to this file
|
||||
|
||||
CREATE TABLE blocking (
|
||||
key text PRIMARY KEY, -- Contains the reversed domain name or IP in binary form
|
||||
source TEXT, -- The rule this one is based on
|
||||
type INTEGER, -- Type of the field: 1: AS, 2: domain tree, 3: domain, 4: IPv4 network, 6: IPv6 network
|
||||
updated INTEGER, -- If the row was updated during last data import (0: No, 1: Yes)
|
||||
firstparty INTEGER, -- Which blocking list this row is issued from (0: first-party, 1: multi-party)
|
||||
FOREIGN KEY (source) REFERENCES blocking(key) ON DELETE CASCADE
|
||||
);
|
||||
CREATE INDEX "blocking_type_updated_key" ON "blocking" (
|
||||
"type",
|
||||
"updated",
|
||||
"key" DESC
|
||||
);
|
||||
|
||||
-- Store various things
|
||||
CREATE TABLE meta (
|
||||
key text PRIMARY KEY,
|
||||
value integer
|
||||
);
|
43
feed_dns.py
Executable file
43
feed_dns.py
Executable file
|
@ -0,0 +1,43 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import database
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
FUNCTION_MAP = {
|
||||
'a': database.feed_a,
|
||||
'cname': database.feed_cname,
|
||||
}
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
# Parsing arguments
|
||||
parser = argparse.ArgumentParser(
|
||||
description="TODO")
|
||||
parser.add_argument(
|
||||
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
||||
help="TODO")
|
||||
args = parser.parse_args()
|
||||
|
||||
database.open_db()
|
||||
|
||||
try:
|
||||
database.time_step('iowait')
|
||||
for line in args.input:
|
||||
database.time_step('feed_json_parse')
|
||||
split = line.split('"')
|
||||
name = split[7]
|
||||
dtype = split[11]
|
||||
value = split[15]
|
||||
# data = json.loads(line)
|
||||
# assert dtype == data['type']
|
||||
# assert name == data['name']
|
||||
# assert value == data['value']
|
||||
database.time_step('feed_switch')
|
||||
FUNCTION_MAP[dtype](name, value)
|
||||
database.time_step('iowait')
|
||||
except KeyboardInterrupt:
|
||||
print("Interupted.")
|
||||
pass
|
||||
|
||||
database.close_db()
|
40
feed_rules.py
Executable file
40
feed_rules.py
Executable file
|
@ -0,0 +1,40 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import database
|
||||
import argparse
|
||||
import sys
|
||||
import ipaddress
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
# Parsing arguments
|
||||
parser = argparse.ArgumentParser(
|
||||
description="TODO")
|
||||
parser.add_argument(
|
||||
'type',
|
||||
choices={'subdomains', 'ip4network'},
|
||||
help="Type of rule inputed")
|
||||
parser.add_argument(
|
||||
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
||||
help="List of domains domains to block (with their subdomains)")
|
||||
parser.add_argument(
|
||||
'-f', '--first-party', action='store_true',
|
||||
help="The input only comes from verified first-party sources")
|
||||
args = parser.parse_args()
|
||||
|
||||
database.open_db()
|
||||
|
||||
if args.type == 'subdomains':
|
||||
for rule in args.input:
|
||||
database.feed_rule_subdomains(
|
||||
rule.strip(), first_party=args.first_party)
|
||||
elif args.type == 'ip4network':
|
||||
for rule in args.input:
|
||||
network = ipaddress.ip_network(rule.strip())
|
||||
database.feed_rule_ip4network(
|
||||
network, first_party=args.first_party)
|
||||
else:
|
||||
assert False
|
||||
|
||||
database.close_db()
|
22
new_workflow.sh
Executable file
22
new_workflow.sh
Executable file
|
@ -0,0 +1,22 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
function log() {
|
||||
echo -e "\033[33m$@\033[0m"
|
||||
}
|
||||
|
||||
log "Preparing database…"
|
||||
./database.py --refresh
|
||||
|
||||
log "Compiling rules…"
|
||||
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py subdomains
|
||||
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py subdomains
|
||||
cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains
|
||||
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
|
||||
# NOTE: Ensure first-party sources are last
|
||||
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py subdomains --first-party
|
||||
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party
|
||||
|
||||
# log "Reading A records…"
|
||||
# pv a.json.gz | gunzip | ./feed_dns.py
|
||||
# log "Reading CNAME records…"
|
||||
# pv cname.json.gz | gunzip | ./feed_dns.py
|
Loading…
Reference in a new issue