eulaurarien/feed_dns.py
Geoffrey Frogeye 5023b85d7c
Added intermediate representation for DNS datasets
It's just CSV.
The DNS from the datasets are not ordered consistently,
so we need to parse it completly.
It seems that converting to an IR before sending data to ./feed_dns.py
through a pipe is faster than decoding the JSON in ./feed_dns.py.
This will also reduce the storage of the resolved subdomains by
about 15% (compressed).
2019-12-13 21:59:35 +01:00

65 lines
2.2 KiB
Python
Executable file

#!/usr/bin/env python3
import database
import argparse
import sys
import logging
import csv
import json
if __name__ == '__main__':
# Parsing arguments
log = logging.getLogger('feed_dns')
parser = argparse.ArgumentParser(
description="TODO")
parser.add_argument(
# '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer,
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
help="TODO")
args = parser.parse_args()
DB = database.Database(write=True)
try:
DB.enter_step('iowait')
for row in csv.reader(args.input):
# for line in args.input:
DB.enter_step('feed_csv_parse')
dtype, timestamp, name, value = row
# DB.enter_step('feed_json_parse')
# data = json.loads(line)
# dtype = data['type'][0]
# # timestamp = data['timestamp']
# name = data['name']
# value = data['value']
DB.enter_step('feed_switch')
if dtype == 'a':
for rule in DB.get_ip4(value):
if not list(DB.get_domain_in_zone(name)):
DB.set_hostname(name, source=rule,
updated=int(timestamp))
# updated=int(data['timestamp']))
elif dtype == 'c':
for rule in DB.get_domain(value):
if not list(DB.get_domain_in_zone(name)):
DB.set_hostname(name, source=rule,
updated=int(timestamp))
# updated=int(data['timestamp']))
elif dtype == 'p':
for rule in DB.get_domain(value):
if not list(DB.get_ip4_in_network(name)):
DB.set_ip4address(name, source=rule,
updated=int(timestamp))
# updated=int(data['timestamp']))
else:
raise NotImplementedError(f'Type: {dtype}')
DB.enter_step('iowait')
except KeyboardInterrupt:
log.warning("Interupted.")
pass
DB.close()