Added intermediate representation for DNS datasets
It's just CSV. The DNS records from the datasets are not ordered consistently, so we need to parse them completely. It seems that converting to an IR before sending data to ./feed_dns.py through a pipe is faster than decoding the JSON inside ./feed_dns.py. This will also reduce the storage of the resolved subdomains by about 15% (compressed).
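For illustration, a minimal sketch of the JSON-to-IR mapping, assuming a hypothetical record (the field values below are made up; only the four-field shape matches what the script expects):

    import csv, io, json

    # Hypothetical input line (newline-delimited JSON); real dataset
    # records have the same four fields.
    line = '{"type": "cname", "timestamp": "1549427039", "name": "sub.example.com", "value": "example.com"}'
    data = json.loads(line)
    out = io.StringIO()
    csv.writer(out).writerow([data['type'][0], data['timestamp'], data['name'], data['value']])
    print(out.getvalue(), end='')  # -> c,1549427039,sub.example.com,example.com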
5 changed files with 68 additions and 31 deletions
@@ -0,0 +1,36 @@
#!/usr/bin/env python3

import argparse
import sys
import logging
import json
import csv

if __name__ == '__main__':

    # Parsing arguments
    log = logging.getLogger('json_to_csv')
    parser = argparse.ArgumentParser(
        description="Convert newline-delimited JSON DNS records "
        "into the CSV intermediate representation")
    parser.add_argument(
        # '-i', '--input', type=argparse.FileType('rb'), default=sys.stdin.buffer,
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Input file with one JSON record per line")
    parser.add_argument(
        # '-o', '--output', type=argparse.FileType('wb'), default=sys.stdout.buffer,
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file for the CSV intermediate representation")
    args = parser.parse_args()

    writer = csv.writer(args.output)
    for line in args.input:
        data = json.loads(line)
        try:
            # One IR row per record: first letter of the record type,
            # then timestamp, name and value
            writer.writerow([
                data['type'][0],
                data['timestamp'],
                data['name'],
                data['value']])
        except (KeyError, IndexError):
            # Record is missing a field or has an empty type
            log.error('Could not parse line: %s', line)
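The speed claim above comes down to csv.reader being cheaper per line than json.loads. As a minimal sketch, assuming ./feed_dns.py (whose reader side is not shown in this commit) consumes the four-column IR from stdin, the receiving loop could look like:

    #!/usr/bin/env python3

    import csv
    import sys

    # Hypothetical consumer loop: each IR row splits back into its four
    # fields without a json.loads call per line, which is the stated
    # motivation for the intermediate representation.
    for dtype, timestamp, name, value in csv.reader(sys.stdin):
        print(dtype, timestamp, name, value)  # stand-in for the real ingestion logic

The intended pipeline would then presumably chain the converter into the feeder, e.g. cat records.json | ./json_to_csv.py | ./feed_dns.py.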