
Clever pruning mechanism

tags/v2.2
Geoffrey Frogeye, 3 months ago
commit b310ca2fc2
Signed by: geoffrey <geoffrey@frogeye.fr> GPG Key ID: D8A7ECA00A8CD3DD
7 changed files with 71 additions and 12 deletions
  1. README.md (+22, -5)
  2. eulaurarien.sh (+3, -0)
  3. import_rapid7.sh (+34, -4)
  4. import_rules.sh (+1, -3)
  5. last_updates/.gitignore (+1, -0)
  6. prune.sh (+9, -0)
  7. resolve_subdomains.sh (+1, -0)

README.md (+22, -5)

@@ -49,8 +49,7 @@ Depending on the sources you'll be using to generate the list, you'll need to in

The so-called database (in the form of `blocking.p`) is a file storing all the matching entities (ASN, IPs, hostnames, zones…) and every entity leading to it.
It exists because the list cannot be generated in one pass, as the links of a DNS redirection chain do not have to be input in order.
You can purge the database of old data using `./db.py --prune --prune-before TIMESTAMP`;
`TIMESTAMP` can be generated using `date +%s`.
You can purge the database of old records by running `./prune.sh`.
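
For example, a one-liner to prune everything older than 30 days by hand (the retention period is arbitrary):

```
./db.py --prune --prune-before "$(( $(date +%s) - 30*24*3600 ))"
```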

### Gather external sources

@@ -82,7 +81,13 @@ In each folder:
- `*.custom.ext` are for sources that you don't want committed

Then, run `./import_rules.sh`.
Note that removed rules and every record depending on them will be automatically pruned.

If you removed rules and want to remove every record depending on those rules immediately,
run the following command:

```
./db.py --prune --prune-before "$(cat "last_updates/rules.txt")" --prune-base
```
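
For reference, the zone rule files are plain lists with one domain per line; `#` comments and blank lines are ignored (that is what the `grep` filters in `import_rules.sh` below rely on). A hypothetical `rules/first-party.custom.list` could look like:

```
# Hypothetical custom entries, kept out of the repository per the *.custom.ext convention
tracker.example.com
telemetry.example.net
```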

### Add subdomains

@@ -127,12 +132,24 @@ If you want to re-import the records without re-doing the resolving, just run th

### Import DNS records from Rapid7

Just run `./import_rapid7.sh`.
This will download about 35 GiB of data, but only the matching records will be stored (a few MiB for the tracking rules).
If you have a Rapid7 Organization API key, make sure to append to `.env`:

```
RAPID7_API_KEY=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
```

Then, run `./import_rapid7.sh`.
This will download about 35 GiB of data the first time, but only the matching records will be stored (a few MiB for the tracking rules).
Note that the download speed will most likely be limited by the database operation throughput (fast RAM will help).

The script remembers which sets were downloaded last, and will only import newer ones.
If you want to force re-importing, run `rm last_updates/rapid7_*.txt`.
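
The per-dataset timestamps are plain Unix epochs written by `import_rapid7.sh` (filenames as in the script below), so you can check when a set was last imported with, for example:

```
cat last_updates/rapid7_sonar.fdns_v2_fdns_a.txt
```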

### Export the lists

For the tracking list, use `./export_lists.sh`; the output will be in the `dist` folder (please change the links before distributing them).
For other purposes, tinker with the `./export.py` program.

### Everything

Once you've made sure every step runs fine, you can use `./eulaurarien.sh` to run every step consecutively.

eulaurarien.sh (+3, -0)

@@ -2,9 +2,12 @@

# Main script for eulaurarien

[ ! -f .env ] && touch .env

./fetch_resources.sh
./collect_subdomains.sh
./import_rules.sh
./resolve_subdomains.sh
./import_rapid7.sh
./prune.sh


import_rapid7.sh (+34, -4)

@@ -11,6 +11,19 @@ function api_call {
    curl -s -H "X-Api-Key: $RAPID7_API_KEY" "https://us.api.insight.rapid7.com/opendata/studies/$1/"
}

function get_timestamp { # study, dataset
study="$1"
dataset="$2"
if [ -z "$RAPID7_API_KEY" ]
then
line=$(curl -s "https://opendata.rapid7.com/$study/" | grep "href=\".\+-$dataset.json.gz\"" | head -1)
echo "$line" | cut -d'"' -f2 | cut -d'/' -f3 | cut -d'-' -f4
else
filename=$(api_call "$study" | jq '.sonarfile_set[]' -r | grep "${dataset}.json.gz" | sort | tail -1)
echo $filename | cut -d'-' -f4
fi
}

function get_download_url { # study, dataset
study="$1"
dataset="$2"
@@ -28,13 +41,30 @@ function feed_rapid7 { # study, dataset
study="$1"
dataset="$2"
shift; shift
link="$(get_download_url $study $dataset)"
log "Reading $dataset dataset from $link…"
curl -L "$link" | gunzip | ./feed_dns.py rapid7 $@
new_ts="$(get_timestamp $study $dataset)"
old_ts_file="last_updates/rapid7_${study}_${dataset}.txt"
if [ -f "$old_ts_file" ]
then
old_ts=$(cat "$old_ts_file")
else
old_ts="0"
fi
if [ $new_ts -gt $old_ts ]
then
link="$(get_download_url $study $dataset)"
log "Reading $dataset dataset from $link ($old_ts -> $new_ts)…"
# curl -L "$link" | gunzip | ./feed_dns.py rapid7 $@
if [ $? -eq 0 ]
then
echo $new_ts > $old_ts_file
fi
else
log "Skipping $dataset as there is no new version since $old_ts"
fi
}

feed_rapid7 sonar.rdns_v2 rdns
feed_rapid7 sonar.fdns_v2 fdns_a --ip4-cache "$CACHE_SIZE"
feed_rapid7 sonar.fdns_v2 fdns_aaaa --ip6-cache "$CACHE_SIZE"
# feed_rapid7 sonar.fdns_v2 fdns_aaaa --ip6-cache "$CACHE_SIZE"
feed_rapid7 sonar.fdns_v2 fdns_cname


import_rules.sh (+1, -3)

@@ -5,7 +5,7 @@ function log() {
}

log "Importing rules…"
BEFORE="$(date +%s)"
date +%s > "last_updates/rules.txt"
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone
@@ -18,5 +18,3 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as

./feed_asn.py

log "Pruning old rules…"
./db.py --prune --prune-before "$BEFORE" --prune-base

last_updates/.gitignore (+1, -0)

@@ -0,0 +1 @@
*.txt

prune.sh (+9, -0)

@@ -0,0 +1,9 @@
#!/usr/bin/env bash

function log() {
    echo -e "\033[33m$@\033[0m"
}

oldest="$(cat last_updates/*.txt | sort -n | head -1)"
log "Pruning every record before ${oldest}…"
./db.py --prune --prune-before "$oldest"
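
Because `prune.sh` uses the oldest of the `last_updates/*.txt` timestamps, only records that were not refreshed by the latest run of any source (rules, massdns, Rapid7 datasets) should get removed. If you want this to happen regularly, one option is a cron entry along these lines (path and schedule are only placeholders):

```
# Hypothetical crontab entry: prune once a day at 04:00
0 4 * * * cd /path/to/eulaurarien && ./prune.sh
```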

resolve_subdomains.sh (+1, -0)

@@ -16,6 +16,7 @@ log "Compiling subdomains…"
pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list

log "Resolving subdomain…"
date +%s > "last_updates/massdns.txt"
massdns --output Snrql --retry REFUSED,SERVFAIL --hashmap-size "$MASSDNS_HASHMAP_SIZE" --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list

log "Importing into database…"

