From b310ca2fc219cb2f632b7117924ddd33f64fdbb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Wed, 25 Dec 2019 14:54:57 +0100 Subject: [PATCH] Clever pruning mechanism --- README.md | 27 ++++++++++++++++++++++----- eulaurarien.sh | 3 +++ import_rapid7.sh | 38 ++++++++++++++++++++++++++++++++++---- import_rules.sh | 4 +--- last_updates/.gitignore | 1 + prune.sh | 9 +++++++++ resolve_subdomains.sh | 1 + 7 files changed, 71 insertions(+), 12 deletions(-) create mode 100644 last_updates/.gitignore create mode 100755 prune.sh diff --git a/README.md b/README.md index 2bde314..6430148 100644 --- a/README.md +++ b/README.md @@ -49,8 +49,7 @@ Depending on the sources you'll be using to generate the list, you'll need to in The so-called database (in the form of `blocking.p`) is a file storing all the matching entities (ASN, IPs, hostnames, zones…) and every entity leading to it. It exists because the list cannot be generated in one pass, as DNS redirections chain links do not have to be inputed in order. -You can purge the database by removing old data using `./db.py --prune --prune-before TIMESTAMP` ; -`TIMESTAMP` can be generated using `date +%s`. +You can purge old records from the database by running `./prune.sh`. ### Gather external sources @@ -82,7 +81,13 @@ In each folder: - `*.custom.ext` are for sources that you don't want commited Then, run `./import_rules.sh`. -Note that removed rules and every record depending on them will be automatically pruned. + +If you removed rules and you want to remove every record depending on those rules immediately, +run the following command: + +``` +./db.py --prune --prune-before "$(cat "last_updates/rules.txt")" --prune-base +``` ### Add subdomains @@ -127,12 +132,24 @@ If you want to re-import the records without re-doing the resolving, just run th ### Import DNS records from Rapid7 -Just run `./import_rapid7.sh`. 
-This will download about 35 GiB of data, but only the matching records will be stored (about a few MiB for the tracking rules). +If you have a Rapid7 Organization API key, make sure to append it to `.env`: + +``` +RAPID7_API_KEY=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +``` + +Then, run `./import_rapid7.sh`. +This will download about 35 GiB of data the first time, but only the matching records will be stored (about a few MiB for the tracking rules). Note the download speed will most likely be limited by the database operation thoughput (a quick RAM will help). +The script remembers which were the last sets downloaded, and will only import newer sets. +If you want to force re-importing, run `rm last_updates/rapid7_*.txt`. + ### Export the lists For the tracking list, use `./export_lists.sh`, the output will be in the `dist` forlder (please change the links before distributing them). For other purposes, tinker with the `./export.py` program. +### Everything + +Once you've made sure every step runs fine, you can use `./eulaurarien.sh` to run every step consecutively. diff --git a/eulaurarien.sh b/eulaurarien.sh index a78ae27..068d678 100755 --- a/eulaurarien.sh +++ b/eulaurarien.sh @@ -2,9 +2,12 @@ # Main script for eulaurarien +[ ! 
-f .env ] && touch .env + ./fetch_resources.sh ./collect_subdomains.sh ./import_rules.sh ./resolve_subdomains.sh ./import_rapid7.sh +./prune.sh diff --git a/import_rapid7.sh b/import_rapid7.sh index b0d0a0c..7e70a6c 100755 --- a/import_rapid7.sh +++ b/import_rapid7.sh @@ -11,6 +11,19 @@ function api_call { curl -s -H "X-Api-Key: $RAPID7_API_KEY" "https://us.api.insight.rapid7.com/opendata/studies/$1/" } +function get_timestamp { # study, dataset + study="$1" + dataset="$2" + if [ -z "$RAPID7_API_KEY" ] + then + line=$(curl -s "https://opendata.rapid7.com/$study/" | grep "href=\".\+-$dataset.json.gz\"" | head -1) + echo "$line" | cut -d'"' -f2 | cut -d'/' -f3 | cut -d'-' -f4 + else + filename=$(api_call "$study" | jq '.sonarfile_set[]' -r | grep "${dataset}.json.gz" | sort | tail -1) + echo "$filename" | cut -d'-' -f4 + fi +} + function get_download_url { # study, dataset study="$1" dataset="$2" @@ -28,13 +41,30 @@ function feed_rapid7 { # study, dataset study="$1" dataset="$2" shift; shift - link="$(get_download_url $study $dataset)" - log "Reading $dataset dataset from $link…" - curl -L "$link" | gunzip | ./feed_dns.py rapid7 $@ + new_ts="$(get_timestamp "$study" "$dataset")" + old_ts_file="last_updates/rapid7_${study}_${dataset}.txt" + if [ -f "$old_ts_file" ] + then + old_ts=$(cat "$old_ts_file") + else + old_ts="0" + fi + if [ "${new_ts:-0}" -gt "${old_ts:-0}" ] + then + link="$(get_download_url "$study" "$dataset")" + log "Reading $dataset dataset from $link ($old_ts -> $new_ts)…" + curl -L "$link" | gunzip | ./feed_dns.py rapid7 "$@" + if [ $? 
-eq 0 ] + then + echo "$new_ts" > "$old_ts_file" + fi + else + log "Skipping $dataset as there is no new version since $old_ts" + fi } feed_rapid7 sonar.rdns_v2 rdns feed_rapid7 sonar.fdns_v2 fdns_a --ip4-cache "$CACHE_SIZE" -feed_rapid7 sonar.fdns_v2 fdns_aaaa --ip6-cache "$CACHE_SIZE" +# feed_rapid7 sonar.fdns_v2 fdns_aaaa --ip6-cache "$CACHE_SIZE" feed_rapid7 sonar.fdns_v2 fdns_cname diff --git a/import_rules.sh b/import_rules.sh index 14c8c78..f1c3f46 100755 --- a/import_rules.sh +++ b/import_rules.sh @@ -5,7 +5,7 @@ function log() { } log "Importing rules…" -BEFORE="$(date +%s)" +date +%s > "last_updates/rules.txt" cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone @@ -18,5 +18,3 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as ./feed_asn.py -log "Pruning old rules…" -./db.py --prune --prune-before "$BEFORE" --prune-base diff --git a/last_updates/.gitignore b/last_updates/.gitignore new file mode 100644 index 0000000..2211df6 --- /dev/null +++ b/last_updates/.gitignore @@ -0,0 +1 @@ +*.txt diff --git a/prune.sh b/prune.sh new file mode 100755 index 0000000..b999993 --- /dev/null +++ b/prune.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +function log() { + echo -e "\033[33m$@\033[0m" +} + +oldest="$(cat last_updates/*.txt | sort -n | head -1)" +log "Pruning every record before ${oldest}…" +./db.py --prune --prune-before "${oldest:?no timestamp found in last_updates/}" diff --git a/resolve_subdomains.sh b/resolve_subdomains.sh index d163b77..fde695e 100755 --- a/resolve_subdomains.sh +++ b/resolve_subdomains.sh @@ -16,6 +16,7 @@ log "Compiling subdomains…" pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list log "Resolving subdomain…" +date +%s > "last_updates/massdns.txt" massdns --output Snrql --retry 
REFUSED,SERVFAIL --hashmap-size "$MASSDNS_HASHMAP_SIZE" --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list log "Importing into database…"