Clever pruning mechanism
This commit is contained in:
parent bb9e6de62f
commit b310ca2fc2
27 README.md
@@ -49,8 +49,7 @@ Depending on the sources you'll be using to generate the list, you'll need to in
The so-called database (in the form of `blocking.p`) is a file storing all the matching entities (ASN, IPs, hostnames, zones…) and every entity leading to it.
It exists because the list cannot be generated in one pass, as the links of a DNS redirection chain do not have to be input in order.
You can purge the database by removing old data using `./db.py --prune --prune-before TIMESTAMP`;
`TIMESTAMP` can be generated using `date +%s`.
You can purge old records from the database by running `./prune.sh`.
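
For example (a minimal sketch, assuming GNU `date`; pick whatever retention window suits you), to drop every record older than 30 days:

```
./db.py --prune --prune-before "$(date -d '30 days ago' +%s)"
```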
### Gather external sources
@@ -82,7 +81,13 @@ In each folder:

- `*.custom.ext` are for sources that you don't want committed

Then, run `./import_rules.sh`.
Note that removed rules and every record depending on them will be automatically pruned.

If you removed rules and want to remove every record depending on those rules immediately,
run the following command:

```
./db.py --prune --prune-before "$(cat "last_updates/rules.txt")" --prune-base
```
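
`last_updates/rules.txt` holds the Unix timestamp that `./import_rules.sh` records when it starts. To check it in a human-readable form (assuming GNU `date`):

```
date -d "@$(cat last_updates/rules.txt)"
```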
### Add subdomains
@@ -127,12 +132,24 @@ If you want to re-import the records without re-doing the resolving, just run th

### Import DNS records from Rapid7

Just run `./import_rapid7.sh`.
This will download about 35 GiB of data, but only the matching records will be stored (a few MiB for the tracking rules).
If you have a Rapid7 Organization API key, make sure to append to `.env`:

```
RAPID7_API_KEY=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
```

Then, run `./import_rapid7.sh`.
This will download about 35 GiB of data the first time, but only the matching records will be stored (a few MiB for the tracking rules).
Note that the download speed will most likely be limited by database operation throughput (fast RAM will help).

The script remembers which sets were downloaded last, and will only import newer ones.
If you want to force re-importing, run `rm temp/rapid7_timestamps/sonar.*`.

### Export the lists

For the tracking list, use `./export_lists.sh`; the output will be in the `dist` folder (please change the links before distributing them).
For other purposes, tinker with the `./export.py` program.

### Everything

Once you've made sure every step runs fine, you can use `./eulaurarien.sh` to run every step consecutively.
eulaurarien.sh

@@ -2,9 +2,12 @@

# Main script for eulaurarien

[ ! -f .env ] && touch .env

./fetch_resources.sh
./collect_subdomains.sh
./import_rules.sh
./resolve_subdomains.sh
./import_rapid7.sh
./prune.sh
import_rapid7.sh

@@ -11,6 +11,19 @@ function api_call {
    curl -s -H "X-Api-Key: $RAPID7_API_KEY" "https://us.api.insight.rapid7.com/opendata/studies/$1/"
}

function get_timestamp { # study, dataset
    study="$1"
    dataset="$2"
    if [ -z "$RAPID7_API_KEY" ]
    then
        line=$(curl -s "https://opendata.rapid7.com/$study/" | grep "href=\".\+-$dataset.json.gz\"" | head -1)
        echo "$line" | cut -d'"' -f2 | cut -d'/' -f3 | cut -d'-' -f4
    else
        filename=$(api_call "$study" | jq '.sonarfile_set[]' -r | grep "${dataset}.json.gz" | sort | tail -1)
        echo $filename | cut -d'-' -f4
    fi
}
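# Example of the extraction above (hypothetical filename; the real Open Data
# listing may differ): "2021-04-23-1619208803-fdns_cname.json.gz" split on '-'
# has the Unix timestamp 1619208803 in field 4, which is what get_timestamp prints.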
function get_download_url { # study, dataset
    study="$1"
    dataset="$2"
@@ -28,13 +41,30 @@ function feed_rapid7 { # study, dataset
    study="$1"
    dataset="$2"
    shift; shift
    link="$(get_download_url $study $dataset)"
    log "Reading $dataset dataset from $link…"
    curl -L "$link" | gunzip | ./feed_dns.py rapid7 $@
    new_ts="$(get_timestamp $study $dataset)"
    old_ts_file="last_updates/rapid7_${study}_${dataset}.txt"
    if [ -f "$old_ts_file" ]
    then
        old_ts=$(cat "$old_ts_file")
    else
        old_ts="0"
    fi
    if [ $new_ts -gt $old_ts ]
    then
        link="$(get_download_url $study $dataset)"
        log "Reading $dataset dataset from $link ($old_ts -> $new_ts)…"
        # curl -L "$link" | gunzip | ./feed_dns.py rapid7 $@
        if [ $? -eq 0 ]
        then
            echo $new_ts > $old_ts_file
        fi
    else
        log "Skipping $dataset as there is no new version since $old_ts"
    fi
}
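# Feed the Rapid7 Sonar datasets into the database: reverse DNS (rdns) and
# forward DNS A, AAAA and CNAME records.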
feed_rapid7 sonar.rdns_v2 rdns
feed_rapid7 sonar.fdns_v2 fdns_a --ip4-cache "$CACHE_SIZE"
feed_rapid7 sonar.fdns_v2 fdns_aaaa --ip6-cache "$CACHE_SIZE"
# feed_rapid7 sonar.fdns_v2 fdns_aaaa --ip6-cache "$CACHE_SIZE"
feed_rapid7 sonar.fdns_v2 fdns_cname
import_rules.sh

@@ -5,7 +5,7 @@ function log() {
}

log "Importing rules…"
BEFORE="$(date +%s)"
date +%s > "last_updates/rules.txt"
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone

@@ -18,5 +18,3 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as

./feed_asn.py

log "Pruning old rules…"
./db.py --prune --prune-before "$BEFORE" --prune-base
1 last_updates/.gitignore vendored Normal file
@@ -0,0 +1 @@
*.txt
9 prune.sh Executable file
@@ -0,0 +1,9 @@
#!/usr/bin/env bash

function log() {
    echo -e "\033[33m$@\033[0m"
}
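# Every import script records its start time in last_updates/*.txt; pruning
# everything older than the oldest of those timestamps drops records that no
# recent import has refreshed.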
oldest="$(cat last_updates/*.txt | sort -n | head -1)"
log "Pruning every record before ${oldest}…"
./db.py --prune --prune-before "$oldest"
resolve_subdomains.sh

@@ -16,6 +16,7 @@ log "Compiling subdomains…"
pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list

log "Resolving subdomain…"
date +%s > "last_updates/massdns.txt"
massdns --output Snrql --retry REFUSED,SERVFAIL --hashmap-size "$MASSDNS_HASHMAP_SIZE" --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list

log "Importing into database…"