Clever pruning mechanism
This commit is contained in: parent bb9e6de62f, commit b310ca2fc2

README.md (27 lines changed)
@@ -49,8 +49,7 @@ Depending on the sources you'll be using to generate the list, you'll need to in
 The so-called database (in the form of `blocking.p`) is a file storing all the matching entities (ASN, IPs, hostnames, zones…) and every entity leading to it.
 It exists because the list cannot be generated in one pass, as links in a DNS redirection chain do not have to be entered in order.
-You can purge the database of old data using `./db.py --prune --prune-before TIMESTAMP`;
-`TIMESTAMP` can be generated using `date +%s`.
+You can purge old records from the database by running `./prune.sh`.
 
 ### Gather external sources
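Timestamp-based pruning is still available through the `db.py` flags mentioned above; a minimal sketch, assuming GNU `date` (the 30-day cutoff is only an example):

```
# Prune records older than 30 days (cutoff is illustrative; requires GNU date)
./db.py --prune --prune-before "$(date -d '30 days ago' +%s)"
```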
@@ -82,7 +81,13 @@ In each folder:
 - `*.custom.ext` are for sources that you don't want committed
 
 Then, run `./import_rules.sh`.
-Note that removed rules and every record depending on them will be automatically pruned.
+If you removed rules and want to remove every record depending on those rules immediately,
+run the following command:
+
+```
+./db.py --prune --prune-before "$(cat "last_updates/rules.txt")" --prune-base
+```
 
 ### Add subdomains
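A usage sketch of that flow, with a hypothetical rule file:

```
# Remove a rule source (file name is hypothetical), re-import, then prune immediately
rm rules/some-tracker.custom.list
./import_rules.sh
./db.py --prune --prune-before "$(cat "last_updates/rules.txt")" --prune-base
```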
@@ -127,12 +132,24 @@ If you want to re-import the records without re-doing the resolving, just run th
 
 ### Import DNS records from Rapid7
 
-Just run `./import_rapid7.sh`.
-This will download about 35 GiB of data, but only the matching records will be stored (a few MiB for the tracking rules).
+If you have a Rapid7 Organization API key, make sure to append it to `.env`:
+
+```
+RAPID7_API_KEY=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+```
+
+Then, run `./import_rapid7.sh`.
+This will download about 35 GiB of data the first time, but only the matching records will be stored (a few MiB for the tracking rules).
 Note that the download speed will most likely be limited by database operation throughput (fast RAM helps).
 
+The script remembers which sets were downloaded last, and will only import newer sets.
+If you want to force re-importing, run `rm temp/rapid7_timestamps/sonar.*`.
+
 ### Export the lists
 
 For the tracking list, use `./export_lists.sh`; the output will be in the `dist` folder (please change the links before distributing them).
 For other purposes, tinker with the `./export.py` program.
 
+### Everything
+
+Once you've made sure every step runs fine, you can use `./eulaurarien.sh` to run every step consecutively.
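The importer added in this commit also keeps one epoch marker per dataset under `last_updates/` (see `import_rapid7.sh` below); removing a marker forces that dataset to be re-imported on the next run. The exact file name below is inferred from the `old_ts_file` pattern and should be treated as an assumption:

```
# Force re-import of a single dataset by clearing its marker (file name inferred, not guaranteed)
rm last_updates/rapid7_sonar.fdns_v2_fdns_a.txt
./import_rapid7.sh
```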
eulaurarien.sh

@@ -2,9 +2,12 @@
 
 # Main script for eulaurarien
 
+[ ! -f .env ] && touch .env
+
 ./fetch_resources.sh
 ./collect_subdomains.sh
 ./import_rules.sh
 ./resolve_subdomains.sh
 ./import_rapid7.sh
+./prune.sh
import_rapid7.sh

@@ -11,6 +11,19 @@ function api_call {
     curl -s -H "X-Api-Key: $RAPID7_API_KEY" "https://us.api.insight.rapid7.com/opendata/studies/$1/"
 }
 
+function get_timestamp { # study, dataset
+    study="$1"
+    dataset="$2"
+    if [ -z "$RAPID7_API_KEY" ]
+    then
+        line=$(curl -s "https://opendata.rapid7.com/$study/" | grep "href=\".\+-$dataset.json.gz\"" | head -1)
+        echo "$line" | cut -d'"' -f2 | cut -d'/' -f3 | cut -d'-' -f4
+    else
+        filename=$(api_call "$study" | jq '.sonarfile_set[]' -r | grep "${dataset}.json.gz" | sort | tail -1)
+        echo $filename | cut -d'-' -f4
+    fi
+}
+
 function get_download_url { # study, dataset
     study="$1"
     dataset="$2"
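To make the field arithmetic in `get_timestamp` concrete: the `grep`/`cut` pipeline assumes archive names of the form `YYYY-MM-DD-<epoch>-<dataset>.json.gz`, so the fourth `-`-separated field is the publication timestamp. The file name below is hypothetical:

```
$ echo "2020-01-25-1579910400-fdns_a.json.gz" | cut -d'-' -f4
1579910400
```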
@@ -28,13 +41,30 @@ function feed_rapid7 { # study, dataset
     study="$1"
     dataset="$2"
     shift; shift
-    link="$(get_download_url $study $dataset)"
-    log "Reading $dataset dataset from $link…"
-    curl -L "$link" | gunzip | ./feed_dns.py rapid7 $@
+    new_ts="$(get_timestamp $study $dataset)"
+    old_ts_file="last_updates/rapid7_${study}_${dataset}.txt"
+    if [ -f "$old_ts_file" ]
+    then
+        old_ts=$(cat "$old_ts_file")
+    else
+        old_ts="0"
+    fi
+    if [ $new_ts -gt $old_ts ]
+    then
+        link="$(get_download_url $study $dataset)"
+        log "Reading $dataset dataset from $link ($old_ts -> $new_ts)…"
+        curl -L "$link" | gunzip | ./feed_dns.py rapid7 $@
+        if [ $? -eq 0 ]
+        then
+            echo $new_ts > "$old_ts_file"
+        fi
+    else
+        log "Skipping $dataset as there is no new version since $old_ts"
+    fi
 }
 
 feed_rapid7 sonar.rdns_v2 rdns
 feed_rapid7 sonar.fdns_v2 fdns_a --ip4-cache "$CACHE_SIZE"
-feed_rapid7 sonar.fdns_v2 fdns_aaaa --ip6-cache "$CACHE_SIZE"
+# feed_rapid7 sonar.fdns_v2 fdns_aaaa --ip6-cache "$CACHE_SIZE"
 feed_rapid7 sonar.fdns_v2 fdns_cname
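The decision to download is a plain integer comparison on epoch seconds; a minimal illustration with made-up timestamps:

```
old_ts=1579824000; new_ts=1579910400        # made-up epochs
[ "$new_ts" -gt "$old_ts" ] && echo "newer set available" || echo "skipping"
# prints: newer set available
```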
import_rules.sh

@@ -5,7 +5,7 @@ function log() {
 }
 
 log "Importing rules…"
-BEFORE="$(date +%s)"
+date +%s > "last_updates/rules.txt"
 cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
 cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
 cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone

@@ -18,5 +18,3 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as
 
 ./feed_asn.py
 
-log "Pruning old rules…"
-./db.py --prune --prune-before "$BEFORE" --prune-base
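The import start time is now persisted instead of being held in a shell variable, so later steps (`prune.sh`, or the manual `--prune-base` command in the README) can reuse it. To inspect the marker, assuming GNU `date` for the human-readable form:

```
cat last_updates/rules.txt                    # epoch recorded when the last import started
date -u -d "@$(cat last_updates/rules.txt)"   # same instant, human-readable (GNU date)
```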
last_updates/.gitignore (new file, 1 line)

@@ -0,0 +1 @@
+*.txt
prune.sh (new executable file, 9 lines)

@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+function log() {
+    echo -e "\033[33m$@\033[0m"
+}
+
+oldest="$(cat last_updates/*.txt | sort -n | head -1)"
+log "Pruning every record before ${oldest}…"
+./db.py --prune --prune-before "$oldest"
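`prune.sh` keys the cutoff on the oldest marker, so records are only dropped if they predate every recorded update. A small illustration of how the oldest epoch is selected (the marker values are made up):

```
$ cat last_updates/rules.txt last_updates/massdns.txt   # hypothetical contents
1579910400
1579824000
$ cat last_updates/*.txt | sort -n | head -1
1579824000
```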
resolve_subdomains.sh

@@ -16,6 +16,7 @@ log "Compiling subdomains…"
 pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list
 
 log "Resolving subdomain…"
+date +%s > "last_updates/massdns.txt"
 massdns --output Snrql --retry REFUSED,SERVFAIL --hashmap-size "$MASSDNS_HASHMAP_SIZE" --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list
 
 log "Importing into database…"