diff --git a/.env.default b/.env.default
index 1755c0f..5412e18 100644
--- a/.env.default
+++ b/.env.default
@@ -2,3 +2,4 @@ RAPID7_API_KEY=
 CACHE_SIZE=536870912
 MASSDNS_HASHMAP_SIZE=1000
 PROFILE=0
+SINGLE_PROCESS=0
diff --git a/README.md b/README.md
index 6430148..7680c5d 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,9 @@ Depending on the sources you'll be using to generate the list, you'll need to in
 
 The so-called database (in the form of `blocking.p`) is a file storing all the matching entities (ASN, IPs, hostnames, zones…) and every entity leading to it.
 It exists because the list cannot be generated in one pass, as DNS redirections chain links do not have to be inputed in order.
+
 You can purge of old records the database by running `./prune.sh`.
+When you remove a source of data, remove its corresponding file in `last_updates` to fix the pruning process.
 
 ### Gather external sources
 
@@ -143,7 +145,7 @@ This will download about 35 GiB of data the first time, but only the matching re
 Note the download speed will most likely be limited by the database operation thoughput (a quick RAM will help).
 
 The script remembers which were the last sets downloaded, and will only import newer sets.
-If you want to force re-importing, run `rm temp/rapid7_timestamps/sonar.*`.
+If you want to force re-importing, run `rm last_updates/rapid7_*.txt`.
 
 ### Export the lists
 
diff --git a/import_rapid7.sh b/import_rapid7.sh
index 7e70a6c..a44e822 100755
--- a/import_rapid7.sh
+++ b/import_rapid7.sh
@@ -53,7 +53,8 @@ function feed_rapid7 { # study, dataset
 	then
 		link="$(get_download_url $study $dataset)"
 		log "Reading $dataset dataset from $link ($old_ts -> $new_ts)…"
-		# curl -L "$link" | gunzip | ./feed_dns.py rapid7 $@
+		[ $SINGLE_PROCESS -eq 1 ] && EXTRA_ARGS="--single-process"
+		curl -L "$link" | gunzip | ./feed_dns.py rapid7 $@ $EXTRA_ARGS
 		if [ $? -eq 0 ]
 		then
 			echo $new_ts > $old_ts_file
diff --git a/resolve_subdomains.sh b/resolve_subdomains.sh
index fde695e..82f85c5 100755
--- a/resolve_subdomains.sh
+++ b/resolve_subdomains.sh
@@ -20,4 +20,5 @@ date +%s > "last_updates/massdns.txt"
 massdns --output Snrql --retry REFUSED,SERVFAIL --hashmap-size "$MASSDNS_HASHMAP_SIZE" --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list
 
 log "Importing into database…"
-pv temp/all_resolved.txt | ./feed_dns.py massdns
+[ $SINGLE_PROCESS -eq 1 ] && EXTRA_ARGS="--single-process"
+pv temp/all_resolved.txt | ./feed_dns.py massdns --ip4-cache "$CACHE_SIZE" $EXTRA_ARGS