Remove support for Rapid7
They changed their privacy/pricing model, and as a result I no longer have access to their massive DNS dataset, even after asking. The list has been frozen since 2022-01-02 while I looked for an alternative, but I couldn't find one. To get the list updating again with the remaining DNS sources I have, I now use the last version of the list generated with the Rapid7 dataset as subdomain input, which will be resolved with MassDNS.
parent 49a36f32f2
commit 3b6f7a58b3
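A minimal sketch of what this migration amounts to (the file names here are hypothetical; only the mechanism is from this commit): the hostnames from the last Rapid7-era list become one more subdomain source, so future updates re-resolve them locally with MassDNS instead of reading Rapid7 snapshots.

```bash
# Hypothetical paths: take the hostnames from the frozen 2022-01-02 list
# (skipping its "#" header comments) and register them as a plain
# subdomain source for the existing pipeline.
grep -v '^#' dist/frozen-first-party-trackers.txt > subdomains/rapid7-legacy.list
./resolve_subdomains.sh   # re-resolves every known subdomain with MassDNS
```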
.env.default (1 changed line)

```diff
@@ -1,4 +1,3 @@
-RAPID7_API_KEY=
 CACHE_SIZE=536870912
 MASSDNS_HASHMAP_SIZE=1000
 PROFILE=0
```
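Of the surviving defaults in `.env.default` (the file sourced by the deleted import script below), `CACHE_SIZE=536870912` is 512 MiB, i.e. 2^29 bytes; it was the value passed to `feed_dns.py` as `--ip4-cache`.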
README.md (19 changed lines)
```diff
@@ -18,7 +18,7 @@ This program takes as input:
 
 It will be able to output hostnames being a DNS redirection to any item in the lists provided.
 
-DNS records can either come from [Rapid7 Open Data Sets](https://opendata.rapid7.com/sonar.fdns_v2/) or can be locally resolved from a list of subdomains using [MassDNS](https://github.com/blechschmidt/massdns).
+DNS records can be locally resolved from a list of subdomains using [MassDNS](https://github.com/blechschmidt/massdns).
 
 Those subdomains can either be provided as is, come from [Cisco Umbrella Popularity List](http://s3-us-west-1.amazonaws.com/umbrella-static/index.html), from your browsing history, or from analyzing the traffic a web browser makes when opening an URL (the program provides utility to do all that).
 
```
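The surviving context line above notes that subdomains can be "provided as is"; a minimal sketch of that path (the list file name is hypothetical, `./resolve_subdomains.sh` is the repository's own script):

```bash
# Hypothetical file name; the point is only that a plain list of subdomains
# is a valid input alongside the Umbrella and browsing-history sources.
echo "tracker.example.com" >> subdomains/manual.list
./resolve_subdomains.sh   # resolves collected subdomains with MassDNS
```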
```diff
@@ -41,7 +41,6 @@ Depending on the sources you'll be using to generate the list, you'll need to in
 - [coloredlogs](https://pypi.org/project/coloredlogs/) (sorry I can't help myself)
 - [numpy](https://www.numpy.org/)
 - [python-abp](https://pypi.org/project/python-abp/) (only if you intend to use AdBlock rules as a rule source)
-- [jq](http://stedolan.github.io/jq/) (only if you have a Rapid7 API key)
 - [massdns](https://github.com/blechschmidt/massdns) in your `$PATH` (only if you have subdomains as a source)
 - [Firefox](https://www.mozilla.org/firefox/) (only if you have websites as a source)
 - [selenium (Python bindings)](https://pypi.python.org/pypi/selenium) (only if you have websites as a source)
```
````diff
@@ -135,22 +134,6 @@ Note that this is a network intensive process, not in term of bandwith, but in t
 The DNS records will automatically be imported into the database.
 If you want to re-import the records without re-doing the resolving, just run the last line of the `./resolve_subdomains.sh` script.
 
-### Import DNS records from Rapid7
-
-If you have a Rapid7 Organization API key, make sure to append to `.env`:
-
-```
-RAPID7_API_KEY=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
-```
-
-Then, run `./import_rapid7.sh`.
-This will download about 35 GiB of data the first time, but only the matching records will be stored (about a few MiB for the tracking rules).
-Note the download speed will most likely be limited by the database operation thoughput (a quick RAM will help).
-
-The script remembers which were the last sets downloaded, and will only newer sets.
-If the first-party rules changed, the corresponding sets will be re-imported anyway.
-If you want to force re-importing, run `rm last_updates/rapid7_*.txt`.
-
 ### Export the lists
 
 For the tracking list, use `./export_lists.sh`, the output will be in the `dist` folder (please change the links before distributing them).
````
dist/README.md (vendored, 1 changed line)
```diff
@@ -102,7 +102,6 @@ Some of the first-party tracker included in this list have been found by:
 
 The list was generated using data from
 
-- [Rapid7 OpenData](https://opendata.rapid7.com/sonar.fdns_v2/), who kindly provided a free account
 - [Cisco Umbrella Popularity List](http://s3-us-west-1.amazonaws.com/umbrella-static/index.html)
 - [Public DNS Server List](https://public-dns.info/)
 
```
```diff
@@ -8,7 +8,6 @@
 ./collect_subdomains.sh
 ./import_rules.sh
 ./resolve_subdomains.sh
-./import_rapid7.sh
 ./prune.sh
 ./export_lists.sh
 ./generate_index.py
```
export_lists.sh (2 changed lines)

```diff
@@ -76,7 +76,7 @@ do
     echo "# Oldest record: $oldest_date"
     echo "# Number of source websites: $number_websites"
     echo "# Number of source subdomains: $number_subdomains"
-    echo "# Number of source DNS records: ~2E9 + $number_dns"
+    echo "# Number of source DNS records: $number_dns"
     echo "#"
     echo "# Input rules: $rules_input"
     echo "# Subsequent rules: $rules_found"
```
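The dropped `~2E9 +` prefix accounted for the roughly 2×10^9 DNS records that came from the Rapid7 snapshots; with that source gone, the generated list header now counts only locally resolved records.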
feed_dns.py (29 changed lines)
```diff
@@ -130,34 +130,6 @@ class Parser:
         raise NotImplementedError
 
 
-class Rapid7Parser(Parser):
-    def consume(self) -> None:
-        data = dict()
-        for line in self.buf:
-            self.prof.enter_step("parse_rapid7")
-            split = line.split('"')
-
-            try:
-                for k in range(1, 14, 4):
-                    key = split[k]
-                    val = split[k + 2]
-                    data[key] = val
-
-                select, writer = FUNCTION_MAP[data["type"]]
-                record = (
-                    select,
-                    writer,
-                    int(data["timestamp"]),
-                    data["name"],
-                    data["value"],
-                )
-            except (IndexError, KeyError):
-                # IndexError: missing field
-                # KeyError: Unknown type field
-                self.log.exception("Cannot parse: %s", line)
-            self.register(record)
-
-
 class MassDnsParser(Parser):
     # massdns --output Snrql
     # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
```
```diff
@@ -200,7 +172,6 @@ class MassDnsParser:
 
 
 PARSERS = {
-    "rapid7": Rapid7Parser,
     "massdns": MassDnsParser,
 }
 
```
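With `Rapid7Parser` gone, `massdns` is the only key left in `PARSERS`. Combining the MassDNS flags quoted in the `MassDnsParser` comment above with the way the deleted `import_rapid7.sh` pipes data into `./feed_dns.py rapid7`, the remaining path looks roughly like this (a sketch, not a line from the repository):

```bash
# Flags copied verbatim from the MassDnsParser comment; piping into
# feed_dns.py mirrors the invocation in the deleted import_rapid7.sh.
massdns --output Snrql --retry REFUSED,SERVFAIL \
    --resolvers nameservers-ipv4 subdomains.list |
    ./feed_dns.py massdns
```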
import_rapid7.sh (deleted)

```diff
@@ -1,81 +0,0 @@
-#!/usr/bin/env bash
-
-source .env.default
-source .env
-
-function log() {
-    echo -e "\033[33m$@\033[0m"
-}
-
-function api_call {
-    curl -s -H "X-Api-Key: $RAPID7_API_KEY" "https://us.api.insight.rapid7.com/opendata/studies/$1/"
-}
-
-function get_timestamp { # study, dataset
-    study="$1"
-    dataset="$2"
-    if [ -z "$RAPID7_API_KEY" ]
-    then
-        line=$(curl -s "https://opendata.rapid7.com/$study/" | grep "href=\".\+-$dataset.json.gz\"" | head -1)
-        echo "$line" | cut -d'"' -f2 | cut -d'/' -f3 | cut -d'-' -f4
-    else
-        filename=$(api_call "$study" | jq '.sonarfile_set[]' -r | grep "${dataset}.json.gz" | sort | tail -1)
-        echo $filename | cut -d'-' -f4
-    fi
-}
-
-function get_download_url { # study, dataset
-    study="$1"
-    dataset="$2"
-    if [ -z "$RAPID7_API_KEY" ]
-    then
-        line=$(curl -s "https://opendata.rapid7.com/$study/" | grep "href=\".\+-$dataset.json.gz\"" | head -1)
-        echo "https://opendata.rapid7.com$(echo "$line" | cut -d'"' -f2)"
-    else
-        filename=$(api_call "$study" | jq '.sonarfile_set[]' -r | grep "${dataset}.json.gz" | sort | tail -1)
-        api_call "$study/$filename/download" | jq '.url' -r
-    fi
-}
-
-function feed_rapid7 { # study, dataset, rule_file, ./feed_dns args
-    # The dataset will be imported if:
-    # none of this dataset was ever imported
-    # or
-    # the last dataset imported is older than the one to be imported
-    # or
-    # the rule_file is newer than when the last dataset was imported
-    #
-    # (note the difference between the age oft the dataset itself and
-    # the date when it is imported)
-    study="$1"
-    dataset="$2"
-    rule_file="$3"
-    shift; shift; shift
-    new_ts="$(get_timestamp $study $dataset)"
-    old_ts_file="last_updates/rapid7_${study}_${dataset}.txt"
-    if [ -f "$old_ts_file" ]
-    then
-        old_ts=$(cat "$old_ts_file")
-    else
-        old_ts="0"
-    fi
-    if [ $new_ts -gt $old_ts ] || [ $rule_file -nt $old_ts_file ]
-    then
-        link="$(get_download_url $study $dataset)"
-        log "Reading $dataset dataset from $link ($old_ts -> $new_ts)…"
-        [ $SINGLE_PROCESS -eq 1 ] && EXTRA_ARGS="--single-process"
-        curl -L "$link" | gunzip | ./feed_dns.py rapid7 $@ $EXTRA_ARGS
-        if [ $? -eq 0 ]
-        then
-            echo $new_ts > $old_ts_file
-        fi
-    else
-        log "Skipping $dataset as there is no new version since $old_ts"
-    fi
-}
-
-# feed_rapid7 sonar.rdns_v2 rdns rules_asn/first-party.list
-feed_rapid7 sonar.fdns_v2 fdns_a rules_asn/first-party.list --ip4-cache "$CACHE_SIZE"
-# feed_rapid7 sonar.fdns_v2 fdns_aaaa rules_asn/first-party.list --ip6-cache "$CACHE_SIZE"
-feed_rapid7 sonar.fdns_v2 fdns_cname rules/first-party.list
```