From 3b6f7a58b36312d6849512a84ecabe8429327da1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?=
Date: Sun, 13 Nov 2022 20:10:27 +0100
Subject: [PATCH] Remove support for Rapid7

They changed their privacy/pricing model, and as such I no longer have
access to their massive DNS dataset, even after asking.

Since 2022-01-02, I kept the list frozen while looking for an
alternative, but couldn't find any.

To make the list update again with the remaining DNS sources I have, I
put the last version of the list generated with the Rapid7 dataset as
an input for subdomains, which will now be resolved with MassDNS.
---
 .env.default     |  1 -
 README.md        | 19 +-----------
 dist/README.md   |  1 -
 eulaurarien.sh   |  1 -
 export_lists.sh  |  2 +-
 feed_dns.py      | 29 -----------------
 import_rapid7.sh | 81 ------------------------------------------------
 7 files changed, 2 insertions(+), 132 deletions(-)
 delete mode 100755 import_rapid7.sh

diff --git a/.env.default b/.env.default
index d9e9f13..e96ff6b 100644
--- a/.env.default
+++ b/.env.default
@@ -1,4 +1,3 @@
-RAPID7_API_KEY=
 CACHE_SIZE=536870912
 MASSDNS_HASHMAP_SIZE=1000
 PROFILE=0
diff --git a/README.md b/README.md
index 0985a9d..1689217 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ This program takes as input:
 
 It will be able to output hostnames being a DNS redirection to any item in the lists provided.
 
-DNS records can either come from [Rapid7 Open Data Sets](https://opendata.rapid7.com/sonar.fdns_v2/) or can be locally resolved from a list of subdomains using [MassDNS](https://github.com/blechschmidt/massdns).
+DNS records can be locally resolved from a list of subdomains using [MassDNS](https://github.com/blechschmidt/massdns).
 
 Those subdomains can either be provided as is, come from [Cisco Umbrella Popularity List](http://s3-us-west-1.amazonaws.com/umbrella-static/index.html), from your browsing history, or from analyzing the traffic a web browser makes when opening an URL (the program provides utility to do all that).
 
@@ -41,7 +41,6 @@ Depending on the sources you'll be using to generate the list, you'll need to in
 - [coloredlogs](https://pypi.org/project/coloredlogs/) (sorry I can't help myself)
 - [numpy](https://www.numpy.org/)
 - [python-abp](https://pypi.org/project/python-abp/) (only if you intend to use AdBlock rules as a rule source)
-- [jq](http://stedolan.github.io/jq/) (only if you have a Rapid7 API key)
 - [massdns](https://github.com/blechschmidt/massdns) in your `$PATH` (only if you have subdomains as a source)
 - [Firefox](https://www.mozilla.org/firefox/) (only if you have websites as a source)
 - [selenium (Python bindings)](https://pypi.python.org/pypi/selenium) (only if you have websites as a source)
@@ -135,22 +134,6 @@ Note that this is a network intensive process, not in term of bandwith, but in t
 The DNS records will automatically be imported into the database.
 If you want to re-import the records without re-doing the resolving, just run the last line of the `./resolve_subdomains.sh` script.
 
-### Import DNS records from Rapid7
-
-If you have a Rapid7 Organization API key, make sure to append to `.env`:
-
-```
-RAPID7_API_KEY=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
-```
-
-Then, run `./import_rapid7.sh`.
-This will download about 35 GiB of data the first time, but only the matching records will be stored (about a few MiB for the tracking rules).
-Note the download speed will most likely be limited by the database operation throughput (a quick RAM will help).
-
-The script remembers which were the last sets downloaded, and will only download newer sets.
-If the first-party rules changed, the corresponding sets will be re-imported anyway.
-If you want to force re-importing, run `rm last_updates/rapid7_*.txt`.
-
 ### Export the lists
 
 For the tracking list, use `./export_lists.sh`, the output will be in the `dist` folder (please change the links before distributing them).
diff --git a/dist/README.md b/dist/README.md
index 9a2d0e3..c034b0e 100644
--- a/dist/README.md
+++ b/dist/README.md
@@ -102,7 +102,6 @@ Some of the first-party tracker included in this list have been found by:
 
 The list was generated using data from
 
-- [Rapid7 OpenData](https://opendata.rapid7.com/sonar.fdns_v2/), who kindly provided a free account
 - [Cisco Umbrella Popularity List](http://s3-us-west-1.amazonaws.com/umbrella-static/index.html)
 - [Public DNS Server List](https://public-dns.info/)
 
diff --git a/eulaurarien.sh b/eulaurarien.sh
index cb8aba3..c6ef23d 100755
--- a/eulaurarien.sh
+++ b/eulaurarien.sh
@@ -8,7 +8,6 @@
 ./collect_subdomains.sh
 ./import_rules.sh
 ./resolve_subdomains.sh
-./import_rapid7.sh
 ./prune.sh
 ./export_lists.sh
 ./generate_index.py
diff --git a/export_lists.sh b/export_lists.sh
index b294a0d..119b544 100755
--- a/export_lists.sh
+++ b/export_lists.sh
@@ -76,7 +76,7 @@ do
     echo "# Oldest record: $oldest_date"
     echo "# Number of source websites: $number_websites"
     echo "# Number of source subdomains: $number_subdomains"
-    echo "# Number of source DNS records: ~2E9 + $number_dns"
+    echo "# Number of source DNS records: $number_dns"
     echo "#"
     echo "# Input rules: $rules_input"
     echo "# Subsequent rules: $rules_found"
diff --git a/feed_dns.py b/feed_dns.py
index 41368a1..cbb5ecf 100755
--- a/feed_dns.py
+++ b/feed_dns.py
@@ -130,34 +130,6 @@ class Parser:
         raise NotImplementedError
 
 
-class Rapid7Parser(Parser):
-    def consume(self) -> None:
-        data = dict()
-        for line in self.buf:
-            self.prof.enter_step("parse_rapid7")
-            split = line.split('"')
-
-            try:
-                for k in range(1, 14, 4):
-                    key = split[k]
-                    val = split[k + 2]
-                    data[key] = val
-
-                select, writer = FUNCTION_MAP[data["type"]]
-                record = (
-                    select,
-                    writer,
-                    int(data["timestamp"]),
-                    data["name"],
-                    data["value"],
-                )
-            except (IndexError, KeyError):
-                # IndexError: missing field
-                # KeyError: Unknown type field
-                self.log.exception("Cannot parse: %s", line)
-            self.register(record)
-
-
 class MassDnsParser(Parser):
     # massdns --output Snrql
     # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
@@ -200,7 +172,6 @@ class MassDnsParser(Parser):
 
 
 PARSERS = {
-    "rapid7": Rapid7Parser,
     "massdns": MassDnsParser,
 }
 
diff --git a/import_rapid7.sh b/import_rapid7.sh
deleted file mode 100755
index 7c57e33..0000000
--- a/import_rapid7.sh
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env bash
-
-source .env.default
-source .env
-
-function log() {
-    echo -e "\033[33m$@\033[0m"
-}
-
-function api_call {
-    curl -s -H "X-Api-Key: $RAPID7_API_KEY" "https://us.api.insight.rapid7.com/opendata/studies/$1/"
-}
-
-function get_timestamp { # study, dataset
-    study="$1"
-    dataset="$2"
-    if [ -z "$RAPID7_API_KEY" ]
-    then
-        line=$(curl -s "https://opendata.rapid7.com/$study/" | grep "href=\".\+-$dataset.json.gz\"" | head -1)
-        echo "$line" | cut -d'"' -f2 | cut -d'/' -f3 | cut -d'-' -f4
-    else
-        filename=$(api_call "$study" | jq '.sonarfile_set[]' -r | grep "${dataset}.json.gz" | sort | tail -1)
-        echo $filename | cut -d'-' -f4
-    fi
-}
-
-function get_download_url { # study, dataset
-    study="$1"
-    dataset="$2"
-    if [ -z "$RAPID7_API_KEY" ]
-    then
-        line=$(curl -s "https://opendata.rapid7.com/$study/" | grep "href=\".\+-$dataset.json.gz\"" | head -1)
-        echo "https://opendata.rapid7.com$(echo "$line" | cut -d'"' -f2)"
-    else
-        filename=$(api_call "$study" | jq '.sonarfile_set[]' -r | grep "${dataset}.json.gz" | sort | tail -1)
-        api_call "$study/$filename/download" | jq '.url' -r
-    fi
-}
-
-function feed_rapid7 { # study, dataset, rule_file, ./feed_dns args
-    # The dataset will be imported if:
-    # none of this dataset was ever imported
-    #   or
-    # the last dataset imported is older than the one to be imported
-    #   or
-    # the rule_file is newer than when the last dataset was imported
-    #
-    # (note the difference between the age of the dataset itself and
-    # the date when it is imported)
-    study="$1"
-    dataset="$2"
-    rule_file="$3"
-    shift; shift; shift
-    new_ts="$(get_timestamp $study $dataset)"
-    old_ts_file="last_updates/rapid7_${study}_${dataset}.txt"
-    if [ -f "$old_ts_file" ]
-    then
-        old_ts=$(cat "$old_ts_file")
-    else
-        old_ts="0"
-    fi
-    if [ $new_ts -gt $old_ts ] || [ $rule_file -nt $old_ts_file ]
-    then
-        link="$(get_download_url $study $dataset)"
-        log "Reading $dataset dataset from $link ($old_ts -> $new_ts)…"
-        [ $SINGLE_PROCESS -eq 1 ] && EXTRA_ARGS="--single-process"
-        curl -L "$link" | gunzip | ./feed_dns.py rapid7 $@ $EXTRA_ARGS
-        if [ $? -eq 0 ]
-        then
-            echo $new_ts > $old_ts_file
-        fi
-    else
-        log "Skipping $dataset as there is no new version since $old_ts"
-    fi
-}
-
-# feed_rapid7 sonar.rdns_v2 rdns rules_asn/first-party.list
-feed_rapid7 sonar.fdns_v2 fdns_a rules_asn/first-party.list --ip4-cache "$CACHE_SIZE"
-# feed_rapid7 sonar.fdns_v2 fdns_aaaa rules_asn/first-party.list --ip6-cache "$CACHE_SIZE"
-feed_rapid7 sonar.fdns_v2 fdns_cname rules/first-party.list
-
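
--
A minimal sketch of the remaining import path, for reference: subdomains
(now including the last Rapid7-era list) are resolved with MassDNS, and
the resulting records are fed to the database through feed_dns.py's
"massdns" parser. The massdns flags are the ones quoted in
MassDnsParser's own comment; the file names and the exact invocation
below are assumptions, not the repository's actual scripts.

    # Resolve the collected subdomains (flags per MassDnsParser's comment).
    # subdomains.list and resolved.txt are placeholder file names.
    massdns --output Snrql \
        --retry REFUSED,SERVFAIL \
        --resolvers nameservers-ipv4 \
        --hashmap-size "$MASSDNS_HASHMAP_SIZE" \
        --outfile resolved.txt \
        subdomains.list

    # Feed the resolved records in, using the parser kept in feed_dns.py.
    ./feed_dns.py massdns < resolved.txt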