From 3b6f7a58b36312d6849512a84ecabe8429327da1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?=
Date: Sun, 13 Nov 2022 20:10:27 +0100
Subject: [PATCH] Remove support for Rapid7

They changed their privacy/pricing model, and as such I no longer have
access to their massive DNS dataset, even after asking.

Since 2022-01-02, I kept the list frozen while looking for an
alternative, but couldn't find any.

To make the list update again with the remaining DNS sources I have, I
put the last version of the list generated with the Rapid7 dataset as
an input for subdomains, which will now be resolved with MassDNS.
---
 .env.default     |  1 -
 README.md        | 19 +-----------
 dist/README.md   |  1 -
 eulaurarien.sh   |  1 -
 export_lists.sh  |  2 +-
 feed_dns.py      | 29 -----------------
 import_rapid7.sh | 81 ------------------------------------------------
 7 files changed, 2 insertions(+), 132 deletions(-)
 delete mode 100755 import_rapid7.sh

diff --git a/.env.default b/.env.default
index d9e9f13..e96ff6b 100644
--- a/.env.default
+++ b/.env.default
@@ -1,4 +1,3 @@
-RAPID7_API_KEY=
 CACHE_SIZE=536870912
 MASSDNS_HASHMAP_SIZE=1000
 PROFILE=0
diff --git a/README.md b/README.md
index 0985a9d..1689217 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ This program takes as input:
 
 It will be able to output hostnames being a DNS redirection to any item in the lists provided.
 
-DNS records can either come from [Rapid7 Open Data Sets](https://opendata.rapid7.com/sonar.fdns_v2/) or can be locally resolved from a list of subdomains using [MassDNS](https://github.com/blechschmidt/massdns).
+DNS records can be locally resolved from a list of subdomains using [MassDNS](https://github.com/blechschmidt/massdns).
 
 Those subdomains can either be provided as is, come from [Cisco Umbrella Popularity List](http://s3-us-west-1.amazonaws.com/umbrella-static/index.html), from your browsing history, or from analyzing the traffic a web browser makes when opening an URL (the program provides utility to do all that).
 
@@ -41,7 +41,6 @@ Depending on the sources you'll be using to generate the list, you'll need to in
 - [coloredlogs](https://pypi.org/project/coloredlogs/) (sorry I can't help myself)
 - [numpy](https://www.numpy.org/)
 - [python-abp](https://pypi.org/project/python-abp/) (only if you intend to use AdBlock rules as a rule source)
-- [jq](http://stedolan.github.io/jq/) (only if you have a Rapid7 API key)
 - [massdns](https://github.com/blechschmidt/massdns) in your `$PATH` (only if you have subdomains as a source)
 - [Firefox](https://www.mozilla.org/firefox/) (only if you have websites as a source)
 - [selenium (Python bindings)](https://pypi.python.org/pypi/selenium) (only if you have websites as a source)
@@ -135,22 +134,6 @@ Note that this is a network intensive process, not in term of bandwith, but in t
 The DNS records will automatically be imported into the database.
 If you want to re-import the records without re-doing the resolving, just run the last line of the `./resolve_subdomains.sh` script.
 
-### Import DNS records from Rapid7
-
-If you have a Rapid7 Organization API key, make sure to append to `.env`:
-
-```
-RAPID7_API_KEY=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
-```
-
-Then, run `./import_rapid7.sh`.
-This will download about 35 GiB of data the first time, but only the matching records will be stored (about a few MiB for the tracking rules).
-Note the download speed will most likely be limited by the database operation throughput (a quick RAM will help).
-
-The script remembers which were the last sets downloaded, and will only download newer sets.
-If the first-party rules changed, the corresponding sets will be re-imported anyway.
-If you want to force re-importing, run `rm last_updates/rapid7_*.txt`.
-
 ### Export the lists
 
 For the tracking list, use `./export_lists.sh`, the output will be in the `dist` folder (please change the links before distributing them).
diff --git a/dist/README.md b/dist/README.md
index 9a2d0e3..c034b0e 100644
--- a/dist/README.md
+++ b/dist/README.md
@@ -102,7 +102,6 @@ Some of the first-party tracker included in this list have been found by:
 
 The list was generated using data from
 
-- [Rapid7 OpenData](https://opendata.rapid7.com/sonar.fdns_v2/), who kindly provided a free account
 - [Cisco Umbrella Popularity List](http://s3-us-west-1.amazonaws.com/umbrella-static/index.html)
 - [Public DNS Server List](https://public-dns.info/)
 
diff --git a/eulaurarien.sh b/eulaurarien.sh
index cb8aba3..c6ef23d 100755
--- a/eulaurarien.sh
+++ b/eulaurarien.sh
@@ -8,7 +8,6 @@
 ./collect_subdomains.sh
 ./import_rules.sh
 ./resolve_subdomains.sh
-./import_rapid7.sh
 ./prune.sh
 ./export_lists.sh
 ./generate_index.py
diff --git a/export_lists.sh b/export_lists.sh
index b294a0d..119b544 100755
--- a/export_lists.sh
+++ b/export_lists.sh
@@ -76,7 +76,7 @@ do
     echo "# Oldest record: $oldest_date"
     echo "# Number of source websites: $number_websites"
     echo "# Number of source subdomains: $number_subdomains"
-    echo "# Number of source DNS records: ~2E9 + $number_dns"
+    echo "# Number of source DNS records: $number_dns"
     echo "#"
     echo "# Input rules: $rules_input"
     echo "# Subsequent rules: $rules_found"
diff --git a/feed_dns.py b/feed_dns.py
index 41368a1..cbb5ecf 100755
--- a/feed_dns.py
+++ b/feed_dns.py
@@ -130,34 +130,6 @@ class Parser:
         raise NotImplementedError
 
 
-class Rapid7Parser(Parser):
-    def consume(self) -> None:
-        data = dict()
-        for line in self.buf:
-            self.prof.enter_step("parse_rapid7")
-            split = line.split('"')
-
-            try:
-                for k in range(1, 14, 4):
-                    key = split[k]
-                    val = split[k + 2]
-                    data[key] = val
-
-                select, writer = FUNCTION_MAP[data["type"]]
-                record = (
-                    select,
-                    writer,
-                    int(data["timestamp"]),
-                    data["name"],
-                    data["value"],
-                )
-            except (IndexError, KeyError):
-                # IndexError: missing field
-                # KeyError: Unknown type field
-                self.log.exception("Cannot parse: %s", line)
-            self.register(record)
-
-
 class MassDnsParser(Parser):
     # massdns --output Snrql
     # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
@@ -200,7 +172,6 @@ class MassDnsParser(Parser):
 
 
 PARSERS = {
-    "rapid7": Rapid7Parser,
     "massdns": MassDnsParser,
 }
 
diff --git a/import_rapid7.sh b/import_rapid7.sh
deleted file mode 100755
index 7c57e33..0000000
--- a/import_rapid7.sh
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env bash
-
-source .env.default
-source .env
-
-function log() {
-    echo -e "\033[33m$@\033[0m"
-}
-
-function api_call {
-    curl -s -H "X-Api-Key: $RAPID7_API_KEY" "https://us.api.insight.rapid7.com/opendata/studies/$1/"
-}
-
-function get_timestamp { # study, dataset
-    study="$1"
-    dataset="$2"
-    if [ -z "$RAPID7_API_KEY" ]
-    then
-        line=$(curl -s "https://opendata.rapid7.com/$study/" | grep "href=\".\+-$dataset.json.gz\"" | head -1)
-        echo "$line" | cut -d'"' -f2 | cut -d'/' -f3 | cut -d'-' -f4
-    else
-        filename=$(api_call "$study" | jq '.sonarfile_set[]' -r | grep "${dataset}.json.gz" | sort | tail -1)
-        echo $filename | cut -d'-' -f4
-    fi
-}
-
-function get_download_url { # study, dataset
-    study="$1"
-    dataset="$2"
-    if [ -z "$RAPID7_API_KEY" ]
-    then
-        line=$(curl -s "https://opendata.rapid7.com/$study/" | grep "href=\".\+-$dataset.json.gz\"" | head -1)
-        echo "https://opendata.rapid7.com$(echo "$line" | cut -d'"' -f2)"
-    else
-        filename=$(api_call "$study" | jq '.sonarfile_set[]' -r | grep "${dataset}.json.gz" | sort | tail -1)
-        api_call "$study/$filename/download" | jq '.url' -r
-    fi
-}
-
-function feed_rapid7 { # study, dataset, rule_file, ./feed_dns args
-    # The dataset will be imported if:
-    # none of this dataset was ever imported
-    #   or
-    # the last dataset imported is older than the one to be imported
-    #   or
-    # the rule_file is newer than when the last dataset was imported
-    #
-    # (note the difference between the age of the dataset itself and
-    # the date when it is imported)
-    study="$1"
-    dataset="$2"
-    rule_file="$3"
-    shift; shift; shift
-    new_ts="$(get_timestamp $study $dataset)"
-    old_ts_file="last_updates/rapid7_${study}_${dataset}.txt"
-    if [ -f "$old_ts_file" ]
-    then
-        old_ts=$(cat "$old_ts_file")
-    else
-        old_ts="0"
-    fi
-    if [ $new_ts -gt $old_ts ] || [ $rule_file -nt $old_ts_file ]
-    then
-        link="$(get_download_url $study $dataset)"
-        log "Reading $dataset dataset from $link ($old_ts -> $new_ts)…"
-        [ $SINGLE_PROCESS -eq 1 ] && EXTRA_ARGS="--single-process"
-        curl -L "$link" | gunzip | ./feed_dns.py rapid7 $@ $EXTRA_ARGS
-        if [ $? -eq 0 ]
-        then
-            echo $new_ts > $old_ts_file
-        fi
-    else
-        log "Skipping $dataset as there is no new version since $old_ts"
-    fi
-}
-
-# feed_rapid7 sonar.rdns_v2 rdns rules_asn/first-party.list
-feed_rapid7 sonar.fdns_v2 fdns_a rules_asn/first-party.list --ip4-cache "$CACHE_SIZE"
-# feed_rapid7 sonar.fdns_v2 fdns_aaaa rules_asn/first-party.list --ip6-cache "$CACHE_SIZE"
-feed_rapid7 sonar.fdns_v2 fdns_cname rules/first-party.list
-
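
--
A minimal sketch of the remaining import path, for reference: subdomains
(now including the last Rapid7-era list) are resolved with MassDNS, and
the resulting records are fed to the database through feed_dns.py's
"massdns" parser. The massdns flags are the ones quoted in
MassDnsParser's own comment; the file names and the exact invocation
below are assumptions, not the repository's actual scripts.

    # Resolve the collected subdomains (flags per MassDnsParser's comment).
    # subdomains.list and resolved.txt are placeholder file names.
    massdns --output Snrql \
        --retry REFUSED,SERVFAIL \
        --resolvers nameservers-ipv4 \
        --hashmap-size "$MASSDNS_HASHMAP_SIZE" \
        --outfile resolved.txt \
        subdomains.list

    # Feed the resolved records in, using the parser kept in feed_dns.py.
    ./feed_dns.py massdns < resolved.txt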