diff --git a/collect_subdomains.sh b/collect_subdomains.sh index bd4741a..4a95d98 100755 --- a/collect_subdomains.sh +++ b/collect_subdomains.sh @@ -1,5 +1,9 @@ #!/usr/bin/env bash +function log() { + echo -e "\033[33m$@\033[0m" +} + # Get all subdomains accessed by each website in the website list cat websites/*.list | sort -u > temp/all_websites.list diff --git a/fetch_resources.sh b/fetch_resources.sh index 91b8c04..01121d8 100755 --- a/fetch_resources.sh +++ b/fetch_resources.sh @@ -1,7 +1,11 @@ #!/usr/bin/env bash +function log() { + echo -e "\033[33m$@\033[0m" +} + function dl() { - echo "Downloading $1 to $2..." + echo "Downloading $1 to $2…" curl --silent "$1" > "$2" if [ $? -ne 0 ] then @@ -9,7 +13,8 @@ function dl() { fi } -echo "Retrieving rules..." > /dev/stderr + +log "Retrieving rules…" rm -f rules*/*.cache.* dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt # From firebog.net Tracking & Telemetry Lists @@ -25,7 +30,7 @@ dl https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hos # dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV.txt rules_hosts/smart-tv.cache.txt # dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/AmazonFireTV.txt rules_hosts/amazon-fire-tv.cache.txt -echo "Retrieving nameservers..." > /dev/stderr +log "Retrieving nameservers…" rm -f nameservers touch nameservers [ -f nameservers.head ] && cat nameservers.head >> nameservers @@ -33,7 +38,7 @@ dl https://public-dns.info/nameservers.txt nameservers.temp sort -R nameservers.temp >> nameservers rm nameservers.temp -echo "Retrieving top subdomains..." > /dev/stderr +log "Retrieving top subdomains…" dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip unzip top-1m.csv.zip sed 's|^[0-9]\+,||' top-1m.csv > temp/cisco-umbrella_popularity.fresh.list diff --git a/filter_subdomains.sh b/filter_subdomains.sh index 5f0de0e..9a09b9a 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -1,5 +1,9 @@ #!/usr/bin/env bash +function log() { + echo -e "\033[33m$@\033[0m" +} + if [ ! -f temp/all_resolved.csv ] then echo "Run ./resolve_subdomains.sh first!" @@ -7,7 +11,7 @@ then fi # Gather all the rules for filtering -echo "Compiling rules..." > /dev/stderr +log "Compiling rules…" cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt ./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list @@ -16,19 +20,19 @@ cat rules/first-party.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ru cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_multi.txt cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_first.txt -echo "Filtering first-party tracking domains..." > /dev/stderr +log "Filtering first-party tracking domains…" ./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt -echo "Filtering first-party curated tracking domains..." > /dev/stderr +log "Filtering first-party curated tracking domains…" ./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt -echo "Filtering multi-party tracking domains..." > /dev/stderr +log "Filtering multi-party tracking domains…" ./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list sort -u temp/multiparty-trackers.list > dist/multiparty-trackers.txt -echo "Filtering multi-party curated tracking domains..." > /dev/stderr +log "Filtering multi-party curated tracking domains…" ./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list sort -u temp/multiparty-only-trackers.list > dist/multiparty-only-trackers.txt diff --git a/resolve_subdomains.sh b/resolve_subdomains.sh index 40d5e07..ed7af79 100755 --- a/resolve_subdomains.sh +++ b/resolve_subdomains.sh @@ -1,7 +1,11 @@ #!/usr/bin/env bash +function log() { + echo -e "\033[33m$@\033[0m" +} + # Resolve the CNAME chain of all the known subdomains for later analysis -echo "Compiling subdomain lists..." > /dev/stderr +log "Compiling subdomain lists..." pv subdomains/*.list | sort -u > temp/all_subdomains.list # Sort by last character to utilize the DNS server caching mechanism pv temp/all_subdomains.list | rev | sort | rev > temp/all_subdomains_reversort.list