eulaurarien/filter_subdomains.sh
Geoffrey Frogeye c23004fbff
Separated DNS resolution from filtering
This effectively removes the parallelism of filtering,
which doubles the processing time (5->8 hours),
but this allows me to toy around with the performance of this step,
which I aim to improve drastically.
2019-12-02 19:03:08 +01:00

46 lines
2.4 KiB
Bash
Executable file

#!/usr/bin/env bash
# Resolve the CNAME chain of all the known subdomains for later analysis.
# sort reads the lists itself instead of receiving them through cat, so a list
# lacking a final newline cannot have its last entry glued to the first entry
# of the next list.
sort -u subdomains/*.list > temp/all_subdomains.list
# Stop here if the resolver fails: carrying on would filter and publish
# incomplete or stale resolution data.
./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv || {
  echo "resolve_subdomains.py failed, aborting" >&2
  exit 1
}
# Sort and deduplicate the resolved records before they are filtered
sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv
# Filter out the subdomains not pointing to a first-party tracker.
# Merge every rule file into one sorted, deduplicated list, dropping the lines
# starting with '!' (comments in Adblock-style lists) and the '[Adblock' header.
# grep reads the files directly (-h: no file name prefix), so a rule file
# lacking a final newline cannot have its last rule merged with the first line
# of the next file, as happened when the files were concatenated with cat.
grep -h -v -e '^!' -e '^\[Adblock' rules/*.txt | sort -u > temp/all_rules.txt
# Full list. Abort on failure rather than publishing a truncated blocklist.
./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_resolved_sorted.csv \
  --output dist/firstparty-trackers.txt || {
  echo "filter_subdomains.py failed, aborting" >&2
  exit 1
}
# Same rules and input, run with --no-explicit, written to the "only" list
./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_resolved_sorted.csv \
  --no-explicit --output dist/firstparty-only-trackers.txt || {
  echo "filter_subdomains.py --no-explicit failed, aborting" >&2
  exit 1
}
#######################################
# Format a blocklist so it can be used as a hostlist.
#
# Reads dist/<basename>.txt (one hostname per line) and writes
# dist/<basename>-hosts.txt: a commented header with generation statistics,
# an empty line, then one "0.0.0.0 <hostname>" entry per input line.
#
# Arguments:
#   $1 - base name of the list inside dist/, without the .txt extension
#   $2 - one-line description printed in the header
# Outputs:
#   Writes dist/<basename>-hosts.txt; diagnostics go to stderr.
# Returns:
#   1, leaving any existing hosts file untouched, if the input list cannot
#   be read.
#######################################
function generate_hosts {
  # Keep every working variable local so nothing leaks into the caller.
  local basename="$1"
  local description="$2"
  local list="dist/${basename}.txt"
  local host

  # Refuse to replace a previously generated hosts file with a header-only one.
  if [[ ! -r "$list" ]]; then
    echo "generate_hosts: cannot read $list" >&2
    return 1
  fi

  (
    # Defined inside the subshell so the helper is not visible to the caller.
    # Prints the number of lines of file $1 (nothing if it cannot be opened).
    # Reading from stdin keeps the file name out of wc's output, and tr removes
    # the padding some wc implementations put in front of the count.
    count_lines() {
      wc -l < "$1" | tr -d '[:space:]'
    }

    echo "# First-party trackers host list"
    echo "# $description"
    echo "#"
    echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
    echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
    echo "#"
    echo "# Latest version:"
    echo "# - With third-party trackers: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
    echo "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
    echo "#"
    echo "# Generation date: $(date -Isec)"
    echo "# Generation version: eulaurarien $(git describe --tags)"
    echo "# Number of source websites: $(count_lines temp/all_websites.list)"
    echo "# Number of source subdomains: $(count_lines temp/all_subdomains.list)"
    echo "# Number of trackers identification rules : $(count_lines temp/all_rules.txt)"
    echo "# Number of tracker subdomains: $(count_lines dist/firstparty-trackers.txt)"
    echo "# Number of first-party subdomains: $(count_lines dist/firstparty-only-trackers.txt)"
    echo
    # -r keeps backslashes literal; the "|| [[ -n ... ]]" clause still emits a
    # final line that is not terminated by a newline. Surrounding whitespace is
    # trimmed by read, as before.
    while read -r host || [[ -n "$host" ]]; do
      printf '0.0.0.0 %s\n' "$host"
    done < "$list"
  ) > "dist/${basename}-hosts.txt"
}
# Generate both host lists: the complete one first, then the first-party-only one
generate_hosts 'firstparty-trackers' \
  'Also contains trackers used in third-party'
generate_hosts 'firstparty-only-trackers' \
  'Do not contain trackers used in third-party. Use in conjuction with EasyPrivacy.'