eulaurarien/filter_subdomains.sh

58 lines
2.8 KiB
Bash
Executable File

#!/usr/bin/env bash
if [ ! -f temp/all_resolved.csv ]
then
echo "Run ./resolve_subdomains.sh first!"
exit 1
fi
# Gather all the rules for filtering
echo "Compiling rules..." > /dev/stderr
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt
./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules.list
# Filter out the subdomains not pointing to a first-party tracker
echo "Filtering tracking domains..." > /dev/stderr
./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
echo "Filtering first-party only tracking domains..." > /dev/stderr
./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt
# Format the blocklist so it can be used as a hostlist
function generate_hosts {
basename="$1"
description="$2"
(
echo "# First-party trackers host list"
echo "# $description"
echo "#"
echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
echo "#"
echo "# Latest version:"
echo "# - With third-party trackers: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
echo "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
echo "#"
echo "# Generation date: $(date -Isec)"
echo "# Generation software: eulaurarien $(git describe --tags)"
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of trackers identification rules : $(wc -l temp/all_rules.list | cut -d' ' -f1)"
echo "# Number of tracker subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# Number of first-party subdomains: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
echo
cat "dist/$basename.txt" | while read host;
do
echo "0.0.0.0 $host"
done
) > "dist/$basename-hosts.txt"
}
generate_hosts "firstparty-trackers" "Also contains trackers used as third-party."
generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in combination with third-party lists."