eulaurarien/filter_subdomains.sh
Geoffrey Frogeye 57416b6e2c
Workflow: POO and individual tables per types
Mostly for performances reasons.
First one to implement threading later.
Second one to speed up the dichotomy,
but it doesn't seem that much better so far.
2019-12-13 00:11:21 +01:00

63 lines
3 KiB
Bash
Executable file

#!/usr/bin/env bash
function log() {
echo -e "\033[33m$@\033[0m"
}
log "Updating references…"
./database.py --references
log "Exporting lists…"
./export.py --first-party | sort -u > dist/firstparty-trackers.txt
./export.py --first-party --end-chain | sort -u > dist/firstparty-only-trackers.txt
./export.py | sort -u > dist/multiparty-trackers.txt
./export.py --end-chain | sort -u > dist/multiparty-only-trackers.txt
# Format the blocklist so it can be used as a hostlist
function generate_hosts {
basename="$1"
description="$2"
description2="$3"
(
echo "# First-party trackers host list"
echo "# $description"
echo "# $description2"
echo "#"
echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
echo "#"
echo "# In case of false positives/negatives, or any other question,"
echo "# contact me the way you like: https://geoffrey.frogeye.fr"
echo "#"
echo "# Latest version:"
echo "# - First-party trackers : https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
echo "# - … excluding redirected: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
echo "# - First and third party : https://hostfiles.frogeye.fr/multiparty-trackers-hosts.txt"
echo "# - … excluding redirected: https://hostfiles.frogeye.fr/multiparty-only-trackers-hosts.txt"
echo "#"
echo "# Generation date: $(date -Isec)"
echo "# Generation software: eulaurarien $(git describe --tags)"
echo "# Number of source websites: TODO"
echo "# Number of source subdomains: TODO"
echo "#"
echo "# Number of known first-party trackers: TODO"
echo "# Number of first-party subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
echo "#"
echo "# Number of known multi-party trackers: TODO"
echo "# Number of multi-party subdomains: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
echo
cat "dist/$basename.txt" | while read host;
do
echo "0.0.0.0 $host"
done
) > "dist/$basename-hosts.txt"
}
generate_hosts "firstparty-trackers" "Generated from a curated list of first-party trackers" ""
generate_hosts "firstparty-only-trackers" "Generated from a curated list of first-party trackers" "Only contain the first chain of redirection."
generate_hosts "multiparty-trackers" "Generated from known third-party trackers." "Also contains trackers used as third-party."
generate_hosts "multiparty-only-trackers" "Generated from known third-party trackers." "Do not contain trackers used in third-party. Use in combination with third-party lists."