eulaurarien/filter_subdomains.sh

31 lines
1.4 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Build the first-party tracker blocklists.
#
# Reads:  subdomains/*.list, temp/all_websites.list
# Writes: temp/all_subdomains.list, temp/all_toblock.list,
#         dist/firstparty-trackers.txt       (one hostname per line)
#         dist/firstparty-trackers-hosts.txt (hosts-file format)
#
# All paths are relative, so run this from the directory that contains
# filter_subdomains.py, filter_out_explicit.py and the regexes module.

# Stop at the first failing step so a broken run does not leave a
# truncated list in dist/.
set -euo pipefail

# Print the number of lines read from stdin as a bare integer.
# Reading from stdin keeps wc from appending a file name; stripping blanks
# removes the left padding that some wc implementations emit.
count_lines() {
  wc -l | tr -d '[:blank:]'
}

# Filter out the subdomains not pointing to a first-party tracker
sort -u subdomains/*.list > temp/all_subdomains.list
./filter_subdomains.py --input temp/all_subdomains.list --output temp/all_toblock.list
sort -u temp/all_toblock.list > dist/firstparty-trackers.txt

# Format the blocklist so it can be used as a hostlist: a commented header
# with generation statistics, a blank line, then one "0.0.0.0 <host>" entry
# per blocked subdomain.
{
  cat <<EOF
# First-party trackers host list
#
# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker
# Source code: https://git.frogeye.fr/geoffrey/eulaurarien
# Latest version of this list: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt
#
# Generation date: $(date -Isec)
# Generation version: eulaurarien $(git describe --tags)
# Number of source websites: $(count_lines < temp/all_websites.list)
# Number of source subdomains: $(count_lines < temp/all_subdomains.list)
# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')
# Number of blocked subdomains: $(count_lines < dist/firstparty-trackers.txt)
# Number of first-party subdomains: $(./filter_out_explicit.py dist/firstparty-trackers.txt | count_lines)

EOF
  # One sed pass instead of a shell read loop: faster on large lists and
  # does not interpret backslashes or trim whitespace.
  sed 's/^/0.0.0.0 /' dist/firstparty-trackers.txt
} > dist/firstparty-trackers-hosts.txt