@ -4,69 +4,94 @@ function log() {
echo -e " \033[33m $@ \033[0m "
}
# --- Export pass: write the four raw tracker lists into dist/ ----------------
# (`--no-dupplicates` is spelled the way export.py is invoked throughout this
# script; do not "correct" it here.)
log "Exporting lists…"
./export.py --first-party --output dist/firstparty-trackers.txt
./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt
./export.py --output dist/multiparty-trackers.txt
./export.py --end-chain --no-dupplicates --output dist/multiparty-only-trackers.txt

# --- Values computed once and reused in the generated headers ----------------
# A shell assignment takes no spaces around `=`; with spaces the shell would
# try to run a command named after the variable instead.
log "Calculating statistics…"
gen_date=$(date -Isec)
gen_software=$(git describe --tags)
number_websites=$(wc -l < temp/all_websites.list)
number_subdomains=$(wc -l < temp/all_subdomains.list)
# Counts the empty lines of temp/all_resolved.txt. The pipeline is kept as-is
# so that the command's exit status stays that of `wc`.
number_dns=$(grep '^$' temp/all_resolved.txt | wc -l)

# --- Record/rule counts stored in temp/ for later use ------------------------
log "Generating statistics…"
./export.py --count --first-party > temp/count_recs_firstparty.txt
./export.py --count > temp/count_recs_multiparty.txt
./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
./export.py --rules --count > temp/count_rules_multiparty.txt

# --- Per-partyness pass (this loop is closed further down the script) --------
for partyness in first multi
do
    # Only the "first" variant restricts export.py to first-party trackers.
    if [ "$partyness" = "first" ]
    then
        partyness_flags="--first-party"
    else
        partyness_flags=""
    fi

    # De-duplicate and sort each raw list in place.
    log "Sorting lists…"
    sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt
    sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt
    sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt
    sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt

    # $partyness_flags is intentionally left unquoted below: when it is empty
    # it must expand to no argument at all rather than to an empty string.
    # shellcheck disable=SC2086
    {
        echo "Statistics for ${partyness}-party trackers"
        echo "Input rules: $(./export.py --count --base-rules $partyness_flags)"
        echo "Subsequent rules: $(./export.py --count --rules $partyness_flags)"
        echo "Subsequent rules (no dupplicate): $(./export.py --count --rules --no-dupplicates $partyness_flags)"
        echo "Output hostnames: $(./export.py --count $partyness_flags)"
        echo "Output hostnames (no dupplicate): $(./export.py --count --no-dupplicates $partyness_flags)"
        echo "Output hostnames (end-chain only): $(./export.py --count --end-chain $partyness_flags)"
        echo "Output hostnames (no dupplicate, end-chain only): $(./export.py --count --no-dupplicates --end-chain $partyness_flags)"
        echo
    }

    log "Generating hosts lists…"
# generate_hosts BASENAME DESCRIPTION DESCRIPTION2
#
# Emits a hosts-style file: a block of `#` header lines (the two description
# lines, project links, generation metadata and tracker counts) followed by
# every line of the raw list prefixed with "0.0.0.0 ". Everything printed
# inside the subshell is redirected to a dist/ path built from BASENAME with a
# `-hosts.txt` suffix.
#
# Arguments:
#   $1 - base name of the list (the raw list is read from a dist/ path built
#        from it with a `.txt` suffix)
#   $2 - first description line written in the header
#   $3 - second description line written in the header
#
# NOTE(review): as shown here the text has spaces around `=`, inside `${ }`,
# inside `{ ...}` and padding inside the quoted paths. A shell would not treat
# these as assignments / valid expansions / the intended file names. This looks
# like an extraction artifact - confirm against the repository copy.
# NOTE(review): the `for trackerness ... do` opened below is never closed with
# `done` before the function's closing `}`; those lines appear to belong to a
# different revision of the script - confirm before relying on this function.
function generate_hosts {
# Positional arguments captured under descriptive names.
basename = " $1 "
description = " $2 "
description2 = " $3 "
# Chooses export flags and list/hosts file names per "trackerness" variant.
# These read $partyness, which is not set inside this function.
for trackerness in { trackers,only-trackers}
do
if [ $trackerness = "trackers" ]
then
trackerness_flags = ""
else
trackerness_flags = "--end-chain --no-dupplicates"
fi
file_list = " dist/ ${ partyness } party- ${ trackerness } .txt "
file_host = " dist/ ${ partyness } party- ${ trackerness } -hosts.txt "
# Subshell so that the whole header + body can be redirected at once.
(
echo "# First-party trackers host list"
echo " # $description "
echo " # $description2 "
echo "#"
echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
echo "#"
echo "# In case of false positives/negatives, or any other question,"
echo "# contact me the way you like: https://geoffrey.frogeye.fr"
echo "#"
echo "# Latest version:"
echo "# - First-party trackers : https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
echo "# - … excluding redirected: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
echo "# - First and third party : https://hostfiles.frogeye.fr/multiparty-trackers-hosts.txt"
echo "# - … excluding redirected: https://hostfiles.frogeye.fr/multiparty-only-trackers-hosts.txt"
echo '# (you can remove `-hosts` to get the raw list)'
echo "#"
# Generation metadata is recomputed on every call (date, git tag, line counts).
echo " # Generation date: $( date -Isec) "
echo " # Generation software: eulaurarien $( git describe --tags) "
echo " # Number of source websites: $( wc -l temp/all_websites.list | cut -d' ' -f1) "
echo " # Number of source subdomains: $( wc -l temp/all_subdomains.list | cut -d' ' -f1) "
echo " # Number of source DNS records: ~2E9 + $( wc -l temp/all_resolved.json | cut -d' ' -f1) " # TODO
echo "#"
# Counts are read back from the temp/count_*.txt files and from the line
# counts of the raw lists in dist/.
echo " # Known first-party trackers: $( cat temp/count_rules_firstparty.txt) "
echo " # Found first-party trackers: $( cat temp/count_recs_firstparty.txt) "
echo " # Number of first-party hostnames: $( wc -l dist/firstparty-trackers.txt | cut -d' ' -f1) "
echo " # … excluding redirected: $( wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1) "
echo "#"
echo " # Known multi-party trackers: $( cat temp/count_rules_multiparty.txt) "
echo " # Found multi-party trackers: $( cat temp/count_recs_multiparty.txt) "
echo " # Number of multi-party hostnames: $( wc -l dist/multiparty-trackers.txt | cut -d' ' -f1) "
echo " # … excluding redirected: $( wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1) "
echo
# Body of the hosts file: prefix every raw-list line with "0.0.0.0 ".
sed 's|^|0.0.0.0 |' " dist/ $basename .txt "
) > " dist/ $basename -hosts.txt "
}
# Export one (partyness, trackerness) variant and gather its rule counts.
# Reads:  partyness, trackerness, partyness_flags, trackerness_flags, file_list
# Writes: the raw list at $file_list; rules_input, rules_found, rules_output
log "Generating lists for variant ${partyness}-party ${trackerness}…"

# The *_flags variables are intentionally left unquoted in this section: each
# may be empty (must expand to nothing) or hold several space-separated
# options (must split into separate arguments).
# Real export heeere
# shellcheck disable=SC2086
./export.py $partyness_flags $trackerness_flags > "$file_list"
# Sometimes a bit heavy to have the DB open and sort the output
# so this is done in two steps
sort -u "$file_list" -o "$file_list"

# Counts shown in the generated header. A shell assignment takes no spaces
# around `=`; with spaces the shell would run a command named after the
# variable instead of assigning it.
# shellcheck disable=SC2086
rules_input=$(./export.py --count --base-rules $partyness_flags)
# shellcheck disable=SC2086
rules_found=$(./export.py --count --rules $partyness_flags)
# shellcheck disable=SC2086
rules_output=$(./export.py --count $partyness_flags $trackerness_flags)
# Print the public URL of one hosts-file variant.
#
# Globals:   partyness, trackerness (read) - the variant currently being
#            generated; used only to decide whether to add the marker.
# Arguments: $1 - partyness of the variant to link to (e.g. first, multi)
#            $2 - trackerness of the variant to link to
#                 (e.g. trackers, only-trackers)
# Outputs:   the URL on stdout, followed by " (this one)" when the requested
#            variant is the one currently being generated.
function link() { # link partyness, link trackerness
    local url
    # Build the URL from the *requested* variant ($1/$2). Using the globals
    # here would make every link point at the variant being generated.
    url="https://hostfiles.frogeye.fr/${1}party-${2}-hosts.txt"
    if [ "$1" = "$partyness" ] && [ "$2" = "$trackerness" ]
    then
        url="$url (this one)"
    fi
    printf '%s\n' "$url"
}
# Write the hosts file for the current variant: a `#` comment header followed
# by every line of the raw list prefixed with "0.0.0.0 ".
# Reads: partyness, trackerness, gen_date, gen_software, number_websites,
#        number_subdomains, number_dns, rules_input, rules_found, rules_output,
#        file_list, file_host - all set earlier in the script.
# The subshell lets the whole header + body be redirected in one go.
(
    # Every header line starts with `#` in column 0, with no padding inside
    # the quotes, so the variable-bearing lines match the fixed ones.
    echo "# First-party trackers host list"
    echo "# Variant: ${partyness}-party ${trackerness}"
    echo "#"
    echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
    echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
    echo "#"
    echo "# In case of false positives/negatives, or any other question,"
    echo "# contact me the way you like: https://geoffrey.frogeye.fr"
    echo "#"
    echo "# Latest versions:"
    echo "# - First-party trackers : $(link first trackers)"
    echo "# - … excluding redirected: $(link first only-trackers)"
    echo "# - First and third party : $(link multi trackers)"
    echo "# - … excluding redirected: $(link multi only-trackers)"
    echo '# (you can remove `-hosts` to get the raw list)'
    echo "#"
    echo "# Generation date: $gen_date"
    echo "# Generation software: eulaurarien $gen_software"
    echo "# Number of source websites: $number_websites"
    echo "# Number of source subdomains: $number_subdomains"
    echo "# Number of source DNS records: ~2E9 + $number_dns"
    echo "#"
    echo "# Input rules: $rules_input"
    echo "# Subsequent rules: $rules_found"
    echo "# Output rules: $rules_output"
    echo "#"
    echo
    # Paths are quoted without padding: a leading/trailing space inside the
    # quotes would be part of the file name.
    sed 's|^|0.0.0.0 |' "$file_list"
) > "$file_host"

# Fixed-name hosts files produced through generate_hosts
# (arguments: base name, description line, second description line).
generate_hosts "firstparty-trackers" "Generated from a curated list of first-party trackers" ""
generate_hosts "firstparty-only-trackers" "Generated from a curated list of first-party trackers" "Only contain the first chain of redirection."
generate_hosts "multiparty-trackers" "Generated from known third-party trackers." "Also contains trackers used as third-party."
generate_hosts "multiparty-only-trackers" "Generated from known third-party trackers." "Do not contain trackers used in third-party. Use in combination with third-party lists."

# Close the two loops opened earlier in the script.
done
done