Generates a host list of first-party trackers for ad-blocking.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

97 lines
3.9 KiB

  1. #!/usr/bin/env bash
  2. function log() {
  3. echo -e "\033[33m$@\033[0m"
  4. }
  5. log "Calculating statistics…"
  6. gen_date=$(date -Isec)
  7. gen_software=$(git describe --tags)
  8. number_websites=$(wc -l < temp/all_websites.list)
  9. number_subdomains=$(wc -l < temp/all_subdomains.list)
  10. number_dns=$(grep '^$' temp/all_resolved.txt | wc -l)
  11. for partyness in {first,multi}
  12. do
  13. if [ $partyness = "first" ]
  14. then
  15. partyness_flags="--first-party"
  16. else
  17. partyness_flags=""
  18. fi
  19. echo "Statistics for ${partyness}-party trackers"
  20. echo "Input rules: $(./export.py --count --base-rules $partyness_flags)"
  21. echo "Subsequent rules: $(./export.py --count --rules $partyness_flags)"
  22. echo "Subsequent rules (no dupplicate): $(./export.py --count --rules --no-dupplicates $partyness_flags)"
  23. echo "Output hostnames: $(./export.py --count $partyness_flags)"
  24. echo "Output hostnames (no dupplicate): $(./export.py --count --no-dupplicates $partyness_flags)"
  25. echo "Output hostnames (end-chain only): $(./export.py --count --end-chain $partyness_flags)"
  26. echo "Output hostnames (no dupplicate, end-chain only): $(./export.py --count --no-dupplicates --end-chain $partyness_flags)"
  27. echo
  28. for trackerness in {trackers,only-trackers}
  29. do
  30. if [ $trackerness = "trackers" ]
  31. then
  32. trackerness_flags=""
  33. else
  34. trackerness_flags="--end-chain --no-dupplicates"
  35. fi
  36. file_list="dist/${partyness}party-${trackerness}.txt"
  37. file_host="dist/${partyness}party-${trackerness}-hosts.txt"
  38. log "Generating lists for variant ${partyness}-party ${trackerness}"
  39. # Real export heeere
  40. ./export.py $partyness_flags $trackerness_flags > $file_list
  41. # Sometimes a bit heavy to have the DB open and sort the output
  42. # so this is done in two steps
  43. sort -u $file_list -o $file_list
  44. rules_input=$(./export.py --count --base-rules $partyness_flags)
  45. rules_found=$(./export.py --count --rules $partyness_flags)
  46. rules_output=$(./export.py --count $partyness_flags $trackerness_flags)
  47. function link() { # link partyness, link trackerness
  48. url="https://hostfiles.frogeye.fr/${partyness}party-${trackerness}-hosts.txt"
  49. if [ "$1" = "$partyness" ] && [ "$2" = "$trackerness" ]
  50. then
  51. url="$url (this one)"
  52. fi
  53. echo $url
  54. }
  55. (
  56. echo "# First-party trackers host list"
  57. echo "# Variant: ${partyness}-party ${trackerness}"
  58. echo "#"
  59. echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
  60. echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
  61. echo "#"
  62. echo "# In case of false positives/negatives, or any other question,"
  63. echo "# contact me the way you like: https://geoffrey.frogeye.fr"
  64. echo "#"
  65. echo "# Latest versions:"
  66. echo "# - First-party trackers : $(link first trackers)"
  67. echo "# - … excluding redirected: $(link first only-trackers)"
  68. echo "# - First and third party : $(link multi trackers)"
  69. echo "# - … excluding redirected: $(link multi only-trackers)"
  70. echo '# (you can remove `-hosts` to get the raw list)'
  71. echo "#"
  72. echo "# Generation date: $gen_date"
  73. echo "# Generation software: eulaurarien $gen_software"
  74. echo "# Number of source websites: $number_websites"
  75. echo "# Number of source subdomains: $number_subdomains"
  76. echo "# Number of source DNS records: ~2E9 + $number_dns"
  77. echo "#"
  78. echo "# Input rules: $rules_input"
  79. echo "# Subsequent rules: $rules_found"
  80. echo "# Output rules: $rules_output"
  81. echo "#"
  82. echo
  83. sed 's|^|0.0.0.0 |' "$file_list"
  84. ) > "$file_host"
  85. done
  86. done