Generates a host list of first-party trackers for ad-blocking.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

79 lines
4.5 KiB

3 years ago
3 years ago
  1. #!/usr/bin/env bash
  # Refuse to run without the resolver output the filtering steps depend on.
  # NOTE(review): this guard tests temp/all_resolved.csv while every filter
  # below reads temp/all_resolved_sorted.csv -- presumably both are produced
  # by resolve_subdomains.sh; confirm.
  if [[ ! -f temp/all_resolved.csv ]]; then
    # Diagnostics go to stderr so they never mix with captured stdout.
    echo "Run ./resolve_subdomains.sh first!" >&2
    exit 1
  fi

  # Gather all the rules for filtering
  # Progress messages use '>&2', which duplicates the already-open stderr
  # descriptor. '> /dev/stderr' re-opens the path instead and can truncate a
  # log file that stderr has been redirected to.
  echo "Compiling rules..." >&2

  # Adblock sources: drop '!' comment lines and the '[Adblock ...' header line.
  cat rules_adblock/*.txt \
    | grep -v -e '^!' -e '^\[Adblock' \
    | sort -u > temp/all_rules_adblock.txt
  ./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list

  # Hosts-format sources: drop '#' comments and blank lines, then keep the
  # second space-separated field of each remaining line.
  cat rules_hosts/*.txt \
    | grep -v -e '^#' -e '^$' \
    | cut -d ' ' -f2 > rules/from_hosts.cache.list

  # Multi-party rules: every rules/*.list file, which includes the two cache
  # lists written just above.
  cat rules/*.list \
    | grep -v -e '^#' -e '^$' \
    | sort -u > temp/all_rules_multi.list

  # First-party rules: the single rules/first-party.list file.
  grep -v -e '^#' -e '^$' rules/first-party.list \
    | sort -u > temp/all_rules_first.list

  # Each filter pass writes an intermediate list under temp/, which is then
  # de-duplicated into the published dist/ file.
  echo "Filtering first-party tracking domains..." >&2
  ./filter_subdomains.py --rules temp/all_rules_first.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
  sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt

  echo "Filtering first-party curated tracking domains..." >&2
  ./filter_subdomains.py --rules temp/all_rules_first.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
  sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt

  echo "Filtering multi-party tracking domains..." >&2
  ./filter_subdomains.py --rules temp/all_rules_multi.list --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list
  sort -u temp/multiparty-trackers.list > dist/multiparty-trackers.txt

  echo "Filtering multi-party curated tracking domains..." >&2
  ./filter_subdomains.py --rules temp/all_rules_multi.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list
  sort -u temp/multiparty-only-trackers.list > dist/multiparty-only-trackers.txt
  #######################################
  # Print the number of lines in a file, digits only.
  # Arguments: $1 - path of the file to count
  # Outputs:   the count on stdout; an empty line when the file is unreadable
  #######################################
  _count_lines() {
    local count
    # Reading from stdin keeps the file name out of wc's output, so no
    # field splitting is needed afterwards.
    count=$(wc -l < "$1")
    # Some wc implementations left-pad the number with blanks; strip them.
    printf '%s\n' "${count//[[:space:]]/}"
  }

  # Format the blocklist so it can be used as a hostlist
  #######################################
  # Write dist/<name>-hosts.txt: a commented header with statistics, a blank
  # line, then one "0.0.0.0 <host>" entry per line of dist/<name>.txt.
  # Arguments:
  #   $1 - list name without extension (reads dist/$1.txt,
  #        writes dist/$1-hosts.txt)
  #   $2 - first description line of the header
  #   $3 - second description line of the header (may be empty)
  # Outputs: nothing on stdout; the target host file is overwritten.
  #######################################
  generate_hosts() {
    # Locals keep these names out of the caller's global scope.
    local name="$1"
    local summary="$2"
    local details="$3"

    {
      printf '%s\n' \
        "# First-party trackers host list" \
        "# $summary" \
        "# $details" \
        "#" \
        "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker" \
        "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien" \
        "#" \
        "# In case of false positives/negatives, or any other question," \
        "# contact me the way you like: https://geoffrey.frogeye.fr" \
        "#" \
        "# Latest version:" \
        "# - First-party trackers : https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt" \
        "# - … excluding redirected: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt" \
        "# - First and third party : https://hostfiles.frogeye.fr/multiparty-trackers-hosts.txt" \
        "# - … excluding redirected: https://hostfiles.frogeye.fr/multiparty-only-trackers-hosts.txt" \
        "#"
      echo "# Generation date: $(date -Isec)"
      echo "# Generation software: eulaurarien $(git describe --tags)"
      echo "# Number of source websites: $(_count_lines temp/all_websites.list)"
      echo "# Number of source subdomains: $(_count_lines temp/all_subdomains.list)"
      echo "#"
      echo "# Number of known first-party trackers: $(_count_lines temp/all_rules_first.list)"
      echo "# Number of first-party subdomains: $(_count_lines dist/firstparty-trackers.txt)"
      echo "# … excluding redirected: $(_count_lines dist/firstparty-only-trackers.txt)"
      echo "#"
      echo "# Number of known multi-party trackers: $(_count_lines temp/all_rules_multi.list)"
      echo "# Number of multi-party subdomains: $(_count_lines dist/multiparty-trackers.txt)"
      echo "# … excluding redirected: $(_count_lines dist/multiparty-only-trackers.txt)"
      echo
      # One sed pass prefixes every line verbatim; a shell 'read' loop would be
      # slower and, without -r, would alter backslashes and trim whitespace.
      sed 's/^/0.0.0.0 /' "dist/${name}.txt"
    } > "dist/${name}-hosts.txt"
  }
  # Build the four published host files. Each call passes the list name
  # followed by the two description lines placed in that file's header.

  # First-party rules, complete list (no second description line).
  generate_hosts "firstparty-trackers" \
    "Generated from a curated list of first-party trackers" \
    ""

  # First-party rules, "only" variant.
  generate_hosts "firstparty-only-trackers" \
    "Generated from a curated list of first-party trackers" \
    "Only contain the first chain of redirection."

  # Multi-party rules, complete list.
  generate_hosts "multiparty-trackers" \
    "Generated from known third-party trackers." \
    "Also contains trackers used as third-party."

  # Multi-party rules, "only" variant.
  generate_hosts "multiparty-only-trackers" \
    "Generated from known third-party trackers." \
    "Do not contain trackers used in third-party. Use in combination with third-party lists."