diff --git a/filter_subdomains.sh b/filter_subdomains.sh
index 6c00cbf..05bb125 100755
--- a/filter_subdomains.sh
+++ b/filter_subdomains.sh
@@ -11,25 +11,38 @@ echo "Compiling rules..." > /dev/stderr
 cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt
 ./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list
 cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
-cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules.list
+cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_multi.list
+cat rules/first-party.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_first.list
 
-# Filter out the subdomains not pointing to a first-party tracker
-echo "Filtering tracking domains..." > /dev/stderr
-./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
+echo "Filtering first-party tracking domains..." > /dev/stderr
+./filter_subdomains.py --rules temp/all_rules_first.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
 sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
 
-echo "Filtering first-party only tracking domains..." > /dev/stderr
-./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
+echo "Filtering first-party curated tracking domains..." > /dev/stderr
+./filter_subdomains.py --rules temp/all_rules_first.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
 sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt
 
+echo "Filtering multi-party tracking domains..." > /dev/stderr
+./filter_subdomains.py --rules temp/all_rules_multi.list --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list
+sort -u temp/multiparty-trackers.list > dist/multiparty-trackers.txt
+
+echo "Filtering multi-party curated tracking domains..." > /dev/stderr
+./filter_subdomains.py --rules temp/all_rules_multi.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list
+sort -u temp/multiparty-only-trackers.list > dist/multiparty-only-trackers.txt
+
 # Format the blocklist so it can be used as a hostlist
+# Arguments: $1 - list basename (reads dist/$1.txt, writes dist/$1-hosts.txt)
+#            $2 - first description line printed in the generated header
+#            $3 - second description line printed in the generated header
 function generate_hosts {
     basename="$1"
     description="$2"
+    description2="$3"
 
     (
         echo "# First-party trackers host list"
         echo "# $description"
+        echo "# $description2"
         echo "#"
         echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
         echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
@@ -38,16 +51,21 @@ function generate_hosts {
         echo "# contact me the way you like: https://geoffrey.frogeye.fr"
         echo "#"
         echo "# Latest version:"
-        echo "# - With third-party trackers: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
-        echo "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
+        echo "# - First-party trackers : https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
+        echo "# - … excluding redirected: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
+        echo "# - First and third party : https://hostfiles.frogeye.fr/multiparty-trackers-hosts.txt"
+        echo "# - … excluding redirected: https://hostfiles.frogeye.fr/multiparty-only-trackers-hosts.txt"
         echo "#"
         echo "# Generation date: $(date -Isec)"
         echo "# Generation software: eulaurarien $(git describe --tags)"
         echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
         echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
-        echo "# Number of trackers identification rules : $(wc -l temp/all_rules.list | cut -d' ' -f1)"
-        echo "# Number of tracker subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
-        echo "# Number of first-party subdomains: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
+        echo "# Number of known first-party trackers: $(wc -l temp/all_rules_first.list | cut -d' ' -f1)"
+        echo "# Number of known multi-party trackers: $(wc -l temp/all_rules_multi.list | cut -d' ' -f1)"
+        echo "# Number of first-party subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
+        echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
+        echo "# Number of multi-party subdomains: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
+        echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
         echo
         cat "dist/$basename.txt" | while read host;
         do
@@ -56,5 +74,7 @@ function generate_hosts {
     ) > "dist/$basename-hosts.txt"
 }
 
-generate_hosts "firstparty-trackers" "Also contains trackers used as third-party."
-generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in combination with third-party lists."
+generate_hosts "firstparty-trackers" "Generated from a curated list of first-party trackers" ""
+generate_hosts "firstparty-only-trackers" "Generated from a curated list of first-party trackers" "Only contain the first chain of redirection."
+generate_hosts "multiparty-trackers" "Generated from known third-party trackers." "Also contains trackers used as third-party."
+generate_hosts "multiparty-only-trackers" "Generated from known third-party trackers." "Do not contain trackers used in third-party. Use in combination with third-party lists."
diff --git a/rules/first-party.list b/rules/first-party.list
index 2159717..423d132 100644
--- a/rules/first-party.list
+++ b/rules/first-party.list
@@ -1,8 +1,15 @@
+# Eulerian
+eulerian.net
 # Xiti (AT Internet)
 ati-host.net
 at-o.net
 # NP6
 bp01.net
 # Criteo
+criteo.com
 dnsdelegation.io
 storetail.io
+# Keyade
+keyade.com
+# Adobe Experience Cloud
+omtrdc.net