Browse Source
Can now use AdBlock lists for tracking matching
Can now use AdBlock lists for tracking matching
It's not very performant by itself, especially since pyre2 is no longer maintained, nor really compilable/installable anymore. The performance seems to have decreased from 200 req/s to 0.2 req/s when using 512 threads, and to 80 req/s when using 64 threads. This might or might not be related, as the CPU doesn't seem to be the bottleneck. I will probably add support for host-based rules, matching the subdomains of such hosts (as for now there doesn't seem to be any other pattern for first-party trackers than subdomains, and this would be a very broad performance / compatibility-with-existing-lists improvement), and convert the AdBlock lists to this format, only keeping domains-only rules.
newworkflow_parseropti
8 changed files with 108 additions and 49 deletions
-
1README.md
-
3fetch_resources.sh
-
59filter_out_explicit.py
-
33filter_subdomains.py
-
57filter_subdomains.sh
-
2rules/.gitignore
-
1rules/first-party.txt
-
1temp/.gitignore
@ -1,30 +1,43 @@ |
|||
#!/usr/bin/env bash |
|||
|
|||
# Filter out the subdomains not pointing to a first-party tracker |
|||
|
|||
cat subdomains/*.list | sort -u > temp/all_subdomains.list |
|||
./filter_subdomains.py --input temp/all_subdomains.list --output temp/all_toblock.list |
|||
cat rules/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules.txt |
|||
./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_subdomains.list --output temp/all_toblock.list |
|||
sort -u temp/all_toblock.list > dist/firstparty-trackers.txt |
|||
./filter_out_explicit.py --rules temp/all_rules.txt --input dist/firstparty-trackers.txt --output dist/firstparty-only-trackers.txt |
|||
|
|||
# Format the blocklist so it can be used as a hostlist |
|||
|
|||
( |
|||
echo "# First-party trackers host list" |
|||
echo "#" |
|||
echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker" |
|||
echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien" |
|||
echo "# Latest version of this list: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt" |
|||
echo "#" |
|||
echo "# Generation date: $(date -Isec)" |
|||
echo "# Generation version: eulaurarien $(git describe --tags)" |
|||
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" |
|||
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" |
|||
echo "# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')" |
|||
echo "# Number of blocked subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" |
|||
echo "# Number of first-party subdomains: $(./filter_out_explicit.py dist/firstparty-trackers.txt | wc -l)" |
|||
echo |
|||
cat dist/firstparty-trackers.txt | while read host; |
|||
do |
|||
echo "0.0.0.0 $host" |
|||
done |
|||
) > dist/firstparty-trackers-hosts.txt |
|||
# Print the number of newline-terminated lines in file $1 as a bare integer.
# Prints nothing (after the shell's own error message on stderr) when the
# file cannot be read, so a missing statistic leaves its header field empty.
_generate_hosts_count_lines() {
  local count
  # Reading through a redirect keeps the file name out of wc's output, so no
  # field splitting is needed to isolate the number.
  count=$(wc -l <"$1") || return 0
  # Arithmetic expansion strips the left padding some wc implementations add.
  printf '%s' "$((count))"
}

# Build a hosts-format blocklist from a plain list of hostnames.
#
# Reads dist/<name>.txt (one hostname per line) and writes
# dist/<name>-hosts.txt: a commented header followed by one
# "0.0.0.0 <hostname>" entry per non-empty input line.
#
# Arguments:
#   $1 - base name of the list inside dist/, without the ".txt" extension
#   $2 - one-line description inserted as the second header line
# Reads:
#   temp/all_websites.list, temp/all_subdomains.list, temp/all_rules.txt,
#   dist/firstparty-trackers.txt, dist/firstparty-only-trackers.txt
#   (only their line counts, for the header statistics)
# Outputs:
#   dist/<name>-hosts.txt (overwritten)
# Returns:
#   non-zero when dist/<name>.txt cannot be opened; the header is still written
generate_hosts() {
  # Locals keep these out of the caller's scope; the previous global named
  # "basename" also shadowed the coreutils command of the same name.
  local list_name="$1"
  local list_description="$2"
  local host

  {
    printf '%s\n' "# First-party trackers host list"
    printf '%s\n' "# ${list_description}"
    printf '%s\n' "#"
    printf '%s\n' "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
    printf '%s\n' "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
    printf '%s\n' "#"
    printf '%s\n' "# Latest version:"
    printf '%s\n' "# - With third-party trackers: https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
    printf '%s\n' "# - First-party trackers only: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
    printf '%s\n' "#"
    printf '%s\n' "# Generation date: $(date -Isec)"
    printf '%s\n' "# Generation version: eulaurarien $(git describe --tags)"
    printf '%s\n' "# Number of source websites: $(_generate_hosts_count_lines temp/all_websites.list)"
    printf '%s\n' "# Number of source subdomains: $(_generate_hosts_count_lines temp/all_subdomains.list)"
    printf '%s\n' "# Number of trackers identification rules : $(_generate_hosts_count_lines temp/all_rules.txt)"
    printf '%s\n' "# Number of tracker subdomains: $(_generate_hosts_count_lines dist/firstparty-trackers.txt)"
    printf '%s\n' "# Number of first-party subdomains: $(_generate_hosts_count_lines dist/firstparty-only-trackers.txt)"
    printf '\n'

    # -r stops read from interpreting backslashes. IFS is deliberately left at
    # its default so surrounding whitespace is trimmed from each hostname.
    # The "|| [[ -n ... ]]" clause keeps a final line that lacks a newline.
    while read -r host || [[ -n "$host" ]]; do
      # A blank line would otherwise yield a malformed "0.0.0.0 " entry.
      [[ -n "$host" ]] || continue
      printf '0.0.0.0 %s\n' "$host"
    done <"dist/${list_name}.txt"
  } >"dist/${list_name}-hosts.txt"
}
|||
|
|||
# Publish both variants of the list: the full one, and the one restricted to
# trackers that are only seen as first-party.
generate_hosts "firstparty-trackers" "Also contains trackers used in third-party"
generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in conjunction with EasyPrivacy."
@ -0,0 +1,2 @@ |
|||
*.custom.txt |
|||
*.cache.txt |
@ -0,0 +1 @@ |
|||
||at-o.net^ |
@ -1 +1,2 @@ |
|||
*.list |
|||
*.txt |
Write
Preview
Loading…
Cancel
Save
Reference in new issue