# Patch summary (commentary placed before the first "diff --git" header).
#
# NOTE(review): in this copy the original newlines of the patch have been
# collapsed, so the hunks below run together on a few very long lines. The
# line structure (and the indentation of context lines) must be restored
# from the original commit before this can be applied as a patch.
#
# dist/README.md
#   - Extends the "Use only with ad blocker able to import regular
#     expressions..." sentence with ", especially the ones in the previous
#     section."
#   - NOTE(review): "conjuction" is carried over unchanged on the added line;
#     presumably "conjunction" is meant.
#
# export_lists.sh
#   - Adds `oldest`: the numerically smallest line found across
#     last_updates/*.txt (cat | sort -n | head -1), and `oldest_date`: that
#     value passed to `date -Isec -d @...`. It is emitted in the generated
#     list header as "Oldest record".
#     NOTE(review): `@$oldest` is unquoted, and nothing visible here guards
#     against last_updates/*.txt matching no file or yielding an empty value.
#   - `number_dns` now counts lines of temp/all_resolved.txt matching
#     'NOERROR' instead of lines matching '^$' (empty lines).
#     NOTE(review): `grep ... | wc -l` could be `grep -c`.
#   - `rules_input` and `rules_found` are now assigned right after the
#     partyness flags are chosen, together with a new `rules_found_nd`
#     (--rules --no-dupplicates), and the statistics echoes reuse these
#     variables instead of re-running ./export.py. The former assignments
#     further down (next to `rules_output`) are removed.
#   - The blank `echo` moves from after the statistics to before them.
#   - The `link()` helper and the per-variant URL listing are removed from
#     the generated header; a single "Latest versions and variants" URL line
#     is emitted instead.
#   - "Generation date" becomes "List generation date" and is printed after
#     "Generation software"; a "… no dupplicates" line is added under
#     "Subsequent rules".
#
# resolve_subdomains.sh
#   - Drops `--retry REFUSED,SERVFAIL` from the "$MASSDNS_BINARY" invocation;
#     all other arguments are unchanged.
#
diff --git a/dist/README.md b/dist/README.md index 68fe932..4c4f7dc 100644 --- a/dist/README.md +++ b/dist/README.md @@ -70,7 +70,7 @@ In the other hand, they might protect against first-party tracker that we're not This is the same list as above, albeit not containing the hostnames under the tracking company domains (e.g. `website1.trackercompany.com`). While those are technically third-party trackers, they cannot be blocked at once by some ad blockers (e.g. Pi-hole). -Use only with ad blocker able to import regular expressions and in conjuction with other block lists. +Use only with ad blocker able to import regular expressions and in conjuction with other block lists, especially the ones in the previous section. ## Meta diff --git a/export_lists.sh b/export_lists.sh index 0c4eb98..a274056 100755 --- a/export_lists.sh +++ b/export_lists.sh @@ -5,11 +5,13 @@ function log() { } log "Calculating statistics…" +oldest="$(cat last_updates/*.txt | sort -n | head -1)" +oldest_date=$(date -Isec -d @$oldest) gen_date=$(date -Isec) gen_software=$(git describe --tags) number_websites=$(wc -l < temp/all_websites.list) number_subdomains=$(wc -l < temp/all_subdomains.list) -number_dns=$(grep '^$' temp/all_resolved.txt | wc -l) +number_dns=$(grep 'NOERROR' temp/all_resolved.txt | wc -l) for partyness in {first,multi} do @@ -20,15 +22,19 @@ do partyness_flags="" fi + rules_input=$(./export.py --count --base-rules $partyness_flags) + rules_found=$(./export.py --count --rules $partyness_flags) + rules_found_nd=$(./export.py --count --rules --no-dupplicates $partyness_flags) + + echo echo "Statistics for ${partyness}-party trackers" - echo "Input rules: $(./export.py --count --base-rules $partyness_flags)" - echo "Subsequent rules: $(./export.py --count --rules $partyness_flags)" - echo "Subsequent rules (no dupplicate): $(./export.py --count --rules --no-dupplicates $partyness_flags)" + echo "Input rules: $rules_input" + echo "Subsequent rules: $rules_found" + echo "Subsequent 
rules (no dupplicate): $rules_found_nd" echo "Output hostnames: $(./export.py --count $partyness_flags)" echo "Output hostnames (no dupplicate): $(./export.py --count --no-dupplicates $partyness_flags)" echo "Output hostnames (end-chain only): $(./export.py --count --end-chain $partyness_flags)" echo "Output hostnames (no dupplicate, end-chain only): $(./export.py --count --no-dupplicates --end-chain $partyness_flags)" - echo for trackerness in {trackers,only-trackers} do @@ -49,50 +55,32 @@ do # so this is done in two steps sort -u $file_list -o $file_list - rules_input=$(./export.py --count --base-rules $partyness_flags) - rules_found=$(./export.py --count --rules $partyness_flags) rules_output=$(./export.py --count $partyness_flags $trackerness_flags) - function link() { # link partyness, link trackerness - url="https://hostfiles.frogeye.fr/${1}party-${2}-hosts.txt" - if [ "$1" = "$partyness" ] && [ "$2" = "$trackerness" ] - then - url="$url (this one)" - fi - echo $url - } - ( echo "# First-party trackers host list" echo "# Variant: ${partyness}-party ${trackerness}" echo "#" - echo "# About first-party trackers: " - echo "# https://hostfiles.frogeye.fr/#whats-a-first-party-tracker" + echo "# About first-party trackers: https://hostfiles.frogeye.fr/#whats-a-first-party-tracker" echo "#" echo "# In case of false positives/negatives, or any other question," echo "# contact me the way you like: https://geoffrey.frogeye.fr" echo "#" + echo "# Latest versions and variants: https://hostfiles.frogeye.fr/#list-variants" echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien" echo "# License: https://git.frogeye.fr/geoffrey/eulaurarien/src/branch/master/LICENSE" echo "# Acknowledgements: https://hostfiles.frogeye.fr/#acknowledgements" echo "#" - echo "# Latest versions and variants:" - echo "# - First-party trackers : $(link first trackers)" - echo "# - … excluding redirected: $(link first only-trackers)" - echo "# - First and third party : $(link multi 
trackers)" - echo "# - … excluding redirected: $(link multi only-trackers)" - echo '# (you can remove `-hosts` to get the raw list)' - echo '# Information about the variants:' - echo '# https://hostfiles.frogeye.fr/#list-variants' - echo "#" - echo "# Generation date: $gen_date" echo "# Generation software: eulaurarien $gen_software" + echo "# List generation date: $gen_date" + echo "# Oldest record: $oldest_date" echo "# Number of source websites: $number_websites" echo "# Number of source subdomains: $number_subdomains" echo "# Number of source DNS records: ~2E9 + $number_dns" echo "#" echo "# Input rules: $rules_input" echo "# Subsequent rules: $rules_found" + echo "# … no dupplicates: $rules_found_nd" echo "# Output rules: $rules_output" echo "#" echo diff --git a/resolve_subdomains.sh b/resolve_subdomains.sh index d5ddeb8..dba6b51 100755 --- a/resolve_subdomains.sh +++ b/resolve_subdomains.sh @@ -17,7 +17,7 @@ pv -f subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > te log "Resolving subdomain…" date +%s > "last_updates/massdns.txt" -"$MASSDNS_BINARY" --output Snrql --retry REFUSED,SERVFAIL --hashmap-size "$MASSDNS_HASHMAP_SIZE" --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list +"$MASSDNS_BINARY" --output Snrql --hashmap-size "$MASSDNS_HASHMAP_SIZE" --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list log "Importing into database…" [ $SINGLE_PROCESS -eq 1 ] && EXTRA_ARGS="--single-process"