Better list output

This commit is contained in:
Geoffrey Frogeye 2019-12-27 21:46:57 +01:00
parent fd8bfee088
commit 2b97ee4cb9
Signed by: geoffrey
GPG key ID: D8A7ECA00A8CD3DD
3 changed files with 18 additions and 30 deletions

2
dist/README.md vendored
View file

@ -70,7 +70,7 @@ In the other hand, they might protect against first-party tracker that we're not
This is the same list as above, albeit not containing the hostnames under the tracking company domains (e.g. `website1.trackercompany.com`).
While those are technically third-party trackers, they cannot be blocked at once by some ad blockers (e.g. Pi-hole).
Use only with ad blocker able to import regular expressions and in conjuction with other block lists.
Use only with ad blocker able to import regular expressions and in conjuction with other block lists, especially the ones in the previous section.
## Meta

View file

@ -5,11 +5,13 @@ function log() {
}
log "Calculating statistics…"
oldest="$(cat last_updates/*.txt | sort -n | head -1)"
oldest_date=$(date -Isec -d @$oldest)
gen_date=$(date -Isec)
gen_software=$(git describe --tags)
number_websites=$(wc -l < temp/all_websites.list)
number_subdomains=$(wc -l < temp/all_subdomains.list)
number_dns=$(grep '^$' temp/all_resolved.txt | wc -l)
number_dns=$(grep 'NOERROR' temp/all_resolved.txt | wc -l)
for partyness in {first,multi}
do
@ -20,15 +22,19 @@ do
partyness_flags=""
fi
rules_input=$(./export.py --count --base-rules $partyness_flags)
rules_found=$(./export.py --count --rules $partyness_flags)
rules_found_nd=$(./export.py --count --rules --no-dupplicates $partyness_flags)
echo
echo "Statistics for ${partyness}-party trackers"
echo "Input rules: $(./export.py --count --base-rules $partyness_flags)"
echo "Subsequent rules: $(./export.py --count --rules $partyness_flags)"
echo "Subsequent rules (no dupplicate): $(./export.py --count --rules --no-dupplicates $partyness_flags)"
echo "Input rules: $rules_input"
echo "Subsequent rules: $rules_found"
echo "Subsequent rules (no dupplicate): $rules_found_nd"
echo "Output hostnames: $(./export.py --count $partyness_flags)"
echo "Output hostnames (no dupplicate): $(./export.py --count --no-dupplicates $partyness_flags)"
echo "Output hostnames (end-chain only): $(./export.py --count --end-chain $partyness_flags)"
echo "Output hostnames (no dupplicate, end-chain only): $(./export.py --count --no-dupplicates --end-chain $partyness_flags)"
echo
for trackerness in {trackers,only-trackers}
do
@ -49,50 +55,32 @@ do
# so this is done in two steps
sort -u $file_list -o $file_list
rules_input=$(./export.py --count --base-rules $partyness_flags)
rules_found=$(./export.py --count --rules $partyness_flags)
rules_output=$(./export.py --count $partyness_flags $trackerness_flags)
function link() { # link partyness, link trackerness
url="https://hostfiles.frogeye.fr/${1}party-${2}-hosts.txt"
if [ "$1" = "$partyness" ] && [ "$2" = "$trackerness" ]
then
url="$url (this one)"
fi
echo $url
}
(
echo "# First-party trackers host list"
echo "# Variant: ${partyness}-party ${trackerness}"
echo "#"
echo "# About first-party trackers: "
echo "# https://hostfiles.frogeye.fr/#whats-a-first-party-tracker"
echo "# About first-party trackers: https://hostfiles.frogeye.fr/#whats-a-first-party-tracker"
echo "#"
echo "# In case of false positives/negatives, or any other question,"
echo "# contact me the way you like: https://geoffrey.frogeye.fr"
echo "#"
echo "# Latest versions and variants: https://hostfiles.frogeye.fr/#list-variants"
echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
echo "# License: https://git.frogeye.fr/geoffrey/eulaurarien/src/branch/master/LICENSE"
echo "# Acknowledgements: https://hostfiles.frogeye.fr/#acknowledgements"
echo "#"
echo "# Latest versions and variants:"
echo "# - First-party trackers : $(link first trackers)"
echo "# - … excluding redirected: $(link first only-trackers)"
echo "# - First and third party : $(link multi trackers)"
echo "# - … excluding redirected: $(link multi only-trackers)"
echo '# (you can remove `-hosts` to get the raw list)'
echo '# Information about the variants:'
echo '# https://hostfiles.frogeye.fr/#list-variants'
echo "#"
echo "# Generation date: $gen_date"
echo "# Generation software: eulaurarien $gen_software"
echo "# List generation date: $gen_date"
echo "# Oldest record: $oldest_date"
echo "# Number of source websites: $number_websites"
echo "# Number of source subdomains: $number_subdomains"
echo "# Number of source DNS records: ~2E9 + $number_dns"
echo "#"
echo "# Input rules: $rules_input"
echo "# Subsequent rules: $rules_found"
echo "# … no dupplicates: $rules_found_nd"
echo "# Output rules: $rules_output"
echo "#"
echo

View file

@ -17,7 +17,7 @@ pv -f subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > te
log "Resolving subdomain…"
date +%s > "last_updates/massdns.txt"
"$MASSDNS_BINARY" --output Snrql --retry REFUSED,SERVFAIL --hashmap-size "$MASSDNS_HASHMAP_SIZE" --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list
"$MASSDNS_BINARY" --output Snrql --hashmap-size "$MASSDNS_HASHMAP_SIZE" --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list
log "Importing into database…"
[ $SINGLE_PROCESS -eq 1 ] && EXTRA_ARGS="--single-process"