Statistics about explicit first-parties

newworkflow_parseropti
Geoffrey Frogeye 2019-11-14 13:31:39 +01:00
parent 32377229db
commit ae93593930
2 changed files with 46 additions and 0 deletions

45
filter_out_explicit.py Executable file
View File

@ -0,0 +1,45 @@
#!/usr/bin/env python3
# pylint: disable=C0103
"""
From a list of subdomains to block,
filter out the ones explicitly matching a regex.
They should already be handled by the ad blocker.
"""
import logging
import multiprocessing
import re
import sys
import typing
import regexes
def explicitely_match(subdomain: str) -> bool:
    """
    Tell whether one of the known regexes matches the subdomain.

    The subdomain is tested with a trailing dot appended,
    against each pattern of regexes.REGEXES in order,
    stopping at the first one that matches.
    """
    return any(
        re.match(pattern, subdomain + '.') is not None
        for pattern in regexes.REGEXES
    )
if __name__ == '__main__':
    # Parsing arguments: at most one, either a file name or '-' for stdin.
    # Explicit check rather than an assert, which is stripped under -O.
    if len(sys.argv) > 2:
        sys.exit('usage: {} [FILE|-]'.format(sys.argv[0]))

    if len(sys.argv) == 2 and sys.argv[1] != '-':
        textio = open(sys.argv[1])
    else:
        textio = sys.stdin

    try:
        # Cleaning input: strip surrounding whitespace, skip empty lines
        iterator = iter(textio)
        iterator = map(str.strip, iterator)
        iterator = filter(None, iterator)

        # Only print the subdomains not matched by any regex
        for subdomain in iterator:
            if not explicitely_match(subdomain):
                print(subdomain)
    finally:
        # Close the file we opened ourselves, but leave stdin alone
        if textio is not sys.stdin:
            textio.close()

View File

@ -21,6 +21,7 @@ sort -u temp/all_toblock.list > dist/firstparty-trackers.txt
# Summary statistics, one count per line
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')"
echo "# Number of blocked subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
# wc -l so only the line count is printed (bare wc also prints words and bytes)
echo "# Number of first-party subdomains: $(./filter_out_explicit.py dist/firstparty-trackers.txt | wc -l)"
echo
cat dist/firstparty-trackers.txt | while read host;
do