Statistics about explicit first-parties
This commit is contained in:
parent
32377229db
commit
ae93593930
45
filter_out_explicit.py
Executable file
45
filter_out_explicit.py
Executable file
|
@ -0,0 +1,45 @@
|
|||
#!/usr/bin/env python3
|
||||
# pylint: disable=C0103
|
||||
|
||||
"""
|
||||
From a list of subdomains to block,
|
||||
filter out the ones explicitly matching a regex.
|
||||
Those should already be handled by the ad blocker.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import multiprocessing
|
||||
import re
|
||||
import sys
|
||||
import typing
|
||||
|
||||
import regexes
|
||||
|
||||
|
||||
def explicitely_match(subdomain: str) -> bool:
    """
    Tell whether a subdomain is matched by one of the known regexes.

    A trailing dot is appended to the subdomain before it is tested
    against every pattern of regexes.REGEXES. Patterns are applied
    with re.match, i.e. anchored at the beginning of the string.
    """
    candidate = subdomain + '.'
    return any(re.match(pattern, candidate) for pattern in regexes.REGEXES)
|
||||
|
||||
|
||||
if __name__ == '__main__':

    # Parsing arguments: at most one optional argument, the input file.
    # No argument (or '-') means the list is read from standard input.
    # An explicit check is used instead of `assert` so the validation
    # is not stripped when Python runs with -O.
    if len(sys.argv) > 2:
        sys.exit('usage: {} [FILE|-]'.format(sys.argv[0]))

    if len(sys.argv) == 2 and sys.argv[1] != '-':
        textio = open(sys.argv[1])
    else:
        textio = sys.stdin

    try:
        # Cleaning input: strip surrounding whitespace, drop empty lines
        iterator = iter(textio)
        iterator = map(str.strip, iterator)
        iterator = filter(None, iterator)

        # Only print the subdomains that no known regex matches
        for subdomain in iterator:
            if not explicitely_match(subdomain):
                print(subdomain)
    finally:
        # Close the input file if we opened one; leave stdin alone
        if textio is not sys.stdin:
            textio.close()
|
|
@ -21,6 +21,7 @@ sort -u temp/all_toblock.list > dist/firstparty-trackers.txt
|
|||
# Statistics header printed before the per-host output
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')"
echo "# Number of blocked subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
# Count lines only: a bare `wc` would print line, word and byte counts
echo "# Number of first-party subdomains: $(./filter_out_explicit.py dist/firstparty-trackers.txt | wc -l)"
echo
|
||||
cat dist/firstparty-trackers.txt | while read host;
|
||||
do
|
||||
|
|
Loading…
Reference in a new issue