Statistics about explicit first-parties
This commit is contained in:
parent
32377229db
commit
ae93593930
45
filter_out_explicit.py
Executable file
45
filter_out_explicit.py
Executable file
|
@ -0,0 +1,45 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# pylint: disable=C0103
|
||||||
|
|
||||||
|
"""
|
||||||
|
From a list of subdomains to block,
|
||||||
|
filter out the ones explicitly matching a regex.
|
||||||
|
They should already be handled by the ad blocker.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import multiprocessing
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import typing
|
||||||
|
|
||||||
|
import regexes
|
||||||
|
|
||||||
|
|
||||||
|
def explicitely_match(subdomain: str) -> bool:
    """
    Tell whether the subdomain matches any pattern of regexes.REGEXES.

    A trailing dot is appended to the subdomain before it is tested, and
    each pattern is applied with re.match, i.e. anchored at the start of
    the string. Returns True on the first pattern that matches, False if
    none does.
    """
    return any(
        re.match(pattern, subdomain + '.')
        for pattern in regexes.REGEXES
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    # Parsing arguments
    # Report a usage error explicitly: an `assert` would be stripped when
    # Python runs with -O, silently accepting extra arguments.
    if len(sys.argv) > 2:
        sys.exit('usage: {} [FILE|-]'.format(sys.argv[0]))

    # Read from the given file, or from stdin when no argument (or '-')
    # is passed.
    if len(sys.argv) == 2 and sys.argv[1] != '-':
        textio = open(sys.argv[1])
    else:
        textio = sys.stdin

    try:
        # Cleaning input: strip surrounding whitespace, drop empty lines
        iterator = iter(textio)
        iterator = map(str.strip, iterator)
        iterator = filter(None, iterator)

        # Only print the subdomains that no regex explicitly matches
        for subdomain in iterator:
            if not explicitely_match(subdomain):
                print(subdomain)
    finally:
        # Close the file we opened ourselves; leave stdin alone.
        if textio is not sys.stdin:
            textio.close()
|
|
@ -21,6 +21,7 @@ sort -u temp/all_toblock.list > dist/firstparty-trackers.txt
|
||||||
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
|
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
|
||||||
echo "# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')"
|
echo "# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')"
|
||||||
echo "# Number of blocked subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
|
echo "# Number of blocked subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
|
||||||
|
# Count lines only: a bare `wc` would also print word and byte counts.
echo "# Number of first-party subdomains: $(./filter_out_explicit.py dist/firstparty-trackers.txt | wc -l)"
|
||||||
echo
|
echo
|
||||||
cat dist/firstparty-trackers.txt | while read host;
|
cat dist/firstparty-trackers.txt | while read host;
|
||||||
do
|
do
|
||||||
|
|
Loading…
Reference in a new issue