diff --git a/filter_out_explicit.py b/filter_out_explicit.py
new file mode 100755
index 0000000..32277eb
--- /dev/null
+++ b/filter_out_explicit.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+# pylint: disable=C0103
+
+"""
+From a list of subdomains to block,
+filter out the ones explicitly matching a regex.
+They should already be handled by the ad blocker.
+"""
+
+import logging
+import multiprocessing
+import re
+import sys
+import typing
+
+import regexes
+
+
+def explicitely_match(subdomain: str) -> bool:
+    """
+    Tell whether a subdomain is matched by one of the known regexes.
+
+    A trailing dot is appended to the subdomain before it is tested
+    against each pattern of regexes.REGEXES with re.match, which
+    anchors the match at the start of the string.
+    """
+    return any(re.match(regex, subdomain + '.')
+               for regex in regexes.REGEXES)
+
+
+if __name__ == '__main__':
+
+    # Parsing arguments
+    # Explicit check rather than an assert, which is stripped under -O
+    if len(sys.argv) > 2:
+        sys.exit('usage: {} [FILE|-]'.format(sys.argv[0]))
+
+    if len(sys.argv) == 2 and sys.argv[1] != '-':
+        filename = sys.argv[1]
+        textio = open(filename)
+    else:
+        filename = None
+        textio = sys.stdin
+
+    try:
+        # Cleaning input: strip whitespace and drop empty lines
+        iterator = iter(textio)
+        iterator = map(str.strip, iterator)
+        iterator = filter(None, iterator)
+
+        for subdomain in iterator:
+            if not explicitely_match(subdomain):
+                print(subdomain)
+    finally:
+        # Only close what was opened here, never stdin
+        if filename is not None:
+            textio.close()
diff --git a/filter_subdomains.sh b/filter_subdomains.sh
index d576b8c..036c467 100755
--- a/filter_subdomains.sh
+++ b/filter_subdomains.sh
@@ -21,6 +21,7 @@ sort -u temp/all_toblock.list > dist/firstparty-trackers.txt
 echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
 echo "# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')"
 echo "# Number of blocked subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
+echo "# Number of first-party subdomains: $(./filter_out_explicit.py dist/firstparty-trackers.txt | wc -l)"
 echo
 cat dist/firstparty-trackers.txt | while read host;
 do