Improved rules handling

Rules can now come in 3 different formats: AdBlock rules, host lists, and domain lists. All are converted into domain lists and aggregated (only AdBlock rules matching a whole domain are kept). A subdomain is now matched if it is a subdomain of any domain in the rules. It is way faster (seconds rather than hours!) but less flexible (although it shouldn't be a problem).
2019-12-03 08:48:12 +01:00 · 2019-12-03 08:48:12 +01:00 · 69b82d29fd
parent c23004fbff
commit 69b82d29fd
11 changed files with 130 additions and 28 deletions
--- a/adblock_to_domain_list.py
+++ b/adblock_to_domain_list.py
@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# pylint: disable=C0103
+
+"""
+Extract the domains to block as a whole
+from a AdBlock rules list.
+"""
+
+import argparse
+import sys
+import typing
+
+import abp.filters
+
+
+def get_domains(rule: abp.filters.parser.Filter) -> typing.Iterable[str]:
+    """
+    Yield the domain blocked as a whole by an AdBlock filter, if any.
+
+    Only blocking filters of the form ``||domain^`` are kept, and only
+    when they carry no option other than ``third-party``. Exception
+    filters (``@@||domain^``) are skipped: they allow a domain, so
+    emitting them would put allow-listed domains on the block list.
+    """
+    # Exception filters share their selector with blocking filters;
+    # only the action tells them apart.
+    if rule.action != 'block':
+        return
+    # Any other option restricts the filter to a context that a plain
+    # domain list cannot express.
+    if any(key != 'third-party' for key, _ in rule.options):
+        return
+    selector = rule.selector
+    if selector['type'] != 'url-pattern':
+        return
+    pattern = selector['value']
+    if not (pattern.startswith('||') and pattern.endswith('^')):
+        return
+    domain = pattern[2:-1]
+    # A path, wildcard or extra anchor means the pattern targets
+    # specific URLs rather than a whole domain.
+    if domain and not any(char in domain for char in '/*^|'):
+        yield domain
+
+
+if __name__ == '__main__':
+
+    # Parsing arguments
+    parser = argparse.ArgumentParser(
+        description="Extract the domains to block as a whole "
+        "from an AdBlock rules list")
+    parser.add_argument(
+        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
+        help="Input file with AdBlock rules")
+    parser.add_argument(
+        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
+        help="Output file with one domain to block per line")
+    args = parser.parse_args()
+
+    # Reading rules
+    rules = abp.filters.parse_filterlist(args.input)
+
+    # Filtering: keep only the lines parsed as filters, then print
+    # every whole domain they block
+    for rule in rules:
+        if not isinstance(rule, abp.filters.parser.Filter):
+            continue
+        for domain in get_domains(rule):
+            print(domain, file=args.output)
--- a/eulaurarien.sh
+++ b/eulaurarien.sh
@ -4,5 +4,6 @@

 ./fetch_resources.sh
 ./collect_subdomains.sh
+./resolve_subdomains.sh
 ./filter_subdomains.sh

--- a/fetch_resources.sh
+++ b/fetch_resources.sh
@ -1,17 +1,27 @@
 #!/usr/bin/env bash

+# Download the URL given as $1 into the file given as $2.
+# Returns curl's exit status so that callers can detect a failed download.
+function dl() {
+    echo "Downloading $1 to $2..."
+    # --fail makes curl exit non-zero on HTTP errors (e.g. 404) instead of
+    # saving the server's error page as if it were the resource
+    curl --silent --fail "$1" > "$2"
+    local status=$?
+    if [ "$status" -ne 0 ]
+    then
+        echo "Failed!" >&2
+    fi
+    return "$status"
+}
+
 # Get rules
-curl https://easylist.to/easylist/easyprivacy.txt > rules/easyprivacy.cache.txt
+dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt
+dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt

 # Get a list of nameservers
-
 rm -f nameservers
 touch nameservers
 [ -f nameservers.head ] && cat nameservers.head >> nameservers
-curl https://public-dns.info/nameservers.txt | sort -R >> nameservers
+dl https://public-dns.info/nameservers.txt nameservers.temp
+sort -R nameservers.temp >> nameservers
+rm nameservers.temp

 # Get top 1M subdomains
-
 wget http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip
 unzip top-1m.csv.zip
 sed 's|^[0-9]\+,||' top-1m.csv > subdomains/cisco-umbrella_popularity.cache.list
--- a/filter_subdomains.py
+++ b/filter_subdomains.py
@ -12,14 +12,22 @@ import progressbar
 import csv
 import typing

-import adblockparser
-
-OPTIONS = {"third-party": True}
+# DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']]
+DomainRule = typing.Union[bool, typing.Dict]

+RULES_DICT: DomainRule = dict()

 def subdomain_matching(subdomain: str) -> bool:
-    url = f"https://{subdomain}/"
-    return rules.should_block(url, OPTIONS)
+    """
+    Tell whether the subdomain is a registered rule domain
+    or a subdomain of one.
+    """
+    # Walk the rules tree label by label, top-level label first
+    node = RULES_DICT
+    for label in reversed(subdomain.split('.')):
+        if isinstance(node, bool) or label not in node:
+            break
+        node = node[label]
+    # Only a leaf carries a verdict; stopping on a branch means
+    # no rule covers this name
+    return node if isinstance(node, bool) else False


 def get_matching(chain: typing.List[str], no_explicit: bool = False
@ -35,6 +43,21 @@ def get_matching(chain: typing.List[str], no_explicit: bool = False
        yield initial


+def register_rule(subdomain: str) -> None:
+    """
+    Add a domain to RULES_DICT so that it and its subdomains match.
+
+    The tree is keyed by domain labels, top-level label first, and a
+    True leaf marks a domain matched as a whole. Empty rules (such as
+    a blank line of the rules file) are ignored: they would register
+    an empty top-level label and make every name ending with a dot
+    match.
+    """
+    if not subdomain:
+        return
+    *parents, leaf = reversed(subdomain.split('.'))
+    node = RULES_DICT
+    for label in parents:
+        node = node.setdefault(label, dict())
+        if isinstance(node, bool):
+            # A parent domain is already matched as a whole
+            return
+    # Replaces any more specific rules previously stored below
+    node[leaf] = True
+
 if __name__ == '__main__':

    # Parsing arguments
@ -54,9 +77,6 @@ if __name__ == '__main__':
        help="Rules file")
    args = parser.parse_args()

-    # Reading rules
-    rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules)
-
    # Progress bar
    widgets = [
        progressbar.Percentage(),
@ -67,14 +87,17 @@ if __name__ == '__main__':
        ' ', progressbar.AdaptiveETA(),
    ]
    progress = progressbar.ProgressBar(widgets=widgets)
+
+    # Reading rules
+    for rule in args.rules:
+        register_rule(rule.strip())
+
+    # Reading domains to filter
    if args.input.seekable():
        progress.max_value = len(args.input.readlines())
        args.input.seek(0)

-    # Cleaning input
    reader = csv.reader(args.input)
-
-    # Filtering
    progress.start()
    for chain in reader:
        for match in get_matching(chain, no_explicit=args.no_explicit):
--- a/filter_subdomains.sh
+++ b/filter_subdomains.sh
@ -1,14 +1,22 @@
 #!/usr/bin/env bash

-# Resolve the CNAME chain of all the known subdomains for later analysis
-cat subdomains/*.list | sort -u > temp/all_subdomains.list
-./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv
-sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv
+if [ ! -f temp/all_resolved.csv ]
+then
+    echo "Run ./resolve_subdomains.sh first!"
+    exit 1
+fi
+
+# Gather all the rules for filtering
+cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt
+./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list
+cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
+cat rules/*.list | sort -u > temp/all_rules.list

 # Filter out the subdomains not pointing to a first-party tracker
-cat rules/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules.txt
-./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_resolved_sorted.csv --output dist/firstparty-trackers.txt
-./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_resolved_sorted.csv --no-explicit --output dist/firstparty-only-trackers.txt
+./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
+sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
+./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
+sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt

 # Format the blocklist so it can be used as a hostlist
 function generate_hosts {
@ -30,7 +38,7 @@ function generate_hosts {
        echo "# Generation version: eulaurarien $(git describe --tags)"
        echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
        echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
-        echo "# Number of trackers identification rules : $(wc -l temp/all_rules.txt | cut -d' ' -f1)"
+        echo "# Number of trackers identification rules : $(wc -l temp/all_rules.list | cut -d' ' -f1)"
        echo "# Number of tracker subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
        echo "# Number of first-party subdomains: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
        echo
@ -41,5 +49,5 @@ function generate_hosts {
    ) > "dist/$basename-hosts.txt"
 }

-generate_hosts "firstparty-trackers" "Also contains trackers used in third-party"
-generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in conjuction with EasyPrivacy."
+generate_hosts "firstparty-trackers" "Also contains trackers used as third-party."
+generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in combination with third-party lists."
--- a/resolve_subdomains.sh
+++ b/resolve_subdomains.sh
@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+# Merge every known subdomain list into a single deduplicated one
+sort -u subdomains/*.list > temp/all_subdomains.list
+
+# Resolve the CNAME chain of each subdomain for later analysis
+./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv
+
+# Deduplicate the resolved chains
+sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv
+
--- a/rules/.gitignore
+++ b/rules/.gitignore
@ -1,2 +1,2 @@
-*.custom.txt
-*.cache.txt
+*.custom.list
+*.cache.list
--- a/rules/first-party.list
+++ b/rules/first-party.list
@ -0,0 +1 @@
+at-o.net
--- a/rules/first-party.txt
+++ b/rules/first-party.txt
@ -1 +0,0 @@
-||at-o.net^
--- a/rules_adblock/.gitignore
+++ b/rules_adblock/.gitignore
@ -0,0 +1,2 @@
+*.custom.txt
+*.cache.txt
--- a/rules_hosts/.gitignore
+++ b/rules_hosts/.gitignore
@ -0,0 +1,2 @@
+*.custom.txt
+*.cache.txt