Browse Source

Improved rules handling

Rules can now come in 3 different formats:
- AdBlock rules
- Host lists
- Domain lists
All will be converted into domain lists and aggregated
(only AdBlock rules matching a whole domain will be kept).

A subdomain will now be matched if it is a subdomain of any domain of a
rule.
It is way faster (seconds rather than hours!) but less flexible
(although it shouldn't be a problem).
tags/v1.4
Geoffrey Frogeye 2 months ago
parent
commit
69b82d29fd
Signed by: Geoffrey “Frogeye” Preud'homme <geoffrey@frogeye.fr> GPG Key ID: D8A7ECA00A8CD3DD
11 changed files with 130 additions and 28 deletions
  1. +49
    -0
      adblock_to_domain_list.py
  2. +1
    -0
      eulaurarien.sh
  3. +14
    -4
      fetch_resources.sh
  4. +34
    -11
      filter_subdomains.py
  5. +18
    -10
      filter_subdomains.sh
  6. +7
    -0
      resolve_subdomains.sh
  7. +2
    -2
      rules/.gitignore
  8. +1
    -0
      rules/first-party.list
  9. +0
    -1
      rules/first-party.txt
  10. +2
    -0
      rules_adblock/.gitignore
  11. +2
    -0
      rules_hosts/.gitignore

+ 49
- 0
adblock_to_domain_list.py View File

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# pylint: disable=C0103

"""
Extract the domains to block as a whole
from an AdBlock rules list.
"""

import argparse
import sys
import typing

import abp.filters


def get_domains(rule: abp.filters.parser.Filter) -> typing.Iterable[str]:
    """
    Yield the domain targeted by an AdBlock rule, if it blocks a whole domain.

    Nothing is yielded when the rule carries any option other than
    'third-party', or when its selector is not a URL pattern of the
    form ``||<something>^``. Otherwise the text between the leading
    ``||`` and the trailing ``^`` is yielded.
    """
    # Any option besides 'third-party' disqualifies the rule.
    if any(name != 'third-party' for name, _ in rule.options):
        return

    selector = rule.selector
    kind = selector['type']
    pattern = selector['value']

    if kind != 'url-pattern':
        return
    if pattern.startswith('||') and pattern.endswith('^'):
        # Strip the '||' anchor and the '^' separator.
        yield pattern[2:-1]


if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Extract the domains to block as a whole "
                    "from an AdBlock rules list")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Input file with AdBlock rules")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file with one rule tracking subdomain per line")
    args = parser.parse_args()

    # Reading rules (defaults to standard input)
    rules = abp.filters.parse_filterlist(args.input)

    # Filtering: only Filter entries are passed on to get_domains();
    # every other kind of parsed line is skipped.
    for rule in rules:
        if not isinstance(rule, abp.filters.parser.Filter):
            continue
        # One extracted domain per output line
        for domain in get_domains(rule):
            print(domain, file=args.output)

+ 1
- 0
eulaurarien.sh View File

@@ -4,5 +4,6 @@

./fetch_resources.sh
./collect_subdomains.sh
./resolve_subdomains.sh
./filter_subdomains.sh


+ 14
- 4
fetch_resources.sh View File

@@ -1,17 +1,27 @@
#!/usr/bin/env bash

# Download the URL given as $1 into the file given as $2.
# Prints a progress line on stdout; on failure prints "Failed!" on stderr
# and returns 1 so callers can react if they wish.
function dl() {
    echo "Downloading $1 to $2..."
    # --fail makes curl exit non-zero on HTTP errors (4xx/5xx) instead of
    # saving the server's error page as if it were the requested data.
    if ! curl --silent --fail "$1" > "$2"
    then
        echo "Failed!" >&2
        return 1
    fi
}

# Get rules
curl https://easylist.to/easylist/easyprivacy.txt > rules/easyprivacy.cache.txt
dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt
dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt

# Get a list of nameservers

rm -f nameservers
touch nameservers
[ -f nameservers.head ] && cat nameservers.head >> nameservers
curl https://public-dns.info/nameservers.txt | sort -R >> nameservers
dl https://public-dns.info/nameservers.txt nameservers.temp
sort -R nameservers.temp >> nameservers
rm nameservers.temp

# Get top 1M subdomains

wget http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip
unzip top-1m.csv.zip
sed 's|^[0-9]\+,||' top-1m.csv > subdomains/cisco-umbrella_popularity.cache.list

+ 34
- 11
filter_subdomains.py View File

@@ -12,14 +12,22 @@ import progressbar
import csv
import typing

import adblockparser

OPTIONS = {"third-party": True}
# DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']]
DomainRule = typing.Union[bool, typing.Dict]

RULES_DICT: DomainRule = dict()

# Decide whether `subdomain` matches a rule and return the verdict as a bool.
def subdomain_matching(subdomain: str) -> bool:
# NOTE(review): the two lines below build a URL and return immediately via
# `rules.should_block`, which makes the tree lookup underneath unreachable as
# written. Presumably they are the previous implementation shown alongside
# the new one in this diff view — confirm before relying on either path.
url = f"https://{subdomain}/"
return rules.should_block(url, OPTIONS)
# Walk RULES_DICT label by label, starting from the rightmost label
# (the split parts are reversed).
parts = subdomain.split('.')
parts.reverse()
dic = RULES_DICT
for part in parts:
# Stop when a bool leaf has been reached or the label has no entry.
if isinstance(dic, bool) or part not in dic:
break
dic = dic[part]
# A bool leaf is the verdict; stopping on a dict node means no match.
if isinstance(dic, bool):
return dic
return False


def get_matching(chain: typing.List[str], no_explicit: bool = False
@@ -35,6 +43,21 @@ def get_matching(chain: typing.List[str], no_explicit: bool = False
yield initial


def register_rule(subdomain: str) -> None:
    """
    Record a domain in the RULES_DICT tree.

    The domain is split on dots and its labels are inserted from the
    rightmost one inwards, one nesting level per label. The final label
    is stored as ``True``. If a ``True`` leaf is met on the way down,
    the tree is left unchanged.
    """
    # Rightmost label first; the last element is the leaf to mark.
    *ancestors, leaf = subdomain.split('.')[::-1]

    node = RULES_DICT
    for label in ancestors:
        if isinstance(node, bool):
            # A shorter rule already ends here.
            return
        node = node.setdefault(label, dict())

    if isinstance(node, bool):
        return
    node[leaf] = True

if __name__ == '__main__':

# Parsing arguments
@@ -54,9 +77,6 @@ if __name__ == '__main__':
help="Rules file")
args = parser.parse_args()

# Reading rules
rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules)

# Progress bar
widgets = [
progressbar.Percentage(),
@@ -67,14 +87,17 @@ if __name__ == '__main__':
' ', progressbar.AdaptiveETA(),
]
progress = progressbar.ProgressBar(widgets=widgets)

# Reading rules
for rule in args.rules:
register_rule(rule.strip())

# Reading domains to filter
if args.input.seekable():
progress.max_value = len(args.input.readlines())
args.input.seek(0)

# Cleaning input
reader = csv.reader(args.input)

# Filtering
progress.start()
for chain in reader:
for match in get_matching(chain, no_explicit=args.no_explicit):

+ 18
- 10
filter_subdomains.sh View File

@@ -1,14 +1,22 @@
#!/usr/bin/env bash

# Resolve the CNAME chain of all the known subdomains for later analysis
cat subdomains/*.list | sort -u > temp/all_subdomains.list
./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv
sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv
if [ ! -f temp/all_resolved.csv ]
then
echo "Run ./resolve_subdomains.sh first!"
exit 1
fi

# Gather all the rules for filtering
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt
./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
cat rules/*.list | sort -u > temp/all_rules.list

# Filter out the subdomains not pointing to a first-party tracker
cat rules/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules.txt
./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_resolved_sorted.csv --output dist/firstparty-trackers.txt
./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_resolved_sorted.csv --no-explicit --output dist/firstparty-only-trackers.txt
./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt

# Format the blocklist so it can be used as a hostlist
function generate_hosts {
@@ -30,7 +38,7 @@ function generate_hosts {
echo "# Generation version: eulaurarien $(git describe --tags)"
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of trackers identification rules : $(wc -l temp/all_rules.txt | cut -d' ' -f1)"
echo "# Number of trackers identification rules : $(wc -l temp/all_rules.list | cut -d' ' -f1)"
echo "# Number of tracker subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# Number of first-party subdomains: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
echo
@@ -41,5 +49,5 @@ function generate_hosts {
) > "dist/$basename-hosts.txt"
}

generate_hosts "firstparty-trackers" "Also contains trackers used in third-party"
generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in conjuction with EasyPrivacy."
generate_hosts "firstparty-trackers" "Also contains trackers used as third-party."
generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in combination with third-party lists."

+ 7
- 0
resolve_subdomains.sh View File

@@ -0,0 +1,7 @@
#!/usr/bin/env bash

# Resolve the CNAME chain of all the known subdomains for later analysis

# Merge and deduplicate every subdomain list. The files are passed to sort
# directly rather than through cat, so a list lacking a trailing newline does
# not get its last entry glued to the first entry of the next list.
sort -u subdomains/*.list > temp/all_subdomains.list

# Resolve the merged list, then deduplicate the resolved chains.
./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv
sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv


+ 2
- 2
rules/.gitignore View File

@@ -1,2 +1,2 @@
*.custom.txt
*.cache.txt
*.custom.list
*.cache.list

+ 1
- 0
rules/first-party.list View File

@@ -0,0 +1 @@
at-o.net

+ 0
- 1
rules/first-party.txt View File

@@ -1 +0,0 @@
||at-o.net^

+ 2
- 0
rules_adblock/.gitignore View File

@@ -0,0 +1,2 @@
*.custom.txt
*.cache.txt

+ 2
- 0
rules_hosts/.gitignore View File

@@ -0,0 +1,2 @@
*.custom.txt
*.cache.txt

Loading…
Cancel
Save