Compare commits
No commits in common. "c609b903904e194d635c11514eb029d731439e86" and "c23004fbff58aa674ee74d26d60dda118cd151c7" have entirely different histories.
c609b90390
...
c23004fbff
|
@ -1,49 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
# pylint: disable=C0103
|
|
||||||
|
|
||||||
"""
|
|
||||||
Extract the domains to block as a whole
|
|
||||||
from a AdBlock rules list.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import sys
|
|
||||||
import typing
|
|
||||||
|
|
||||||
import abp.filters
|
|
||||||
|
|
||||||
|
|
||||||
def get_domains(rule: 'abp.filters.parser.Filter') -> typing.Iterable[str]:
    """
    Yield the domain that a rule blocks as a whole, if there is one.

    A rule qualifies only when it is a blocking rule of the exact form
    ``||domain^``, optionally restricted with the ``third-party`` option.
    Anything else (exception rules, rules carrying other options, regular
    expressions, patterns with a path or a wildcard) yields nothing.

    :param rule: a parsed filter exposing ``action``, ``options`` (an
        iterable of ``(key, value)`` pairs) and ``selector`` (a mapping
        with ``type`` and ``value`` entries).
    :return: an iterable holding zero or one domain name.
    """
    # Exception rules (``@@||domain^``) share the selector of a blocking
    # rule but whitelist the domain: they must never end up in a blocklist.
    if rule.action != 'block':
        return

    # Any option besides third-party narrows the rule to a subset of the
    # requests, so the domain is not blocked as a whole.
    if any(key != 'third-party' for key, _ in rule.options):
        return

    selector = rule.selector
    if selector['type'] != 'url-pattern':
        return

    pattern = selector['value']
    if not (pattern.startswith('||') and pattern.endswith('^')):
        return

    domain = pattern[2:-1]
    # A path, a wildcard or a further anchor means the rule targets specific
    # URLs on the host rather than the entire host.
    if not domain or any(char in domain for char in '/*^|'):
        return

    yield domain
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Extract whole domains from an AdBlock blocking list")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Input file with AdBlock rules")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file with one tracking domain per line")
    args = parser.parse_args()

    # Reading rules (parsed from the input line by line)
    rules = abp.filters.parse_filterlist(args.input)

    # Filtering: only actual filter lines can designate a domain; the parser
    # also returns other line types (comments, headers, ...) that are skipped.
    for rule in rules:
        if not isinstance(rule, abp.filters.parser.Filter):
            continue
        for domain in get_domains(rule):
            print(domain, file=args.output)
|
|
|
@ -4,6 +4,5 @@
|
||||||
|
|
||||||
./fetch_resources.sh
|
./fetch_resources.sh
|
||||||
./collect_subdomains.sh
|
./collect_subdomains.sh
|
||||||
./resolve_subdomains.sh
|
|
||||||
./filter_subdomains.sh
|
./filter_subdomains.sh
|
||||||
|
|
||||||
|
|
|
@ -1,31 +1,19 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
# Download the URL given as $1 into the file given as $2.
# Prints a progress line, and reports a failed download on stderr.
# Returns 0 on success, 1 when curl fails.
function dl() {
    echo "Downloading $1 to $2..."
    # --fail: make HTTP errors (4xx/5xx) a curl failure instead of silently
    #         saving the server's error page as if it were the resource.
    # --location: follow redirects to the final resource.
    if ! curl --silent --fail --location "$1" > "$2"
    then
        echo "Failed!" >&2
        return 1
    fi
}
|
|
||||||
|
|
||||||
# Get rules
|
# Get rules
|
||||||
dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt
|
curl https://easylist.to/easylist/easyprivacy.txt > rules/easyprivacy.cache.txt
|
||||||
dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt
|
|
||||||
|
|
||||||
# Get a list of nameservers
|
# Get a list of nameservers
|
||||||
|
|
||||||
rm -f nameservers
|
rm -f nameservers
|
||||||
touch nameservers
|
touch nameservers
|
||||||
[ -f nameservers.head ] && cat nameservers.head >> nameservers
|
[ -f nameservers.head ] && cat nameservers.head >> nameservers
|
||||||
dl https://public-dns.info/nameservers.txt nameservers.temp
|
curl https://public-dns.info/nameservers.txt | sort -R >> nameservers
|
||||||
sort -R nameservers.temp >> nameservers
|
|
||||||
rm nameservers.temp
|
|
||||||
|
|
||||||
# Get top 1M subdomains
|
# Get top 1M subdomains
|
||||||
dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip
|
|
||||||
unzip top-1m.csv.zip
|
wget http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip
|
||||||
sed 's|^[0-9]\+,||' top-1m.csv > temp/cisco-umbrella_popularity.fresh.list
|
unzip top-1m.csv.zip
|
||||||
rm top-1m.csv top-1m.csv.zip
|
sed 's|^[0-9]\+,||' top-1m.csv > subdomains/cisco-umbrella_popularity.cache.list
|
||||||
cp subdomains/cisco-umbrella_popularity.cache.list temp/cisco-umbrella_popularity.old.list
|
rm top-1m.csv top-1m.csv.zip
|
||||||
cat temp/cisco-umbrella_popularity.old.list temp/cisco-umbrella_popularity.fresh.list | sort -u > subdomains/cisco-umbrella_popularity.cache.list
|
|
||||||
|
|
||||||
|
|
|
@ -12,22 +12,14 @@ import progressbar
|
||||||
import csv
|
import csv
|
||||||
import typing
|
import typing
|
||||||
|
|
||||||
# DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']]
|
import adblockparser
|
||||||
DomainRule = typing.Union[bool, typing.Dict]
|
|
||||||
|
OPTIONS = {"third-party": True}
|
||||||
|
|
||||||
RULES_DICT: DomainRule = dict()
|
|
||||||
|
|
||||||
def subdomain_matching(subdomain: str) -> bool:
|
def subdomain_matching(subdomain: str) -> bool:
|
||||||
parts = subdomain.split('.')
|
url = f"https://{subdomain}/"
|
||||||
parts.reverse()
|
return rules.should_block(url, OPTIONS)
|
||||||
dic = RULES_DICT
|
|
||||||
for part in parts:
|
|
||||||
if isinstance(dic, bool) or part not in dic:
|
|
||||||
break
|
|
||||||
dic = dic[part]
|
|
||||||
if isinstance(dic, bool):
|
|
||||||
return dic
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def get_matching(chain: typing.List[str], no_explicit: bool = False
|
def get_matching(chain: typing.List[str], no_explicit: bool = False
|
||||||
|
@ -43,21 +35,6 @@ def get_matching(chain: typing.List[str], no_explicit: bool = False
|
||||||
yield initial
|
yield initial
|
||||||
|
|
||||||
|
|
||||||
def register_rule(subdomain: str) -> None:
    """
    Register a domain in RULES_DICT so that it and its subdomains match.

    RULES_DICT is a tree keyed by domain labels, from the top-level domain
    down. A node is either a dict of deeper labels or ``True``, which marks
    a registered domain.

    :param subdomain: domain name of the rule, e.g. ``tracker.example.com``.
        An empty string (e.g. from a blank line) is ignored.
    """
    # An empty rule would store an '' label at the root of the tree, which
    # any name written with a trailing dot ("example.com.") then matches.
    if not subdomain:
        return

    # Labels from the TLD down; the last one is the leftmost label of the
    # rule, which is the one that gets flagged.
    *parents, leaf = reversed(subdomain.split('.'))

    node = RULES_DICT
    for part in parents:
        node = node.setdefault(part, {})
        if isinstance(node, bool):
            # A parent domain is already registered: it covers this rule.
            return
    # Overwrites any subtree: more specific rules below become redundant.
    node[leaf] = True
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
# Parsing arguments
|
# Parsing arguments
|
||||||
|
@ -77,6 +54,9 @@ if __name__ == '__main__':
|
||||||
help="Rules file")
|
help="Rules file")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Reading rules
|
||||||
|
rules: adblockparser.AdblockRules = adblockparser.AdblockRules(args.rules)
|
||||||
|
|
||||||
# Progress bar
|
# Progress bar
|
||||||
widgets = [
|
widgets = [
|
||||||
progressbar.Percentage(),
|
progressbar.Percentage(),
|
||||||
|
@ -87,17 +67,14 @@ if __name__ == '__main__':
|
||||||
' ', progressbar.AdaptiveETA(),
|
' ', progressbar.AdaptiveETA(),
|
||||||
]
|
]
|
||||||
progress = progressbar.ProgressBar(widgets=widgets)
|
progress = progressbar.ProgressBar(widgets=widgets)
|
||||||
|
|
||||||
# Reading rules
|
|
||||||
for rule in args.rules:
|
|
||||||
register_rule(rule.strip())
|
|
||||||
|
|
||||||
# Reading domains to filter
|
|
||||||
if args.input.seekable():
|
if args.input.seekable():
|
||||||
progress.max_value = len(args.input.readlines())
|
progress.max_value = len(args.input.readlines())
|
||||||
args.input.seek(0)
|
args.input.seek(0)
|
||||||
|
|
||||||
|
# Cleaning input
|
||||||
reader = csv.reader(args.input)
|
reader = csv.reader(args.input)
|
||||||
|
|
||||||
|
# Filtering
|
||||||
progress.start()
|
progress.start()
|
||||||
for chain in reader:
|
for chain in reader:
|
||||||
for match in get_matching(chain, no_explicit=args.no_explicit):
|
for match in get_matching(chain, no_explicit=args.no_explicit):
|
||||||
|
|
|
@ -1,22 +1,14 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
if [ ! -f temp/all_resolved.csv ]
|
# Resolve the CNAME chain of all the known subdomains for later analysis
|
||||||
then
|
cat subdomains/*.list | sort -u > temp/all_subdomains.list
|
||||||
echo "Run ./resolve_subdomains.sh first!"
|
./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv
|
||||||
exit 1
|
sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv
|
||||||
fi
|
|
||||||
|
|
||||||
# Gather all the rules for filtering
|
|
||||||
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules_adblock.txt
|
|
||||||
./adblock_to_domain_list.py --input temp/all_rules_adblock.txt --output rules/from_adblock.cache.list
|
|
||||||
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
|
|
||||||
cat rules/*.list | sort -u > temp/all_rules.list
|
|
||||||
|
|
||||||
# Filter out the subdomains not pointing to a first-party tracker
|
# Filter out the subdomains not pointing to a first-party tracker
|
||||||
./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
|
cat rules/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/all_rules.txt
|
||||||
sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
|
./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_resolved_sorted.csv --output dist/firstparty-trackers.txt
|
||||||
./filter_subdomains.py --rules temp/all_rules.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
|
./filter_subdomains.py --rules temp/all_rules.txt --input temp/all_resolved_sorted.csv --no-explicit --output dist/firstparty-only-trackers.txt
|
||||||
sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt
|
|
||||||
|
|
||||||
# Format the blocklist so it can be used as a hostlist
|
# Format the blocklist so it can be used as a hostlist
|
||||||
function generate_hosts {
|
function generate_hosts {
|
||||||
|
@ -38,7 +30,7 @@ function generate_hosts {
|
||||||
echo "# Generation version: eulaurarien $(git describe --tags)"
|
echo "# Generation version: eulaurarien $(git describe --tags)"
|
||||||
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
|
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
|
||||||
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
|
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
|
||||||
echo "# Number of trackers identification rules : $(wc -l temp/all_rules.list | cut -d' ' -f1)"
|
echo "# Number of trackers identification rules : $(wc -l temp/all_rules.txt | cut -d' ' -f1)"
|
||||||
echo "# Number of tracker subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
|
echo "# Number of tracker subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
|
||||||
echo "# Number of first-party subdomains: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
|
echo "# Number of first-party subdomains: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
|
||||||
echo
|
echo
|
||||||
|
@ -49,5 +41,5 @@ function generate_hosts {
|
||||||
) > "dist/$basename-hosts.txt"
|
) > "dist/$basename-hosts.txt"
|
||||||
}
|
}
|
||||||
|
|
||||||
generate_hosts "firstparty-trackers" "Also contains trackers used as third-party."
|
generate_hosts "firstparty-trackers" "Also contains trackers used in third-party"
|
||||||
generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in combination with third-party lists."
|
generate_hosts "firstparty-only-trackers" "Do not contain trackers used in third-party. Use in conjuction with EasyPrivacy."
|
||||||
|
|
|
@ -1,7 +0,0 @@
|
||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# Resolve the CNAME chain of all the known subdomains for later analysis
|
|
||||||
cat subdomains/*.list | sort -u > temp/all_subdomains.list
|
|
||||||
./resolve_subdomains.py --input temp/all_subdomains.list --output temp/all_resolved.csv
|
|
||||||
sort -u temp/all_resolved.csv > temp/all_resolved_sorted.csv
|
|
||||||
|
|
4
rules/.gitignore
vendored
4
rules/.gitignore
vendored
|
@ -1,2 +1,2 @@
|
||||||
*.custom.list
|
*.custom.txt
|
||||||
*.cache.list
|
*.cache.txt
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
at-o.net
|
|
1
rules/first-party.txt
Normal file
1
rules/first-party.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
||at-o.net^
|
2
rules_adblock/.gitignore
vendored
2
rules_adblock/.gitignore
vendored
|
@ -1,2 +0,0 @@
|
||||||
*.custom.txt
|
|
||||||
*.cache.txt
|
|
2
rules_hosts/.gitignore
vendored
2
rules_hosts/.gitignore
vendored
|
@ -1,2 +0,0 @@
|
||||||
*.custom.txt
|
|
||||||
*.cache.txt
|
|
Loading…
Reference in a new issue