Compare commits
No commits in common. "0b2eb000c391d9a56e8ba611c1e5b7807eeb21ec" and "28e33dcc7a1b6c5883aa9248f1d0ee48fc913494" have entirely different histories.
0b2eb000c3
...
28e33dcc7a
|
@ -11,18 +11,13 @@ import sys
|
||||||
import progressbar
|
import progressbar
|
||||||
import csv
|
import csv
|
||||||
import typing
|
import typing
|
||||||
import ipaddress
|
|
||||||
|
|
||||||
# DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']]
|
# DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']]
|
||||||
DomainRule = typing.Union[bool, typing.Dict]
|
DomainRule = typing.Union[bool, typing.Dict]
|
||||||
|
|
||||||
RULES_DICT: DomainRule = dict()
|
RULES_DICT: DomainRule = dict()
|
||||||
RULES_IP: typing.Set[ipaddress.IPv4Network] = set()
|
|
||||||
|
|
||||||
|
|
||||||
def subdomain_matching(subdomain: str) -> bool:
|
def subdomain_matching(subdomain: str) -> bool:
|
||||||
if not RULES_DICT:
|
|
||||||
return False
|
|
||||||
parts = subdomain.split('.')
|
parts = subdomain.split('.')
|
||||||
parts.reverse()
|
parts.reverse()
|
||||||
dic = RULES_DICT
|
dic = RULES_DICT
|
||||||
|
@ -35,28 +30,16 @@ def subdomain_matching(subdomain: str) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def ip_matching(ip_str: str) -> bool:
|
|
||||||
if not RULES_IP:
|
|
||||||
return False
|
|
||||||
ip = ipaddress.ip_address(ip_str)
|
|
||||||
for net in RULES_IP:
|
|
||||||
if ip in net:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def get_matching(chain: typing.List[str], no_explicit: bool = False
|
def get_matching(chain: typing.List[str], no_explicit: bool = False
|
||||||
) -> typing.Iterable[str]:
|
) -> typing.Iterable[str]:
|
||||||
if len(chain) <= 1:
|
|
||||||
return
|
|
||||||
initial = chain[0]
|
initial = chain[0]
|
||||||
cname_destinations = chain[1:-1]
|
cname_destinations = chain[1:-1]
|
||||||
a_destination = chain[-1]
|
# a_destination = chain[-1]
|
||||||
initial_matching = subdomain_matching(initial)
|
initial_matching = subdomain_matching(initial)
|
||||||
if no_explicit and initial_matching:
|
if no_explicit and initial_matching:
|
||||||
return
|
return
|
||||||
cname_matching = any(map(subdomain_matching, cname_destinations))
|
cname_matching = any(map(subdomain_matching, cname_destinations))
|
||||||
if cname_matching or initial_matching or ip_matching(a_destination):
|
if cname_matching or initial_matching:
|
||||||
yield initial
|
yield initial
|
||||||
|
|
||||||
|
|
||||||
|
@ -75,14 +58,6 @@ def register_rule(subdomain: str) -> None:
|
||||||
dic.setdefault(part, dict())
|
dic.setdefault(part, dict())
|
||||||
dic = dic[part]
|
dic = dic[part]
|
||||||
|
|
||||||
|
|
||||||
def register_rule_ip(network: str) -> None:
|
|
||||||
net = ipaddress.ip_network(network)
|
|
||||||
RULES_IP.add(net)
|
|
||||||
# If RULES_IP starts growing much bigger,
|
|
||||||
# we might implement a binary tree for performance
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
# Parsing arguments
|
# Parsing arguments
|
||||||
|
@ -98,11 +73,8 @@ if __name__ == '__main__':
|
||||||
'-n', '--no-explicit', action='store_true',
|
'-n', '--no-explicit', action='store_true',
|
||||||
help="Don't output domains already blocked with rules without CNAME")
|
help="Don't output domains already blocked with rules without CNAME")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-r', '--rules', type=argparse.FileType('r'),
|
'-r', '--rules', type=argparse.FileType('r'), default='rules',
|
||||||
help="List of domains domains to block (with their subdomains)")
|
help="Rules file")
|
||||||
parser.add_argument(
|
|
||||||
'-p', '--rules-ip', type=argparse.FileType('r'),
|
|
||||||
help="List of IPs ranges to block")
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Progress bar
|
# Progress bar
|
||||||
|
@ -117,12 +89,8 @@ if __name__ == '__main__':
|
||||||
progress = progressbar.ProgressBar(widgets=widgets)
|
progress = progressbar.ProgressBar(widgets=widgets)
|
||||||
|
|
||||||
# Reading rules
|
# Reading rules
|
||||||
if args.rules:
|
|
||||||
for rule in args.rules:
|
for rule in args.rules:
|
||||||
register_rule(rule.strip())
|
register_rule(rule.strip())
|
||||||
if args.rules_ip:
|
|
||||||
for rule in args.rules_ip:
|
|
||||||
register_rule_ip(rule.strip())
|
|
||||||
|
|
||||||
# Approximating line count
|
# Approximating line count
|
||||||
if args.input.seekable():
|
if args.input.seekable():
|
||||||
|
|
|
@ -13,23 +13,21 @@ cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/a
|
||||||
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
|
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
|
||||||
cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_multi.list
|
cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_multi.list
|
||||||
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_first.list
|
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_first.list
|
||||||
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_multi.txt
|
|
||||||
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_first.txt
|
|
||||||
|
|
||||||
echo "Filtering first-party tracking domains..." > /dev/stderr
|
echo "Filtering first-party tracking domains..." > /dev/stderr
|
||||||
./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
|
./filter_subdomains.py --rules temp/all_rules_first.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
|
||||||
sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
|
sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
|
||||||
|
|
||||||
echo "Filtering first-party curated tracking domains..." > /dev/stderr
|
echo "Filtering first-party curated tracking domains..." > /dev/stderr
|
||||||
./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
|
./filter_subdomains.py --rules temp/all_rules_first.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
|
||||||
sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt
|
sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt
|
||||||
|
|
||||||
echo "Filtering multi-party tracking domains..." > /dev/stderr
|
echo "Filtering multi-party tracking domains..." > /dev/stderr
|
||||||
./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list
|
./filter_subdomains.py --rules temp/all_rules_multi.list --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list
|
||||||
sort -u temp/multiparty-trackers.list > dist/multiparty-trackers.txt
|
sort -u temp/multiparty-trackers.list > dist/multiparty-trackers.txt
|
||||||
|
|
||||||
echo "Filtering multi-party curated tracking domains..." > /dev/stderr
|
echo "Filtering multi-party curated tracking domains..." > /dev/stderr
|
||||||
./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list
|
./filter_subdomains.py --rules temp/all_rules_multi.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list
|
||||||
sort -u temp/multiparty-only-trackers.list > dist/multiparty-only-trackers.txt
|
sort -u temp/multiparty-only-trackers.list > dist/multiparty-only-trackers.txt
|
||||||
|
|
||||||
# Format the blocklist so it can be used as a hostlist
|
# Format the blocklist so it can be used as a hostlist
|
||||||
|
|
|
@ -13,5 +13,3 @@ storetail.io
|
||||||
keyade.com
|
keyade.com
|
||||||
# Adobe Experience Cloud
|
# Adobe Experience Cloud
|
||||||
omtrdc.net
|
omtrdc.net
|
||||||
# ThreatMetrix
|
|
||||||
online-metrix.net
|
|
||||||
|
|
2
rules_ip/.gitignore
vendored
2
rules_ip/.gitignore
vendored
|
@ -1,2 +0,0 @@
|
||||||
*.custom.txt
|
|
||||||
*.cache.txt
|
|
|
@ -1,15 +0,0 @@
|
||||||
# Eulerian (AS50234 EULERIAN TECHNOLOGIES S.A.S.)
|
|
||||||
109.232.192.0/21
|
|
||||||
# Criteo (TODO: add more AS ranges)
|
|
||||||
178.250.0.0/21
|
|
||||||
91.212.98.0/24
|
|
||||||
91.199.242.0/24
|
|
||||||
185.235.84.0/24
|
|
||||||
# ThreatMetrix (AS30286 ThreatMetrix Inc.)
|
|
||||||
69.84.176.0/24
|
|
||||||
173.254.179.0/24
|
|
||||||
185.32.240.0/23
|
|
||||||
185.32.242.0/23
|
|
||||||
192.225.156.0/22
|
|
||||||
199.101.156.0/23
|
|
||||||
199.101.158.0/23
|
|
Loading…
Reference in a new issue