Browse Source

Filtering by IP range

Closes #5
newworkflow_parseropti
Geoffrey Frogeye 2 years ago
parent
commit
a5e768fe00
Signed by: geoffrey GPG Key ID: D8A7ECA00A8CD3DD
  1. 32
      filter_subdomains.py
  2. 10
      filter_subdomains.sh
  3. 2
      rules_ip/.gitignore
  4. 7
      rules_ip/first-party.txt

32
filter_subdomains.py

@ -11,11 +11,14 @@ import sys
import progressbar
import csv
import typing
import ipaddress
# DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']]
DomainRule = typing.Union[bool, typing.Dict]
RULES_DICT: DomainRule = dict()
RULES_IP: typing.Set[ipaddress.IPv4Network] = set()
def subdomain_matching(subdomain: str) -> bool:
parts = subdomain.split('.')
@ -30,16 +33,26 @@ def subdomain_matching(subdomain: str) -> bool:
return False
def ip_matching(ip_str: str) -> bool:
    """Tell whether an IP address belongs to any registered network.

    ip_str is parsed with ipaddress.ip_address, which raises ValueError
    when the string is not a valid IPv4 or IPv6 address. The parsed
    address is then checked against every network stored in RULES_IP.

    Returns True as soon as one network contains the address, False when
    none does (including when RULES_IP is empty).
    """
    address = ipaddress.ip_address(ip_str)
    # any() stops at the first containing network, like an early return.
    return any(address in network for network in RULES_IP)
def get_matching(chain: typing.List[str], no_explicit: bool = False
) -> typing.Iterable[str]:
if len(chain) <= 1:
return
initial = chain[0]
cname_destinations = chain[1:-1]
# a_destination = chain[-1]
a_destination = chain[-1]
initial_matching = subdomain_matching(initial)
if no_explicit and initial_matching:
return
cname_matching = any(map(subdomain_matching, cname_destinations))
if cname_matching or initial_matching:
if cname_matching or initial_matching or ip_matching(a_destination):
yield initial
@ -58,6 +71,14 @@ def register_rule(subdomain: str) -> None:
dic.setdefault(part, dict())
dic = dic[part]
def register_rule_ip(network: str) -> None:
    """Parse a network description and add it to the RULES_IP set.

    network is handed to ipaddress.ip_network (e.g. "178.250.0.0/21"),
    which raises ValueError when the string is not a valid network.
    """
    RULES_IP.add(ipaddress.ip_network(network))
    # If RULES_IP starts growing large,
    # we might implement a binary tree for performance
if __name__ == '__main__':
# Parsing arguments
@ -74,7 +95,10 @@ if __name__ == '__main__':
help="Don't output domains already blocked with rules without CNAME")
parser.add_argument(
'-r', '--rules', type=argparse.FileType('r'), default='rules',
help="Rules file")
help="List of domains domains to block (with their subdomains)")
parser.add_argument(
'-p', '--rules-ip', type=argparse.FileType('r'), default='rules-ip',
help="List of IPs ranges to block")
args = parser.parse_args()
# Progress bar
@ -91,6 +115,8 @@ if __name__ == '__main__':
# Reading rules
for rule in args.rules:
register_rule(rule.strip())
for rule in args.rules_ip:
register_rule_ip(rule.strip())
# Approximating line count
if args.input.seekable():

10
filter_subdomains.sh

@ -13,21 +13,23 @@ cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | sort -u > temp/a
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 > rules/from_hosts.cache.list
cat rules/*.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_multi.list
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | sort -u > temp/all_rules_first.list
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_multi.txt
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | sort -u > temp/all_ip_rules_first.txt
echo "Filtering first-party tracking domains..." > /dev/stderr
./filter_subdomains.py --rules temp/all_rules_first.list --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --output temp/firstparty-trackers.list
sort -u temp/firstparty-trackers.list > dist/firstparty-trackers.txt
echo "Filtering first-party curated tracking domains..." > /dev/stderr
./filter_subdomains.py --rules temp/all_rules_first.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
./filter_subdomains.py --rules temp/all_rules_first.list --rules-ip temp/all_ip_rules_first.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/firstparty-only-trackers.list
sort -u temp/firstparty-only-trackers.list > dist/firstparty-only-trackers.txt
echo "Filtering multi-party tracking domains..." > /dev/stderr
./filter_subdomains.py --rules temp/all_rules_multi.list --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list
./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --output temp/multiparty-trackers.list
sort -u temp/multiparty-trackers.list > dist/multiparty-trackers.txt
echo "Filtering multi-party curated tracking domains..." > /dev/stderr
./filter_subdomains.py --rules temp/all_rules_multi.list --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list
./filter_subdomains.py --rules temp/all_rules_multi.list --rules-ip temp/all_ip_rules_multi.txt --input temp/all_resolved_sorted.csv --no-explicit --output temp/multiparty-only-trackers.list
sort -u temp/multiparty-only-trackers.list > dist/multiparty-only-trackers.txt
# Format the blocklist so it can be used as a hostlist

2
rules_ip/.gitignore

@ -0,0 +1,2 @@
*.custom.txt
*.cache.txt

7
rules_ip/first-party.txt

@ -0,0 +1,7 @@
# Eulerian
109.232.192.0/21
# Criteo
178.250.0.0/21
91.212.98.0/24
91.199.242.0/24
185.235.84.0/24
Loading…
Cancel
Save