Compare commits
8 commits
e882e09b37
...
53b14c6ffa
Author | SHA1 | Date | |
---|---|---|---|
Geoffrey Frogeye | 53b14c6ffa | ||
Geoffrey Frogeye | c81be4825c | ||
Geoffrey Frogeye | 4a22054796 | ||
Geoffrey Frogeye | 06b745890c | ||
Geoffrey Frogeye | aca5023c3f | ||
Geoffrey Frogeye | dce35cb299 | ||
Geoffrey Frogeye | 747fe46ad0 | ||
Geoffrey Frogeye | b43cb1725c |
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,4 +1,2 @@
|
||||||
*.log
|
*.log
|
||||||
*.p
|
*.p
|
||||||
nameservers
|
|
||||||
nameservers.head
|
|
||||||
|
|
156
database.py
156
database.py
|
@ -9,6 +9,10 @@ import time
|
||||||
import logging
|
import logging
|
||||||
import coloredlogs
|
import coloredlogs
|
||||||
import pickle
|
import pickle
|
||||||
|
import numpy
|
||||||
|
import math
|
||||||
|
|
||||||
|
TLD_LIST: typing.Set[str] = set()
|
||||||
|
|
||||||
coloredlogs.install(
|
coloredlogs.install(
|
||||||
level='DEBUG',
|
level='DEBUG',
|
||||||
|
@ -199,6 +203,54 @@ class Database(Profiler):
|
||||||
Profiler.__init__(self)
|
Profiler.__init__(self)
|
||||||
self.log = logging.getLogger('db')
|
self.log = logging.getLogger('db')
|
||||||
self.load()
|
self.load()
|
||||||
|
self.ip4cache_shift: int = 32
|
||||||
|
self.ip4cache = numpy.ones(1)
|
||||||
|
|
||||||
|
def _set_ip4cache(self, path: Path, _: Match) -> None:
    """
    Flag in the IPv4 cache every bucket touched by the given network.

    Has the (path, match) callback signature so it can be handed
    to exec_each_ip4; the match argument is not used.
    """
    assert isinstance(path, Ip4Path)
    self.enter_step('set_ip4cache')
    # Bucket holding the first address of the network
    first = path.value >> self.ip4cache_shift
    # Bucket holding the address right after the end of the network
    network_size = 2**(32-path.prefixlen)
    past_last = (path.value + network_size) >> self.ip4cache_shift
    if first != past_last:
        self.ip4cache[first:past_last] = True
    else:
        # The network is smaller than a bucket: the slice would be empty
        self.ip4cache[first] = True
|
||||||
|
|
||||||
|
def fill_ip4cache(self, max_size: int = 512*1024**2) -> None:
    """
    Allocate the IPv4 lookup cache and fill it from the stored IPv4 rules.

    The cache has 2**cache_width entries, and self.ip4cache_shift is the
    right shift that turns a 32-bit address into an index of that cache.

    max_size: Size in bytes, counted as one bit per cache entry.
    """
    if max_size > 2**32/8:
        self.log.warning("Allocating more than 512 MiB of RAM for "
                         "the Ip4 cache is not necessary.")
    max_cache_width = int(math.log2(max(1, max_size*8)))
    # An IPv4 address is 32 bits wide: a wider cache would yield
    # a negative shift, which the >> operator rejects
    cache_width = min(32, max_cache_width)
    self.ip4cache_shift = 32-cache_width
    cache_size = 2**cache_width
    # Builtin bool as dtype: the numpy.bool alias does not exist
    # in every NumPy release
    # NOTE(review): NumPy stores a boolean element in one byte, so the
    # array may take up to 8 times max_size — confirm this is intended
    self.ip4cache = numpy.zeros(cache_size, dtype=bool)
    # exec_each_ip4 is consumed as an iterator; only the side effects
    # of the callback matter here
    for _ in self.exec_each_ip4(self._set_ip4cache):
        pass
|
||||||
|
|
||||||
|
@staticmethod
def populate_tld_list() -> None:
    """
    Add every line of temp/all_tld.list, stripped of surrounding
    whitespace, to the module-level TLD_LIST set.
    """
    with open('temp/all_tld.list', 'r') as tld_fdesc:
        TLD_LIST.update(line.strip() for line in tld_fdesc)
|
||||||
|
|
||||||
|
@staticmethod
def validate_domain(path: str) -> bool:
    """
    Tell if the string looks like a domain name: at most 255 characters,
    last label present in TLD_LIST, every label 1 to 63 characters long.

    TLD_LIST is loaded on first use if it is still empty.
    """
    if len(path) > 255:
        return False
    labels = path.split('.')
    if not TLD_LIST:
        Database.populate_tld_list()
    if labels[-1] not in TLD_LIST:
        return False
    return all(1 <= len(label) <= 63 for label in labels)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def pack_domain(domain: str) -> DomainPath:
|
def pack_domain(domain: str) -> DomainPath:
|
||||||
|
@ -219,6 +271,19 @@ class Database(Profiler):
|
||||||
def unpack_asn(asn: AsnPath) -> str:
|
def unpack_asn(asn: AsnPath) -> str:
|
||||||
return f'AS{asn.asn}'
|
return f'AS{asn.asn}'
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def validate_ip4address(path: str) -> bool:
|
||||||
|
splits = path.split('.')
|
||||||
|
if len(splits) != 4:
|
||||||
|
return False
|
||||||
|
for split in splits:
|
||||||
|
try:
|
||||||
|
if not 0 <= int(split) <= 255:
|
||||||
|
return False
|
||||||
|
except ValueError:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def pack_ip4address(address: str) -> Ip4Path:
|
def pack_ip4address(address: str) -> Ip4Path:
|
||||||
addr = 0
|
addr = 0
|
||||||
|
@ -237,6 +302,21 @@ class Database(Profiler):
|
||||||
addr >>= 8
|
addr >>= 8
|
||||||
return '.'.join(map(str, octets))
|
return '.'.join(map(str, octets))
|
||||||
|
|
||||||
|
@staticmethod
def validate_ip4network(path: str) -> bool:
    """
    Tell if the string has the address/prefixlen form, with an address
    accepted by validate_ip4address and a prefix length between 0 and 32.
    """
    # A bit generous but ok for our usage
    parts = path.split('/')
    if len(parts) != 2:
        return False
    address, prefixlen = parts
    if not Database.validate_ip4address(address):
        return False
    try:
        return 0 <= int(prefixlen) <= 32
    except ValueError:
        return False
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def pack_ip4network(network: str) -> Ip4Path:
|
def pack_ip4network(network: str) -> Ip4Path:
|
||||||
address, prefixlen_str = network.split('/')
|
address, prefixlen_str = network.split('/')
|
||||||
|
@ -353,8 +433,9 @@ class Database(Profiler):
|
||||||
pref = _par.prefixlen + 1
|
pref = _par.prefixlen + 1
|
||||||
dic = _dic.zero
|
dic = _dic.zero
|
||||||
if dic:
|
if dic:
|
||||||
addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-pref)))
|
# addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-pref)))
|
||||||
assert addr0 == _par.value
|
# assert addr0 == _par.value
|
||||||
|
addr0 = _par.value
|
||||||
yield from self.exec_each_ip4(
|
yield from self.exec_each_ip4(
|
||||||
callback,
|
callback,
|
||||||
_dic=dic,
|
_dic=dic,
|
||||||
|
@ -364,6 +445,7 @@ class Database(Profiler):
|
||||||
dic = _dic.one
|
dic = _dic.one
|
||||||
if dic:
|
if dic:
|
||||||
addr1 = _par.value | (1 << (32-pref))
|
addr1 = _par.value | (1 << (32-pref))
|
||||||
|
# assert addr1 != _par.value
|
||||||
yield from self.exec_each_ip4(
|
yield from self.exec_each_ip4(
|
||||||
callback,
|
callback,
|
||||||
_dic=dic,
|
_dic=dic,
|
||||||
|
@ -409,62 +491,56 @@ class Database(Profiler):
|
||||||
string += f' ← {self.explain(match.source)}'
|
string += f' ← {self.explain(match.source)}'
|
||||||
return string
|
return string
|
||||||
|
|
||||||
def export(self,
|
def list_records(self,
|
||||||
first_party_only: bool = False,
|
first_party_only: bool = False,
|
||||||
end_chain_only: bool = False,
|
end_chain_only: bool = False,
|
||||||
no_dupplicates: bool = False,
|
no_dupplicates: bool = False,
|
||||||
explain: bool = False,
|
rules_only: bool = False,
|
||||||
) -> typing.Iterable[str]:
|
hostnames_only: bool = False,
|
||||||
|
explain: bool = False,
|
||||||
|
) -> typing.Iterable[str]:
|
||||||
|
|
||||||
def export_cb(path: Path, match: Match
|
def export_cb(path: Path, match: Match
|
||||||
) -> typing.Iterable[str]:
|
) -> typing.Iterable[str]:
|
||||||
assert isinstance(path, DomainPath)
|
|
||||||
if not isinstance(path, HostnamePath):
|
|
||||||
return
|
|
||||||
if first_party_only and not match.first_party:
|
if first_party_only and not match.first_party:
|
||||||
return
|
return
|
||||||
if end_chain_only and match.references > 0:
|
if end_chain_only and match.references > 0:
|
||||||
return
|
return
|
||||||
if no_dupplicates and match.dupplicate:
|
if no_dupplicates and match.dupplicate:
|
||||||
return
|
return
|
||||||
|
if rules_only and match.level > 1:
|
||||||
|
return
|
||||||
|
if hostnames_only and not isinstance(path, HostnamePath):
|
||||||
|
return
|
||||||
|
|
||||||
if explain:
|
if explain:
|
||||||
yield self.explain(path)
|
yield self.explain(path)
|
||||||
else:
|
else:
|
||||||
yield self.unpack_domain(path)
|
yield str(path)
|
||||||
|
|
||||||
yield from self.exec_each_domain(export_cb)
|
yield from self.exec_each(export_cb)
|
||||||
|
|
||||||
def list_rules(self,
|
|
||||||
first_party_only: bool = False,
|
|
||||||
) -> typing.Iterable[str]:
|
|
||||||
|
|
||||||
def list_rules_cb(path: Path, match: Match
|
|
||||||
) -> typing.Iterable[str]:
|
|
||||||
if first_party_only and not match.first_party:
|
|
||||||
return
|
|
||||||
if isinstance(path, ZonePath) \
|
|
||||||
or (isinstance(path, Ip4Path) and path.prefixlen < 32):
|
|
||||||
# if match.level == 1:
|
|
||||||
# It should be the latter condition but it is more
|
|
||||||
# useful when using the former
|
|
||||||
yield self.explain(path)
|
|
||||||
|
|
||||||
yield from self.exec_each(list_rules_cb)
|
|
||||||
|
|
||||||
def count_records(self,
|
def count_records(self,
|
||||||
first_party_only: bool = False,
|
first_party_only: bool = False,
|
||||||
rules_only: bool = False,
|
end_chain_only: bool = False,
|
||||||
no_dupplicates: bool = False,
|
no_dupplicates: bool = False,
|
||||||
|
rules_only: bool = False,
|
||||||
|
hostnames_only: bool = False,
|
||||||
) -> str:
|
) -> str:
|
||||||
memo: typing.Dict[str, int] = dict()
|
memo: typing.Dict[str, int] = dict()
|
||||||
|
|
||||||
def count_records_cb(path: Path, match: Match) -> None:
|
def count_records_cb(path: Path, match: Match) -> None:
|
||||||
if first_party_only and not match.first_party:
|
if first_party_only and not match.first_party:
|
||||||
return
|
return
|
||||||
if rules_only and match.level > 1:
|
if end_chain_only and match.references > 0:
|
||||||
return
|
return
|
||||||
if no_dupplicates and match.dupplicate:
|
if no_dupplicates and match.dupplicate:
|
||||||
return
|
return
|
||||||
|
if rules_only and match.level > 1:
|
||||||
|
return
|
||||||
|
if hostnames_only and not isinstance(path, HostnamePath):
|
||||||
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
memo[path.__class__.__name__] += 1
|
memo[path.__class__.__name__] += 1
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -472,9 +548,10 @@ class Database(Profiler):
|
||||||
|
|
||||||
for _ in self.exec_each(count_records_cb):
|
for _ in self.exec_each(count_records_cb):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
split: typing.List[str] = list()
|
split: typing.List[str] = list()
|
||||||
for key, value in sorted(memo.items(), key=lambda s: s[0]):
|
for key, value in sorted(memo.items(), key=lambda s: s[0]):
|
||||||
split.append(f'{key[:-4]}: {value}')
|
split.append(f'{key[:-4].lower()}s: {value}')
|
||||||
return ', '.join(split)
|
return ', '.join(split)
|
||||||
|
|
||||||
def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:
|
def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:
|
||||||
|
@ -502,6 +579,9 @@ class Database(Profiler):
|
||||||
def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]:
|
def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]:
|
||||||
self.enter_step('get_ip4_pack')
|
self.enter_step('get_ip4_pack')
|
||||||
ip4 = self.pack_ip4address(ip4_str)
|
ip4 = self.pack_ip4address(ip4_str)
|
||||||
|
self.enter_step('get_ip4_cache')
|
||||||
|
if not self.ip4cache[ip4.value >> self.ip4cache_shift]:
|
||||||
|
return
|
||||||
self.enter_step('get_ip4_brws')
|
self.enter_step('get_ip4_brws')
|
||||||
dic = self.ip4tree
|
dic = self.ip4tree
|
||||||
for i in range(31, 31-ip4.prefixlen, -1):
|
for i in range(31, 31-ip4.prefixlen, -1):
|
||||||
|
@ -549,6 +629,9 @@ class Database(Profiler):
|
||||||
domain_str: str,
|
domain_str: str,
|
||||||
updated: int,
|
updated: int,
|
||||||
source: Path) -> None:
|
source: Path) -> None:
|
||||||
|
self.enter_step('set_domain_val')
|
||||||
|
if not Database.validate_domain(domain_str):
|
||||||
|
raise ValueError(f"Invalid domain: {domain_str}")
|
||||||
self.enter_step('set_domain_pack')
|
self.enter_step('set_domain_pack')
|
||||||
domain = self.pack_domain(domain_str)
|
domain = self.pack_domain(domain_str)
|
||||||
self.enter_step('set_domain_fp')
|
self.enter_step('set_domain_fp')
|
||||||
|
@ -631,11 +714,15 @@ class Database(Profiler):
|
||||||
source_match=source_match,
|
source_match=source_match,
|
||||||
dupplicate=dupplicate,
|
dupplicate=dupplicate,
|
||||||
)
|
)
|
||||||
|
self._set_ip4cache(ip4, dic)
|
||||||
|
|
||||||
def set_ip4address(self,
|
def set_ip4address(self,
|
||||||
ip4address_str: str,
|
ip4address_str: str,
|
||||||
*args: typing.Any, **kwargs: typing.Any
|
*args: typing.Any, **kwargs: typing.Any
|
||||||
) -> None:
|
) -> None:
|
||||||
|
self.enter_step('set_ip4add_val')
|
||||||
|
if not Database.validate_ip4address(ip4address_str):
|
||||||
|
raise ValueError(f"Invalid ip4address: {ip4address_str}")
|
||||||
self.enter_step('set_ip4add_pack')
|
self.enter_step('set_ip4add_pack')
|
||||||
ip4 = self.pack_ip4address(ip4address_str)
|
ip4 = self.pack_ip4address(ip4address_str)
|
||||||
self._set_ip4(ip4, *args, **kwargs)
|
self._set_ip4(ip4, *args, **kwargs)
|
||||||
|
@ -644,6 +731,9 @@ class Database(Profiler):
|
||||||
ip4network_str: str,
|
ip4network_str: str,
|
||||||
*args: typing.Any, **kwargs: typing.Any
|
*args: typing.Any, **kwargs: typing.Any
|
||||||
) -> None:
|
) -> None:
|
||||||
|
self.enter_step('set_ip4net_val')
|
||||||
|
if not Database.validate_ip4network(ip4network_str):
|
||||||
|
raise ValueError(f"Invalid ip4network: {ip4network_str}")
|
||||||
self.enter_step('set_ip4net_pack')
|
self.enter_step('set_ip4net_pack')
|
||||||
ip4 = self.pack_ip4network(ip4network_str)
|
ip4 = self.pack_ip4network(ip4network_str)
|
||||||
self._set_ip4(ip4, *args, **kwargs)
|
self._set_ip4(ip4, *args, **kwargs)
|
||||||
|
|
8
db.py
8
db.py
|
@ -18,14 +18,16 @@ if __name__ == '__main__':
|
||||||
help="Remove old entries from database")
|
help="Remove old entries from database")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-b', '--prune-base', action='store_true',
|
'-b', '--prune-base', action='store_true',
|
||||||
help="TODO")
|
help="With --prune, only prune base rules "
|
||||||
|
"(the ones added by ./feed_rules.py)")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-s', '--prune-before', type=int,
|
'-s', '--prune-before', type=int,
|
||||||
default=(int(time.time()) - 60*60*24*31*6),
|
default=(int(time.time()) - 60*60*24*31*6),
|
||||||
help="TODO")
|
help="With --prune, only rules updated before "
|
||||||
|
"this UNIX timestamp will be deleted")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-r', '--references', action='store_true',
|
'-r', '--references', action='store_true',
|
||||||
help="Update the reference count")
|
help="DEBUG: Update the reference count")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if not args.initialize:
|
if not args.initialize:
|
||||||
|
|
48
export.py
48
export.py
|
@ -9,46 +9,56 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
# Parsing arguments
|
# Parsing arguments
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="TODO")
|
description="Export the hostnames rules stored "
|
||||||
|
"in the Database as plain text")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
|
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
|
||||||
help="TODO")
|
help="Output file, one rule per line")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-f', '--first-party', action='store_true',
|
'-f', '--first-party', action='store_true',
|
||||||
help="TODO")
|
help="Only output rules issued from first-party sources")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-e', '--end-chain', action='store_true',
|
'-e', '--end-chain', action='store_true',
|
||||||
help="TODO")
|
help="Only output rules that are not referenced by any other")
|
||||||
parser.add_argument(
|
|
||||||
'-x', '--explain', action='store_true',
|
|
||||||
help="TODO")
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-r', '--rules', action='store_true',
|
'-r', '--rules', action='store_true',
|
||||||
help="TODO")
|
help="Output all kinds of rules, not just hostnames")
|
||||||
|
parser.add_argument(
|
||||||
|
'-b', '--base-rules', action='store_true',
|
||||||
|
help="Output base rules "
|
||||||
|
"(the ones added by ./feed_rules.py) "
|
||||||
|
"(implies --rules)")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-d', '--no-dupplicates', action='store_true',
|
'-d', '--no-dupplicates', action='store_true',
|
||||||
help="TODO")
|
help="Do not output rules that already match a zone/network rule "
|
||||||
|
"(e.g. dummy.example.com when there's a zone example.com rule)")
|
||||||
|
parser.add_argument(
|
||||||
|
'-x', '--explain', action='store_true',
|
||||||
|
help="Show the chain of rules leading to one "
|
||||||
|
"(and the number of references they have)")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-c', '--count', action='store_true',
|
'-c', '--count', action='store_true',
|
||||||
help="TODO")
|
help="Show the number of rules per type instead of listing them")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
DB = database.Database()
|
DB = database.Database()
|
||||||
|
|
||||||
if args.count:
|
if args.count:
|
||||||
|
assert not args.explain
|
||||||
print(DB.count_records(
|
print(DB.count_records(
|
||||||
first_party_only=args.first_party,
|
|
||||||
rules_only=args.rules,
|
|
||||||
no_dupplicates=args.no_dupplicates,
|
|
||||||
))
|
|
||||||
else:
|
|
||||||
if args.rules:
|
|
||||||
for line in DB.list_rules():
|
|
||||||
print(line)
|
|
||||||
for domain in DB.export(
|
|
||||||
first_party_only=args.first_party,
|
first_party_only=args.first_party,
|
||||||
end_chain_only=args.end_chain,
|
end_chain_only=args.end_chain,
|
||||||
no_dupplicates=args.no_dupplicates,
|
no_dupplicates=args.no_dupplicates,
|
||||||
|
rules_only=args.base_rules,
|
||||||
|
hostnames_only=not (args.rules or args.base_rules),
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
for domain in DB.list_records(
|
||||||
|
first_party_only=args.first_party,
|
||||||
|
end_chain_only=args.end_chain,
|
||||||
|
no_dupplicates=args.no_dupplicates,
|
||||||
|
rules_only=args.base_rules,
|
||||||
|
hostnames_only=not (args.rules or args.base_rules),
|
||||||
explain=args.explain,
|
explain=args.explain,
|
||||||
):
|
):
|
||||||
print(domain, file=args.output)
|
print(domain, file=args.output)
|
||||||
|
|
147
export_lists.sh
147
export_lists.sh
|
@ -4,69 +4,94 @@ function log() {
|
||||||
echo -e "\033[33m$@\033[0m"
|
echo -e "\033[33m$@\033[0m"
|
||||||
}
|
}
|
||||||
|
|
||||||
log "Exporting lists…"
|
log "Calculating statistics…"
|
||||||
./export.py --first-party --output dist/firstparty-trackers.txt
|
gen_date=$(date -Isec)
|
||||||
./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt
|
gen_software=$(git describe --tags)
|
||||||
./export.py --output dist/multiparty-trackers.txt
|
number_websites=$(wc -l < temp/all_websites.list)
|
||||||
./export.py --end-chain --output --no-dupplicates dist/multiparty-only-trackers.txt
|
number_subdomains=$(wc -l < temp/all_subdomains.list)
|
||||||
|
number_dns=$(grep '^$' temp/all_resolved.txt | wc -l)
|
||||||
|
|
||||||
log "Generating statistics…"
|
for partyness in {first,multi}
|
||||||
./export.py --count --first-party > temp/count_recs_firstparty.txt
|
do
|
||||||
./export.py --count > temp/count_recs_multiparty.txt
|
if [ $partyness = "first" ]
|
||||||
./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
|
then
|
||||||
./export.py --rules --count > temp/count_rules_multiparty.txt
|
partyness_flags="--first-party"
|
||||||
|
else
|
||||||
|
partyness_flags=""
|
||||||
|
fi
|
||||||
|
|
||||||
log "Sorting lists…"
|
echo "Statistics for ${partyness}-party trackers"
|
||||||
sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt
|
echo "Input rules: $(./export.py --count --base-rules $partyness_flags)"
|
||||||
sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt
|
echo "Subsequent rules: $(./export.py --count --rules $partyness_flags)"
|
||||||
sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt
|
echo "Subsequent rules (no dupplicate): $(./export.py --count --rules --no-dupplicates $partyness_flags)"
|
||||||
sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt
|
echo "Output hostnames: $(./export.py --count $partyness_flags)"
|
||||||
|
echo "Output hostnames (no dupplicate): $(./export.py --count --no-dupplicates $partyness_flags)"
|
||||||
|
echo "Output hostnames (end-chain only): $(./export.py --count --end-chain $partyness_flags)"
|
||||||
|
echo "Output hostnames (no dupplicate, end-chain only): $(./export.py --count --no-dupplicates --end-chain $partyness_flags)"
|
||||||
|
echo
|
||||||
|
|
||||||
log "Generating hosts lists…"
|
for trackerness in {trackers,only-trackers}
|
||||||
function generate_hosts {
|
do
|
||||||
basename="$1"
|
if [ $trackerness = "trackers" ]
|
||||||
description="$2"
|
then
|
||||||
description2="$3"
|
trackerness_flags=""
|
||||||
|
else
|
||||||
|
trackerness_flags="--end-chain --no-dupplicates"
|
||||||
|
fi
|
||||||
|
file_list="dist/${partyness}party-${trackerness}.txt"
|
||||||
|
file_host="dist/${partyness}party-${trackerness}-hosts.txt"
|
||||||
|
|
||||||
(
|
log "Generating lists for variant ${partyness}-party ${trackerness}…"
|
||||||
echo "# First-party trackers host list"
|
|
||||||
echo "# $description"
|
|
||||||
echo "# $description2"
|
|
||||||
echo "#"
|
|
||||||
echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
|
|
||||||
echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
|
|
||||||
echo "#"
|
|
||||||
echo "# In case of false positives/negatives, or any other question,"
|
|
||||||
echo "# contact me the way you like: https://geoffrey.frogeye.fr"
|
|
||||||
echo "#"
|
|
||||||
echo "# Latest version:"
|
|
||||||
echo "# - First-party trackers : https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
|
|
||||||
echo "# - … excluding redirected: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
|
|
||||||
echo "# - First and third party : https://hostfiles.frogeye.fr/multiparty-trackers-hosts.txt"
|
|
||||||
echo "# - … excluding redirected: https://hostfiles.frogeye.fr/multiparty-only-trackers-hosts.txt"
|
|
||||||
echo '# (you can remove `-hosts` to get the raw list)'
|
|
||||||
echo "#"
|
|
||||||
echo "# Generation date: $(date -Isec)"
|
|
||||||
echo "# Generation software: eulaurarien $(git describe --tags)"
|
|
||||||
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
|
|
||||||
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
|
|
||||||
echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO
|
|
||||||
echo "#"
|
|
||||||
echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
|
|
||||||
echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)"
|
|
||||||
echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
|
|
||||||
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
|
|
||||||
echo "#"
|
|
||||||
echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
|
|
||||||
echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)"
|
|
||||||
echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
|
|
||||||
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
|
|
||||||
echo
|
|
||||||
sed 's|^|0.0.0.0 |' "dist/$basename.txt"
|
|
||||||
) > "dist/$basename-hosts.txt"
|
|
||||||
}
|
|
||||||
|
|
||||||
generate_hosts "firstparty-trackers" "Generated from a curated list of first-party trackers" ""
|
# Real export heeere
|
||||||
generate_hosts "firstparty-only-trackers" "Generated from a curated list of first-party trackers" "Only contain the first chain of redirection."
|
./export.py $partyness_flags $trackerness_flags > $file_list
|
||||||
generate_hosts "multiparty-trackers" "Generated from known third-party trackers." "Also contains trackers used as third-party."
|
# Sometimes a bit heavy to have the DB open and sort the output
|
||||||
generate_hosts "multiparty-only-trackers" "Generated from known third-party trackers." "Do not contain trackers used in third-party. Use in combination with third-party lists."
|
# so this is done in two steps
|
||||||
|
sort -u $file_list -o $file_list
|
||||||
|
|
||||||
|
rules_input=$(./export.py --count --base-rules $partyness_flags)
|
||||||
|
rules_found=$(./export.py --count --rules $partyness_flags)
|
||||||
|
rules_output=$(./export.py --count $partyness_flags $trackerness_flags)
|
||||||
|
|
||||||
|
function link() { # link partyness, link trackerness
|
||||||
|
url="https://hostfiles.frogeye.fr/${partyness}party-${trackerness}-hosts.txt"
|
||||||
|
if [ "$1" = "$partyness" ] && [ "$2" = "$trackerness" ]
|
||||||
|
then
|
||||||
|
url="$url (this one)"
|
||||||
|
fi
|
||||||
|
echo $url
|
||||||
|
}
|
||||||
|
|
||||||
|
(
|
||||||
|
echo "# First-party trackers host list"
|
||||||
|
echo "# Variant: ${partyness}-party ${trackerness}"
|
||||||
|
echo "#"
|
||||||
|
echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
|
||||||
|
echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
|
||||||
|
echo "#"
|
||||||
|
echo "# In case of false positives/negatives, or any other question,"
|
||||||
|
echo "# contact me the way you like: https://geoffrey.frogeye.fr"
|
||||||
|
echo "#"
|
||||||
|
echo "# Latest versions:"
|
||||||
|
echo "# - First-party trackers : $(link first trackers)"
|
||||||
|
echo "# - … excluding redirected: $(link first only-trackers)"
|
||||||
|
echo "# - First and third party : $(link multi trackers)"
|
||||||
|
echo "# - … excluding redirected: $(link multi only-trackers)"
|
||||||
|
echo '# (you can remove `-hosts` to get the raw list)'
|
||||||
|
echo "#"
|
||||||
|
echo "# Generation date: $gen_date"
|
||||||
|
echo "# Generation software: eulaurarien $gen_software"
|
||||||
|
echo "# Number of source websites: $number_websites"
|
||||||
|
echo "# Number of source subdomains: $number_subdomains"
|
||||||
|
echo "# Number of source DNS records: ~2E9 + $number_dns"
|
||||||
|
echo "#"
|
||||||
|
echo "# Input rules: $rules_input"
|
||||||
|
echo "# Subsequent rules: $rules_found"
|
||||||
|
echo "# Output rules: $rules_output"
|
||||||
|
echo "#"
|
||||||
|
echo
|
||||||
|
sed 's|^|0.0.0.0 |' "$file_list"
|
||||||
|
) > "$file_host"
|
||||||
|
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
|
@ -36,7 +36,7 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
# Parsing arguments
|
# Parsing arguments
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="TODO")
|
description="Add the IP ranges associated to the AS in the database")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
DB = database.Database()
|
DB = database.Database()
|
||||||
|
|
62
feed_dns.py
62
feed_dns.py
|
@ -6,7 +6,7 @@ import logging
|
||||||
import sys
|
import sys
|
||||||
import typing
|
import typing
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import enum
|
import time
|
||||||
|
|
||||||
Record = typing.Tuple[typing.Callable, typing.Callable, int, str, str]
|
Record = typing.Tuple[typing.Callable, typing.Callable, int, str, str]
|
||||||
|
|
||||||
|
@ -30,14 +30,23 @@ FUNCTION_MAP: typing.Any = {
|
||||||
class Writer(multiprocessing.Process):
|
class Writer(multiprocessing.Process):
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
recs_queue: multiprocessing.Queue,
|
recs_queue: multiprocessing.Queue,
|
||||||
index: int = 0):
|
autosave_interval: int = 0,
|
||||||
|
ip4_cache: int = 0,
|
||||||
|
):
|
||||||
super(Writer, self).__init__()
|
super(Writer, self).__init__()
|
||||||
self.log = logging.getLogger(f'wr')
|
self.log = logging.getLogger(f'wr')
|
||||||
self.recs_queue = recs_queue
|
self.recs_queue = recs_queue
|
||||||
|
self.autosave_interval = autosave_interval
|
||||||
|
self.ip4_cache = ip4_cache
|
||||||
|
|
||||||
def run(self) -> None:
|
def run(self) -> None:
|
||||||
self.db = database.Database()
|
self.db = database.Database()
|
||||||
self.db.log = logging.getLogger(f'wr')
|
self.db.log = logging.getLogger(f'wr')
|
||||||
|
self.db.fill_ip4cache(max_size=self.ip4_cache)
|
||||||
|
if self.autosave_interval > 0:
|
||||||
|
next_save = time.time() + self.autosave_interval
|
||||||
|
else:
|
||||||
|
next_save = 0
|
||||||
|
|
||||||
self.db.enter_step('block_wait')
|
self.db.enter_step('block_wait')
|
||||||
block: typing.List[Record]
|
block: typing.List[Record]
|
||||||
|
@ -55,6 +64,12 @@ class Writer(multiprocessing.Process):
|
||||||
except ValueError:
|
except ValueError:
|
||||||
self.log.exception("Cannot execute: %s", record)
|
self.log.exception("Cannot execute: %s", record)
|
||||||
|
|
||||||
|
if next_save > 0 and time.time() > next_save:
|
||||||
|
self.log.info("Saving database...")
|
||||||
|
self.db.save()
|
||||||
|
self.log.info("Done!")
|
||||||
|
next_save = time.time() + self.autosave_interval
|
||||||
|
|
||||||
self.db.enter_step('block_wait')
|
self.db.enter_step('block_wait')
|
||||||
|
|
||||||
self.db.enter_step('end')
|
self.db.enter_step('end')
|
||||||
|
@ -119,8 +134,8 @@ class Rapid7Parser(Parser):
|
||||||
self.register(record)
|
self.register(record)
|
||||||
|
|
||||||
|
|
||||||
class DnsMassParser(Parser):
|
class MassDnsParser(Parser):
|
||||||
# dnsmass --output Snrql
|
# massdns --output Snrql
|
||||||
# --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
|
# --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
|
||||||
TYPES = {
|
TYPES = {
|
||||||
'A': (FUNCTION_MAP['a'][0], FUNCTION_MAP['a'][1], -1, None),
|
'A': (FUNCTION_MAP['a'][0], FUNCTION_MAP['a'][1], -1, None),
|
||||||
|
@ -129,7 +144,7 @@ class DnsMassParser(Parser):
|
||||||
}
|
}
|
||||||
|
|
||||||
def consume(self) -> None:
|
def consume(self) -> None:
|
||||||
self.prof.enter_step('parse_dnsmass')
|
self.prof.enter_step('parse_massdns')
|
||||||
timestamp = 0
|
timestamp = 0
|
||||||
header = True
|
header = True
|
||||||
for line in self.buf:
|
for line in self.buf:
|
||||||
|
@ -145,7 +160,7 @@ class DnsMassParser(Parser):
|
||||||
header = False
|
header = False
|
||||||
else:
|
else:
|
||||||
select, write, name_offset, value_offset = \
|
select, write, name_offset, value_offset = \
|
||||||
DnsMassParser.TYPES[split[1]]
|
MassDnsParser.TYPES[split[1]]
|
||||||
record = (
|
record = (
|
||||||
select,
|
select,
|
||||||
write,
|
write,
|
||||||
|
@ -154,14 +169,14 @@ class DnsMassParser(Parser):
|
||||||
split[2][:value_offset],
|
split[2][:value_offset],
|
||||||
)
|
)
|
||||||
self.register(record)
|
self.register(record)
|
||||||
self.prof.enter_step('parse_dnsmass')
|
self.prof.enter_step('parse_massdns')
|
||||||
except KeyError:
|
except KeyError:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
||||||
PARSERS = {
|
PARSERS = {
|
||||||
'rapid7': Rapid7Parser,
|
'rapid7': Rapid7Parser,
|
||||||
'dnsmass': DnsMassParser,
|
'massdns': MassDnsParser,
|
||||||
}
|
}
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -169,29 +184,40 @@ if __name__ == '__main__':
|
||||||
# Parsing arguments
|
# Parsing arguments
|
||||||
log = logging.getLogger('feed_dns')
|
log = logging.getLogger('feed_dns')
|
||||||
args_parser = argparse.ArgumentParser(
|
args_parser = argparse.ArgumentParser(
|
||||||
description="TODO")
|
description="Read DNS records and import "
|
||||||
|
"tracking-relevant data into the database")
|
||||||
args_parser.add_argument(
|
args_parser.add_argument(
|
||||||
'parser',
|
'parser',
|
||||||
choices=PARSERS.keys(),
|
choices=PARSERS.keys(),
|
||||||
help="TODO")
|
help="Input format")
|
||||||
args_parser.add_argument(
|
args_parser.add_argument(
|
||||||
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
||||||
help="TODO")
|
help="Input file")
|
||||||
args_parser.add_argument(
|
|
||||||
'-j', '--workers', type=int, default=4,
|
|
||||||
help="TODO")
|
|
||||||
args_parser.add_argument(
|
args_parser.add_argument(
|
||||||
'-b', '--block-size', type=int, default=1024,
|
'-b', '--block-size', type=int, default=1024,
|
||||||
help="TODO")
|
help="Performance tuning value")
|
||||||
args_parser.add_argument(
|
args_parser.add_argument(
|
||||||
'-q', '--queue-size', type=int, default=128,
|
'-q', '--queue-size', type=int, default=128,
|
||||||
help="TODO")
|
help="Performance tuning value")
|
||||||
|
args_parser.add_argument(
|
||||||
|
'-a', '--autosave-interval', type=int, default=900,
|
||||||
|
help="Interval to which the database will save in seconds. "
|
||||||
|
"0 to disable.")
|
||||||
|
args_parser.add_argument(
|
||||||
|
'-4', '--ip4-cache', type=int, default=0,
|
||||||
|
help="RAM cache for faster IPv4 lookup. "
|
||||||
|
"Maximum useful value: 512 MiB (536870912). "
|
||||||
|
"Warning: Depending on the rules, this might already "
|
||||||
|
"be a memory-heavy process, even without the cache.")
|
||||||
args = args_parser.parse_args()
|
args = args_parser.parse_args()
|
||||||
|
|
||||||
recs_queue: multiprocessing.Queue = multiprocessing.Queue(
|
recs_queue: multiprocessing.Queue = multiprocessing.Queue(
|
||||||
maxsize=args.queue_size)
|
maxsize=args.queue_size)
|
||||||
|
|
||||||
writer = Writer(recs_queue)
|
writer = Writer(recs_queue,
|
||||||
|
autosave_interval=args.autosave_interval,
|
||||||
|
ip4_cache=args.ip4_cache
|
||||||
|
)
|
||||||
writer.start()
|
writer.start()
|
||||||
|
|
||||||
parser = PARSERS[args.parser](args.input, recs_queue, args.block_size)
|
parser = PARSERS[args.parser](args.input, recs_queue, args.block_size)
|
||||||
|
|
|
@ -7,22 +7,24 @@ import time
|
||||||
|
|
||||||
FUNCTION_MAP = {
|
FUNCTION_MAP = {
|
||||||
'zone': database.Database.set_zone,
|
'zone': database.Database.set_zone,
|
||||||
'ip4network': database.Database.set_ip4network,
|
'hostname': database.Database.set_hostname,
|
||||||
'asn': database.Database.set_asn,
|
'asn': database.Database.set_asn,
|
||||||
|
'ip4network': database.Database.set_ip4network,
|
||||||
|
'ip4address': database.Database.set_ip4address,
|
||||||
}
|
}
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
# Parsing arguments
|
# Parsing arguments
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="TODO")
|
description="Import base rules to the database")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'type',
|
'type',
|
||||||
choices=FUNCTION_MAP.keys(),
|
choices=FUNCTION_MAP.keys(),
|
||||||
help="Type of rule inputed")
|
help="Type of rule inputed")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
||||||
help="List of domains domains to block (with their subdomains)")
|
help="File with one rule per line")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-f', '--first-party', action='store_true',
|
'-f', '--first-party', action='store_true',
|
||||||
help="The input only comes from verified first-party sources")
|
help="The input only comes from verified first-party sources")
|
||||||
|
@ -39,10 +41,14 @@ if __name__ == '__main__':
|
||||||
source = database.RuleMultiPath()
|
source = database.RuleMultiPath()
|
||||||
|
|
||||||
for rule in args.input:
|
for rule in args.input:
|
||||||
fun(DB,
|
rule = rule.strip()
|
||||||
rule.strip(),
|
try:
|
||||||
source=source,
|
fun(DB,
|
||||||
updated=int(time.time()),
|
rule,
|
||||||
)
|
source=source,
|
||||||
|
updated=int(time.time()),
|
||||||
|
)
|
||||||
|
except ValueError:
|
||||||
|
DB.log.error(f"Could not add rule: {rule}")
|
||||||
|
|
||||||
DB.save()
|
DB.save()
|
||||||
|
|
|
@ -30,13 +30,12 @@ dl https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hos
|
||||||
# dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV.txt rules_hosts/smart-tv.cache.txt
|
# dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV.txt rules_hosts/smart-tv.cache.txt
|
||||||
# dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/AmazonFireTV.txt rules_hosts/amazon-fire-tv.cache.txt
|
# dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/AmazonFireTV.txt rules_hosts/amazon-fire-tv.cache.txt
|
||||||
|
|
||||||
|
log "Retrieving TLD list…"
|
||||||
|
dl http://data.iana.org/TLD/tlds-alpha-by-domain.txt temp/all_tld.temp.list
|
||||||
|
grep -v '^#' temp/all_tld.temp.list | awk '{print tolower($0)}' > temp/all_tld.list
|
||||||
|
|
||||||
log "Retrieving nameservers…"
|
log "Retrieving nameservers…"
|
||||||
rm -f nameservers
|
dl https://public-dns.info/nameservers.txt nameservers/public-dns.cache.list
|
||||||
touch nameservers
|
|
||||||
[ -f nameservers.head ] && cat nameservers.head >> nameservers
|
|
||||||
dl https://public-dns.info/nameservers.txt nameservers.temp
|
|
||||||
sort -R nameservers.temp >> nameservers
|
|
||||||
rm nameservers.temp
|
|
||||||
|
|
||||||
log "Retrieving top subdomains…"
|
log "Retrieving top subdomains…"
|
||||||
dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip
|
dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip
|
||||||
|
@ -51,4 +50,3 @@ then
|
||||||
else
|
else
|
||||||
mv temp/cisco-umbrella_popularity.fresh.list subdomains/cisco-umbrella_popularity.cache.list
|
mv temp/cisco-umbrella_popularity.fresh.list subdomains/cisco-umbrella_popularity.cache.list
|
||||||
fi
|
fi
|
||||||
dl https://www.orwell1984.today/cname/eulerian.net.txt subdomains/orwell-eulerian-cname-list.cache.list
|
|
||||||
|
|
|
@ -1,160 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
# pylint: disable=C0103
|
|
||||||
|
|
||||||
"""
|
|
||||||
From a list of subdomains, output only
|
|
||||||
the ones resolving to a first-party tracker.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import sys
|
|
||||||
import progressbar
|
|
||||||
import csv
|
|
||||||
import typing
|
|
||||||
import ipaddress
|
|
||||||
|
|
||||||
# DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']]
|
|
||||||
DomainRule = typing.Union[bool, typing.Dict]
|
|
||||||
# IpRule = typing.Union[bool, typing.Dict[int, 'DomainRule']]
|
|
||||||
IpRule = typing.Union[bool, typing.Dict]
|
|
||||||
|
|
||||||
RULES_DICT: DomainRule = dict()
|
|
||||||
RULES_IP_DICT: IpRule = dict()
|
|
||||||
|
|
||||||
|
|
||||||
def get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]:
|
|
||||||
for char in address.packed:
|
|
||||||
for i in range(7, -1, -1):
|
|
||||||
yield (char >> i) & 0b1
|
|
||||||
|
|
||||||
|
|
||||||
def subdomain_matching(subdomain: str) -> bool:
|
|
||||||
parts = subdomain.split('.')
|
|
||||||
parts.reverse()
|
|
||||||
dic = RULES_DICT
|
|
||||||
for part in parts:
|
|
||||||
if isinstance(dic, bool) or part not in dic:
|
|
||||||
break
|
|
||||||
dic = dic[part]
|
|
||||||
if isinstance(dic, bool):
|
|
||||||
return dic
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def ip_matching(ip_str: str) -> bool:
|
|
||||||
ip = ipaddress.ip_address(ip_str)
|
|
||||||
dic = RULES_IP_DICT
|
|
||||||
i = 0
|
|
||||||
for bit in get_bits(ip):
|
|
||||||
i += 1
|
|
||||||
if isinstance(dic, bool) or bit not in dic:
|
|
||||||
break
|
|
||||||
dic = dic[bit]
|
|
||||||
if isinstance(dic, bool):
|
|
||||||
return dic
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def get_matching(chain: typing.List[str], no_explicit: bool = False
|
|
||||||
) -> typing.Iterable[str]:
|
|
||||||
if len(chain) <= 1:
|
|
||||||
return
|
|
||||||
initial = chain[0]
|
|
||||||
cname_destinations = chain[1:-1]
|
|
||||||
a_destination = chain[-1]
|
|
||||||
initial_matching = subdomain_matching(initial)
|
|
||||||
if no_explicit and initial_matching:
|
|
||||||
return
|
|
||||||
cname_matching = any(map(subdomain_matching, cname_destinations))
|
|
||||||
if cname_matching or initial_matching or ip_matching(a_destination):
|
|
||||||
yield initial
|
|
||||||
|
|
||||||
|
|
||||||
def register_rule(subdomain: str) -> None:
|
|
||||||
# Make a tree with domain parts
|
|
||||||
parts = subdomain.split('.')
|
|
||||||
parts.reverse()
|
|
||||||
dic = RULES_DICT
|
|
||||||
last_part = len(parts) - 1
|
|
||||||
for p, part in enumerate(parts):
|
|
||||||
if isinstance(dic, bool):
|
|
||||||
return
|
|
||||||
if p == last_part:
|
|
||||||
dic[part] = True
|
|
||||||
else:
|
|
||||||
dic.setdefault(part, dict())
|
|
||||||
dic = dic[part]
|
|
||||||
|
|
||||||
|
|
||||||
def register_rule_ip(network: str) -> None:
|
|
||||||
net = ipaddress.ip_network(network)
|
|
||||||
ip = net.network_address
|
|
||||||
dic = RULES_IP_DICT
|
|
||||||
last_bit = net.prefixlen - 1
|
|
||||||
for b, bit in enumerate(get_bits(ip)):
|
|
||||||
if isinstance(dic, bool):
|
|
||||||
return
|
|
||||||
if b == last_bit:
|
|
||||||
dic[bit] = True
|
|
||||||
else:
|
|
||||||
dic.setdefault(bit, dict())
|
|
||||||
dic = dic[bit]
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
|
|
||||||
# Parsing arguments
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Filter first-party trackers from a list of subdomains")
|
|
||||||
parser.add_argument(
|
|
||||||
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
|
||||||
help="Input file with DNS chains")
|
|
||||||
parser.add_argument(
|
|
||||||
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
|
|
||||||
help="Outptut file with one tracking subdomain per line")
|
|
||||||
parser.add_argument(
|
|
||||||
'-n', '--no-explicit', action='store_true',
|
|
||||||
help="Don't output domains already blocked with rules without CNAME")
|
|
||||||
parser.add_argument(
|
|
||||||
'-r', '--rules', type=argparse.FileType('r'),
|
|
||||||
help="List of domains domains to block (with their subdomains)")
|
|
||||||
parser.add_argument(
|
|
||||||
'-p', '--rules-ip', type=argparse.FileType('r'),
|
|
||||||
help="List of IPs ranges to block")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
# Progress bar
|
|
||||||
widgets = [
|
|
||||||
progressbar.Percentage(),
|
|
||||||
' ', progressbar.SimpleProgress(),
|
|
||||||
' ', progressbar.Bar(),
|
|
||||||
' ', progressbar.Timer(),
|
|
||||||
' ', progressbar.AdaptiveTransferSpeed(unit='req'),
|
|
||||||
' ', progressbar.AdaptiveETA(),
|
|
||||||
]
|
|
||||||
progress = progressbar.ProgressBar(widgets=widgets)
|
|
||||||
|
|
||||||
# Reading rules
|
|
||||||
if args.rules:
|
|
||||||
for rule in args.rules:
|
|
||||||
register_rule(rule.strip())
|
|
||||||
if args.rules_ip:
|
|
||||||
for rule in args.rules_ip:
|
|
||||||
register_rule_ip(rule.strip())
|
|
||||||
|
|
||||||
# Approximating line count
|
|
||||||
if args.input.seekable():
|
|
||||||
lines = 0
|
|
||||||
for line in args.input:
|
|
||||||
lines += 1
|
|
||||||
progress.max_value = lines
|
|
||||||
args.input.seek(0)
|
|
||||||
|
|
||||||
# Reading domains to filter
|
|
||||||
reader = csv.reader(args.input)
|
|
||||||
progress.start()
|
|
||||||
for chain in reader:
|
|
||||||
for match in get_matching(chain, no_explicit=args.no_explicit):
|
|
||||||
print(match, file=args.output)
|
|
||||||
progress.update(progress.value + 1)
|
|
||||||
progress.finish()
|
|
26
import_rapid7.sh
Executable file
26
import_rapid7.sh
Executable file
|
@ -0,0 +1,26 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
function log() {
|
||||||
|
echo -e "\033[33m$@\033[0m"
|
||||||
|
}
|
||||||
|
|
||||||
|
function feed_rapid7_fdns { # dataset
|
||||||
|
dataset=$1
|
||||||
|
line=$(curl -s https://opendata.rapid7.com/sonar.fdns_v2/ | grep "href=\".\+-fdns_$dataset.json.gz\"")
|
||||||
|
link="https://opendata.rapid7.com$(echo "$line" | cut -d'"' -f2)"
|
||||||
|
log "Reading $(echo "$dataset" | awk '{print toupper($0)}') records from $link"
|
||||||
|
curl -L "$link" | gunzip
|
||||||
|
}
|
||||||
|
|
||||||
|
function feed_rapid7_rdns {
|
||||||
|
dataset=$1
|
||||||
|
line=$(curl -s https://opendata.rapid7.com/sonar.rdns_v2/ | grep "href=\".\+-rdns.json.gz\"")
|
||||||
|
link="https://opendata.rapid7.com$(echo "$line" | cut -d'"' -f2)"
|
||||||
|
log "Reading PTR records from $link"
|
||||||
|
curl -L "$link" | gunzip
|
||||||
|
}
|
||||||
|
|
||||||
|
feed_rapid7_rdns | ./feed_dns.py rapid7
|
||||||
|
feed_rapid7_fdns a | ./feed_dns.py rapid7 --ip4-cache 536870912
|
||||||
|
# feed_rapid7_fdns aaaa | ./feed_dns.py rapid7 --ip6-cache 536870912
|
||||||
|
feed_rapid7_fdns cname | ./feed_dns.py rapid7
|
|
@ -18,5 +18,5 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as
|
||||||
|
|
||||||
./feed_asn.py
|
./feed_asn.py
|
||||||
|
|
||||||
log "Pruning old rules…"
|
# log "Pruning old rules…"
|
||||||
./db.py --prune --prune-before "$BEFORE" --prune-base
|
# ./db.py --prune --prune-before "$BEFORE" --prune-base
|
||||||
|
|
2
nameservers/.gitignore
vendored
Normal file
2
nameservers/.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
*.custom.list
|
||||||
|
*.cache.list
|
24
nameservers/popular.list
Normal file
24
nameservers/popular.list
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
8.8.8.8
|
||||||
|
8.8.4.4
|
||||||
|
2001:4860:4860:0:0:0:0:8888
|
||||||
|
2001:4860:4860:0:0:0:0:8844
|
||||||
|
208.67.222.222
|
||||||
|
208.67.220.220
|
||||||
|
2620:119:35::35
|
||||||
|
2620:119:53::53
|
||||||
|
4.2.2.1
|
||||||
|
4.2.2.2
|
||||||
|
8.26.56.26
|
||||||
|
8.20.247.20
|
||||||
|
84.200.69.80
|
||||||
|
84.200.70.40
|
||||||
|
2001:1608:10:25:0:0:1c04:b12f
|
||||||
|
2001:1608:10:25:0:0:9249:d69b
|
||||||
|
9.9.9.10
|
||||||
|
149.112.112.10
|
||||||
|
2620:fe::10
|
||||||
|
2620:fe::fe:10
|
||||||
|
1.1.1.1
|
||||||
|
1.0.0.1
|
||||||
|
2606:4700:4700::1111
|
||||||
|
2606:4700:4700::1001
|
|
@ -1,22 +0,0 @@
|
||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
function log() {
|
|
||||||
echo -e "\033[33m$@\033[0m"
|
|
||||||
}
|
|
||||||
|
|
||||||
./fetch_resources.sh
|
|
||||||
./import_rules.sh
|
|
||||||
|
|
||||||
# TODO Fetch 'em
|
|
||||||
log "Reading PTR records…"
|
|
||||||
pv ptr.json.gz | gunzip | ./feed_dns.py
|
|
||||||
log "Reading A records…"
|
|
||||||
pv a.json.gz | gunzip | ./feed_dns.py
|
|
||||||
log "Reading CNAME records…"
|
|
||||||
pv cname.json.gz | gunzip | ./feed_dns.py
|
|
||||||
|
|
||||||
log "Pruning old data…"
|
|
||||||
./database.py --prune
|
|
||||||
|
|
||||||
./filter_subdomains.sh
|
|
||||||
|
|
|
@ -4,9 +4,16 @@ function log() {
|
||||||
echo -e "\033[33m$@\033[0m"
|
echo -e "\033[33m$@\033[0m"
|
||||||
}
|
}
|
||||||
|
|
||||||
log "Compiling locally known subdomain…"
|
log "Compiling nameservers…"
|
||||||
# Sort by last character to utilize the DNS server caching mechanism
|
pv nameservers/*.list | ./validate_list.py --ip4 | sort -u > temp/all_nameservers_ip4.list
|
||||||
pv subdomains/*.list | sed 's/\r$//' | rev | sort -u | rev > temp/all_subdomains.list
|
|
||||||
log "Resolving locally known subdomain…"
|
|
||||||
pv temp/all_subdomains.list | ./resolve_subdomains.py --output temp/all_resolved.csv
|
|
||||||
|
|
||||||
|
log "Compiling subdomain…"
|
||||||
|
# Sort by last character to utilize the DNS server caching mechanism
|
||||||
|
# (not as efficient with massdns but it's almost free so why not)
|
||||||
|
pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list
|
||||||
|
|
||||||
|
log "Resolving subdomain…"
|
||||||
|
massdns --output Snrql --retry REFUSED,SERVFAIL --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list
|
||||||
|
|
||||||
|
log "Importing into database…"
|
||||||
|
pv temp/all_resolved.txt | ./feed_dns.py massdns
|
||||||
|
|
|
@ -18,7 +18,14 @@ omtrdc.net
|
||||||
online-metrix.net
|
online-metrix.net
|
||||||
# Webtrekk
|
# Webtrekk
|
||||||
wt-eu02.net
|
wt-eu02.net
|
||||||
|
webtrekk.net
|
||||||
# Otto Group
|
# Otto Group
|
||||||
oghub.io
|
oghub.io
|
||||||
# ???
|
# Intent.com
|
||||||
partner.intentmedia.net
|
partner.intentmedia.net
|
||||||
|
# Wizaly
|
||||||
|
wizaly.com
|
||||||
|
# Commanders Act
|
||||||
|
tagcommander.com
|
||||||
|
# Ingenious Technologies
|
||||||
|
affex.org
|
||||||
|
|
34
run_tests.py
Executable file
34
run_tests.py
Executable file
|
@ -0,0 +1,34 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import database
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import csv
|
||||||
|
|
||||||
|
TESTS_DIR = 'tests'
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
DB = database.Database()
|
||||||
|
log = logging.getLogger('tests')
|
||||||
|
|
||||||
|
for filename in os.listdir(TESTS_DIR):
|
||||||
|
log.info("")
|
||||||
|
log.info("Running tests from %s", filename)
|
||||||
|
path = os.path.join(TESTS_DIR, filename)
|
||||||
|
with open(path, 'rt') as fdesc:
|
||||||
|
reader = csv.DictReader(fdesc)
|
||||||
|
for test in reader:
|
||||||
|
log.info("Testing %s (%s)", test['url'], test['comment'])
|
||||||
|
|
||||||
|
for white in test['white'].split(':'):
|
||||||
|
if not white:
|
||||||
|
continue
|
||||||
|
if any(DB.get_domain(white)):
|
||||||
|
log.error("False positive: %s", white)
|
||||||
|
|
||||||
|
for black in test['black'].split(':'):
|
||||||
|
if not black:
|
||||||
|
continue
|
||||||
|
if not any(DB.get_domain(black)):
|
||||||
|
log.error("False negative: %s", black)
|
|
@ -1,6 +1,5 @@
|
||||||
url,white,black,comment
|
url,white,black,comment
|
||||||
https://support.apple.com,support.apple.com,,EdgeKey / AkamaiEdge
|
https://support.apple.com,support.apple.com,,EdgeKey / AkamaiEdge
|
||||||
https://www.pinterest.fr/,i.pinimg.com,,Cedexis
|
https://www.pinterest.fr/,i.pinimg.com,,Cedexis
|
||||||
https://www.pinterest.fr/,i.pinimg.com,,Cedexis
|
|
||||||
https://www.tumblr.com/,66.media.tumblr.com,,ChiCDN
|
https://www.tumblr.com/,66.media.tumblr.com,,ChiCDN
|
||||||
https://www.skype.com/fr/,www.skype.com,,TrafficManager
|
https://www.skype.com/fr/,www.skype.com,,TrafficManager
|
||||||
|
|
|
|
@ -5,3 +5,6 @@ https://www.discover.com/,,content.discover.com,ThreatMetrix
|
||||||
https://www.mytoys.de/,,web.mytoys.de,Webtrekk
|
https://www.mytoys.de/,,web.mytoys.de,Webtrekk
|
||||||
https://www.baur.de/,,tp.baur.de,Otto Group
|
https://www.baur.de/,,tp.baur.de,Otto Group
|
||||||
https://www.liligo.com/,,compare.liligo.com,???
|
https://www.liligo.com/,,compare.liligo.com,???
|
||||||
|
https://www.boulanger.com/,,tag.boulanger.fr,TagCommander
|
||||||
|
https://www.airfrance.fr/FR/,,tk.airfrance.fr,Wizaly
|
||||||
|
https://www.vsgamers.es/,,marketing.net.vsgamers.es,Affex
|
||||||
|
|
|
35
validate_list.py
Executable file
35
validate_list.py
Executable file
|
@ -0,0 +1,35 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# pylint: disable=C0103
|
||||||
|
|
||||||
|
"""
|
||||||
|
Filter out invalid domain names
|
||||||
|
"""
|
||||||
|
|
||||||
|
import database
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
# Parsing arguments
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Filter out invalid domain name/ip addresses from a list.")
|
||||||
|
parser.add_argument(
|
||||||
|
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
||||||
|
help="Input file, one element per line")
|
||||||
|
parser.add_argument(
|
||||||
|
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
|
||||||
|
help="Output file, one element per line")
|
||||||
|
parser.add_argument(
|
||||||
|
'-d', '--domain', action='store_true',
|
||||||
|
help="Can be domain name")
|
||||||
|
parser.add_argument(
|
||||||
|
'-4', '--ip4', action='store_true',
|
||||||
|
help="Can be IP4")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
for line in args.input:
|
||||||
|
line = line.strip()
|
||||||
|
if (args.domain and database.Database.validate_domain(line)) or \
|
||||||
|
(args.ip4 and database.Database.validate_ip4address(line)):
|
||||||
|
print(line, file=args.output)
|
Loading…
Reference in a new issue