Compare commits

No commits in common. "e882e09b376891bc80568895e39655e362750813" and "a0e68f08487e333c39b5056ed24eb925cb3ff3c5" have entirely different histories.

10 changed files with 293 additions and 420 deletions


@@ -26,8 +26,6 @@ That's where this scripts comes in, to generate a list of such subdomains.
 ## How does this script work
-> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.
 It takes an input a list of websites with trackers included.
 So far, this list is manually-generated from the list of clients of such first-party trackers
 (latter we should use a general list of websites to be more exhaustive).
@@ -40,8 +38,6 @@ It finally outputs the matching ones.
 ## Requirements
-> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.
 Just to build the list, you can find an already-built list in the releases.
 - Bash
@@ -58,8 +54,6 @@ Just to build the list, you can find an already-built list in the releases.
 ## Usage
-> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.
 This is only if you want to build the list yourself.
 If you just want to use the list, the latest build is available here: <https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt>
 It was build using additional sources not included in this repository for privacy reasons.
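
Since the README points readers at the prebuilt list, here is a minimal sketch (not part of the repository) of fetching that list and counting its entries; it only assumes the usual hosts-file convention of `#` comment lines.

```python
# Minimal sketch: download the published list and count non-comment entries.
import urllib.request

URL = 'https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt'

with urllib.request.urlopen(URL) as resp:
    lines = resp.read().decode('utf-8').splitlines()

entries = [line for line in lines if line and not line.startswith('#')]
print(f'{len(entries)} host entries')
```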


@ -26,76 +26,57 @@ class Path():
class RulePath(Path): class RulePath(Path):
def __str__(self) -> str: pass
return '(rule)'
class RuleFirstPath(RulePath):
def __str__(self) -> str:
return '(first-party rule)'
class RuleMultiPath(RulePath):
def __str__(self) -> str:
return '(multi-party rule)'
class DomainPath(Path): class DomainPath(Path):
def __init__(self, parts: typing.List[str]): def __init__(self, path: typing.List[str]):
self.parts = parts self.path = path
def __str__(self) -> str:
return '?.' + Database.unpack_domain(self)
class HostnamePath(DomainPath): class HostnamePath(DomainPath):
def __str__(self) -> str: pass
return Database.unpack_domain(self)
class ZonePath(DomainPath): class ZonePath(DomainPath):
def __str__(self) -> str: pass
return '*.' + Database.unpack_domain(self)
class AsnPath(Path): class AsnPath(Path):
def __init__(self, asn: Asn): def __init__(self, asn: Asn):
self.asn = asn self.asn = asn
def __str__(self) -> str:
return Database.unpack_asn(self)
class Ip4Path(Path): class Ip4Path(Path):
def __init__(self, value: int, prefixlen: int): def __init__(self, value: int, prefixlen: int):
self.value = value self.value = value
self.prefixlen = prefixlen self.prefixlen = prefixlen
def __str__(self) -> str:
return Database.unpack_ip4network(self)
class Match(): class Match():
def __init__(self) -> None: def __init__(self) -> None:
self.source: typing.Optional[Path] = None
self.updated: int = 0 self.updated: int = 0
self.dupplicate: bool = False
# Cache
self.level: int = 0 self.level: int = 0
self.first_party: bool = False self.source: Path = RulePath()
self.references: int = 0 # FP dupplicate args
def active(self, first_party: bool = None) -> bool: def set(self,
if self.updated == 0 or (first_party and not self.first_party): updated: int,
return False level: int,
return True source: Path,
) -> None:
if updated > self.updated or level > self.level:
self.updated = updated
self.level = level
self.source = source
# FP dupplicate function
def active(self) -> bool:
return self.updated > 0
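
On the right-hand side, updates go through `Match.set()` instead of the richer `_set_match` bookkeeping. A standalone sketch of that simplified class (with `Path`/`RulePath` stubbed in for brevity), showing how a stale call is ignored:

```python
class Path:
    pass

class RulePath(Path):
    pass

class Match:
    def __init__(self) -> None:
        self.updated: int = 0
        self.level: int = 0
        self.source: Path = RulePath()

    def set(self, updated: int, level: int, source: Path) -> None:
        # Only overwrite when the record has a newer timestamp or a higher level
        if updated > self.updated or level > self.level:
            self.updated = updated
            self.level = level
            self.source = source

    def active(self) -> bool:
        return self.updated > 0

m = Match()
assert not m.active()
m.set(100, 1, RulePath())
m.set(50, 0, RulePath())  # older timestamp, lower level: ignored
assert m.active() and m.updated == 100
```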
class AsnNode(Match): class AsnNode(Match):
def __init__(self) -> None: pass
Match.__init__(self)
self.name = ''
class DomainTreeNode(): class DomainTreeNode():
@ -105,16 +86,16 @@ class DomainTreeNode():
self.match_hostname = Match() self.match_hostname = Match()
class IpTreeNode(Match): class IpTreeNode():
def __init__(self) -> None: def __init__(self) -> None:
Match.__init__(self) self.children: typing.List[typing.Optional[IpTreeNode]] = [None, None]
self.zero: typing.Optional[IpTreeNode] = None self.match = Match()
self.one: typing.Optional[IpTreeNode] = None
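
The right-hand `IpTreeNode` replaces the named `zero`/`one` attributes with a two-slot `children` list indexed by the bit value. A self-contained sketch of that bit-indexed trie, walking from the most significant bit as the left-hand `_set_ip4`/`get_ip4` do (names here are illustrative, not the module's API):

```python
import typing

class Node:
    def __init__(self) -> None:
        self.children: typing.List[typing.Optional["Node"]] = [None, None]
        self.active = False  # stands in for Match.active()

def insert(root: Node, value: int, prefixlen: int) -> None:
    dic = root
    for i in range(31, 31 - prefixlen, -1):
        bit = (value >> i) & 0b1
        if dic.children[bit] is None:
            dic.children[bit] = Node()
        dic = dic.children[bit]
    dic.active = True

def matches(root: Node, value: int) -> bool:
    dic = root
    for i in range(31, -1, -1):
        if dic.active:
            return True
        nxt = dic.children[(value >> i) & 0b1]
        if nxt is None:
            return False
        dic = nxt
    return dic.active

root = Node()
insert(root, (93 << 24) | (184 << 16), 16)                         # 93.184.0.0/16
print(matches(root, (93 << 24) | (184 << 16) | (216 << 8) | 34))   # True
print(matches(root, (10 << 24) | 1))                               # False
```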
Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode] Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode]
MatchCallable = typing.Callable[[Path, NodeCallable = typing.Callable[[Path,
Match], Node,
typing.Optional[typing.Any]],
typing.Any] typing.Any]
@ -127,6 +108,7 @@ class Profiler():
self.step_dict: typing.Dict[str, int] = dict() self.step_dict: typing.Dict[str, int] = dict()
def enter_step(self, name: str) -> None: def enter_step(self, name: str) -> None:
return
now = time.perf_counter() now = time.perf_counter()
try: try:
self.time_dict[self.time_step] += now - self.time_last self.time_dict[self.time_step] += now - self.time_last
@ -149,21 +131,13 @@ class Profiler():
class Database(Profiler): class Database(Profiler):
VERSION = 18 VERSION = 10
PATH = "blocking.p" PATH = "blocking.p"
def initialize(self) -> None: def initialize(self) -> None:
self.log.warning( self.log.warning(
"Creating database version: %d ", "Creating database version: %d ",
Database.VERSION) Database.VERSION)
# Dummy match objects that everything refer to
self.rules: typing.List[Match] = list()
for first_party in (False, True):
m = Match()
m.updated = 1
m.level = 0
m.first_party = first_party
self.rules.append(m)
self.domtree = DomainTreeNode() self.domtree = DomainTreeNode()
self.asns: typing.Dict[Asn, AsnNode] = dict() self.asns: typing.Dict[Asn, AsnNode] = dict()
self.ip4tree = IpTreeNode() self.ip4tree = IpTreeNode()
@ -174,7 +148,7 @@ class Database(Profiler):
with open(self.PATH, 'rb') as db_fdsec: with open(self.PATH, 'rb') as db_fdsec:
version, data = pickle.load(db_fdsec) version, data = pickle.load(db_fdsec)
if version == Database.VERSION: if version == Database.VERSION:
self.rules, self.domtree, self.asns, self.ip4tree = data self.domtree, self.asns, self.ip4tree = data
return return
self.log.warning( self.log.warning(
"Outdated database version found: %d, " "Outdated database version found: %d, "
@ -191,7 +165,7 @@ class Database(Profiler):
def save(self) -> None: def save(self) -> None:
self.enter_step('save') self.enter_step('save')
with open(self.PATH, 'wb') as db_fdsec: with open(self.PATH, 'wb') as db_fdsec:
data = self.rules, self.domtree, self.asns, self.ip4tree data = self.domtree, self.asns, self.ip4tree
pickle.dump((self.VERSION, data), db_fdsec) pickle.dump((self.VERSION, data), db_fdsec)
self.profile() self.profile()
@ -206,7 +180,7 @@ class Database(Profiler):
@staticmethod @staticmethod
def unpack_domain(domain: DomainPath) -> str: def unpack_domain(domain: DomainPath) -> str:
return '.'.join(domain.parts[::-1]) return '.'.join(domain.path[::-1])
@staticmethod @staticmethod
def pack_asn(asn: str) -> AsnPath: def pack_asn(asn: str) -> AsnPath:
@ -255,227 +229,94 @@ class Database(Profiler):
addr >>= 8 addr >>= 8
return '.'.join(map(str, octets)) + '/' + str(network.prefixlen) return '.'.join(map(str, octets)) + '/' + str(network.prefixlen)
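
The tail of `unpack_ip4network` shown above peels octets off with `addr >>= 8`, which implies an `Ip4Path` stores the network as a single 32-bit integer with the first octet in the most significant byte. A round-trip sketch under that assumption (helper names are illustrative, not the module's):

```python
def pack(net_str: str):
    addr_str, prefixlen_str = net_str.split('/')
    value = 0
    for octet in addr_str.split('.'):
        value = (value << 8) | int(octet)
    return value, int(prefixlen_str)

def unpack(value: int, prefixlen: int) -> str:
    octets = []
    addr = value
    for _ in range(4):
        octets.append(addr & 0xFF)
        addr >>= 8
    return '.'.join(map(str, octets[::-1])) + '/' + str(prefixlen)

value, plen = pack('130.231.0.0/16')
assert value == 0x82E70000 and plen == 16
assert unpack(value, plen) == '130.231.0.0/16'
```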
def get_match(self, path: Path) -> Match:
if isinstance(path, RuleMultiPath):
return self.rules[0]
elif isinstance(path, RuleFirstPath):
return self.rules[1]
elif isinstance(path, AsnPath):
return self.asns[path.asn]
elif isinstance(path, DomainPath):
dicd = self.domtree
for part in path.parts:
dicd = dicd.children[part]
if isinstance(path, HostnamePath):
return dicd.match_hostname
elif isinstance(path, ZonePath):
return dicd.match_zone
else:
raise ValueError
elif isinstance(path, Ip4Path):
dici = self.ip4tree
for i in range(31, 31-path.prefixlen, -1):
bit = (path.value >> i) & 0b1
dici_next = dici.one if bit else dici.zero
if not dici_next:
raise IndexError
dici = dici_next
return dici
else:
raise ValueError
def exec_each_asn(self,
callback: MatchCallable,
) -> typing.Any:
for asn in self.asns:
match = self.asns[asn]
if match.active():
c = callback(
AsnPath(asn),
match,
)
try:
yield from c
except TypeError: # not iterable
pass
def exec_each_domain(self, def exec_each_domain(self,
callback: MatchCallable, callback: NodeCallable,
arg: typing.Any = None,
_dic: DomainTreeNode = None, _dic: DomainTreeNode = None,
_par: DomainPath = None, _par: DomainPath = None,
) -> typing.Any: ) -> typing.Any:
_dic = _dic or self.domtree _dic = _dic or self.domtree
_par = _par or DomainPath([]) _par = _par or DomainPath([])
if _dic.match_hostname.active(): yield from callback(_par, _dic, arg)
c = callback(
HostnamePath(_par.parts),
_dic.match_hostname,
)
try:
yield from c
except TypeError: # not iterable
pass
if _dic.match_zone.active():
c = callback(
ZonePath(_par.parts),
_dic.match_zone,
)
try:
yield from c
except TypeError: # not iterable
pass
for part in _dic.children: for part in _dic.children:
dic = _dic.children[part] dic = _dic.children[part]
yield from self.exec_each_domain( yield from self.exec_each_domain(
callback, callback,
arg,
_dic=dic, _dic=dic,
_par=DomainPath(_par.parts + [part]) _par=DomainPath(_par.path + [part])
) )
def exec_each_ip4(self, def exec_each_ip4(self,
callback: MatchCallable, callback: NodeCallable,
arg: typing.Any = None,
_dic: IpTreeNode = None, _dic: IpTreeNode = None,
_par: Ip4Path = None, _par: Ip4Path = None,
) -> typing.Any: ) -> typing.Any:
_dic = _dic or self.ip4tree _dic = _dic or self.ip4tree
_par = _par or Ip4Path(0, 0) _par = _par or Ip4Path(0, 0)
if _dic.active(): callback(_par, _dic, arg)
c = callback(
_par,
_dic,
)
try:
yield from c
except TypeError: # not iterable
pass
# 0 # 0
pref = _par.prefixlen + 1 dic = _dic.children[0]
dic = _dic.zero
if dic: if dic:
addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-pref))) addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-_par.prefixlen)))
assert addr0 == _par.value assert addr0 == _par.value
yield from self.exec_each_ip4( yield from self.exec_each_ip4(
callback, callback,
arg,
_dic=dic, _dic=dic,
_par=Ip4Path(addr0, pref) _par=Ip4Path(addr0, _par.prefixlen+1)
) )
# 1 # 1
dic = _dic.one dic = _dic.children[1]
if dic: if dic:
addr1 = _par.value | (1 << (32-pref)) addr1 = _par.value | (1 << (32-_par.prefixlen))
yield from self.exec_each_ip4( yield from self.exec_each_ip4(
callback, callback,
arg,
_dic=dic, _dic=dic,
_par=Ip4Path(addr1, pref) _par=Ip4Path(addr1, _par.prefixlen+1)
) )
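
A small worked example of the child-prefix arithmetic in `exec_each_ip4` above: extending a network `(value, prefixlen)` by one bit clears or sets bit `32 - (prefixlen + 1)` (function names are illustrative):

```python
def children_of(value: int, prefixlen: int):
    pref = prefixlen + 1
    addr0 = value & (0xFFFFFFFF ^ (1 << (32 - pref)))  # new bit cleared
    addr1 = value | (1 << (32 - pref))                 # new bit set
    return (addr0, pref), (addr1, pref)

def fmt(value: int, prefixlen: int) -> str:
    octets = [(value >> shift) & 0xFF for shift in (24, 16, 8, 0)]
    return '.'.join(map(str, octets)) + '/' + str(prefixlen)

# From the root 0.0.0.0/0, the two children are 0.0.0.0/1 and 128.0.0.0/1.
zero, one = children_of(0, 0)
print(fmt(*zero), fmt(*one))        # 0.0.0.0/1 128.0.0.0/1
print(fmt(*children_of(*one)[1]))   # 192.0.0.0/2
```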
def exec_each(self, def exec_each(self,
callback: MatchCallable, callback: NodeCallable,
arg: typing.Any = None,
) -> typing.Any: ) -> typing.Any:
yield from self.exec_each_domain(callback) yield from self.exec_each_domain(callback)
yield from self.exec_each_ip4(callback) yield from self.exec_each_ip4(callback)
yield from self.exec_each_asn(callback)
def update_references(self) -> None: def update_references(self) -> None:
# Should be correctly calculated normally, raise NotImplementedError
# keeping this just in case
def reset_references_cb(path: Path,
match: Match
) -> None:
match.references = 0
for _ in self.exec_each(reset_references_cb):
pass
def increment_references_cb(path: Path,
match: Match
) -> None:
if match.source:
source = self.get_match(match.source)
source.references += 1
for _ in self.exec_each(increment_references_cb):
pass
def prune(self, before: int, base_only: bool = False) -> None: def prune(self, before: int, base_only: bool = False) -> None:
raise NotImplementedError raise NotImplementedError
def explain(self, path: Path) -> str: def explain(self, entry: int) -> str:
match = self.get_match(path) raise NotImplementedError
if isinstance(match, AsnNode):
string = f'{path} ({match.name}) #{match.references}'
else:
string = f'{path} #{match.references}'
if match.source:
string += f'{self.explain(match.source)}'
return string
def export(self, def export(self,
first_party_only: bool = False, first_party_only: bool = False,
end_chain_only: bool = False, end_chain_only: bool = False,
no_dupplicates: bool = False,
explain: bool = False, explain: bool = False,
) -> typing.Iterable[str]: ) -> typing.Iterable[str]:
if first_party_only or end_chain_only or explain:
raise NotImplementedError
def export_cb(path: Path, match: Match def export_cb(path: Path, node: Node, _: typing.Any
) -> typing.Iterable[str]: ) -> typing.Iterable[str]:
assert isinstance(path, DomainPath) assert isinstance(path, DomainPath)
if not isinstance(path, HostnamePath): assert isinstance(node, DomainTreeNode)
return if node.match_hostname:
if first_party_only and not match.first_party: a = self.unpack_domain(path)
return yield a
if end_chain_only and match.references > 0:
return
if no_dupplicates and match.dupplicate:
return
if explain:
yield self.explain(path)
else:
yield self.unpack_domain(path)
yield from self.exec_each_domain(export_cb) yield from self.exec_each_domain(export_cb, None)
def list_rules(self, def count_rules(self,
first_party_only: bool = False, first_party_only: bool = False,
) -> typing.Iterable[str]:
def list_rules_cb(path: Path, match: Match
) -> typing.Iterable[str]:
if first_party_only and not match.first_party:
return
if isinstance(path, ZonePath) \
or (isinstance(path, Ip4Path) and path.prefixlen < 32):
# if match.level == 1:
# It should be the latter condition but it is more
# useful when using the former
yield self.explain(path)
yield from self.exec_each(list_rules_cb)
def count_records(self,
first_party_only: bool = False,
rules_only: bool = False,
no_dupplicates: bool = False,
) -> str: ) -> str:
memo: typing.Dict[str, int] = dict() raise NotImplementedError
def count_records_cb(path: Path, match: Match) -> None:
if first_party_only and not match.first_party:
return
if rules_only and match.level > 1:
return
if no_dupplicates and match.dupplicate:
return
try:
memo[path.__class__.__name__] += 1
except KeyError:
memo[path.__class__.__name__] = 1
for _ in self.exec_each(count_records_cb):
pass
split: typing.List[str] = list()
for key, value in sorted(memo.items(), key=lambda s: s[0]):
split.append(f'{key[:-4]}: {value}')
return ', '.join(split)
def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]: def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:
self.enter_step('get_domain_pack') self.enter_step('get_domain_pack')
@ -483,10 +324,10 @@ class Database(Profiler):
self.enter_step('get_domain_brws') self.enter_step('get_domain_brws')
dic = self.domtree dic = self.domtree
depth = 0 depth = 0
for part in domain.parts: for part in domain.path:
if dic.match_zone.active(): if dic.match_zone.active():
self.enter_step('get_domain_yield') self.enter_step('get_domain_yield')
yield ZonePath(domain.parts[:depth]) yield ZonePath(domain.path[:depth])
self.enter_step('get_domain_brws') self.enter_step('get_domain_brws')
if part not in dic.children: if part not in dic.children:
return return
@ -494,85 +335,61 @@ class Database(Profiler):
depth += 1 depth += 1
if dic.match_zone.active(): if dic.match_zone.active():
self.enter_step('get_domain_yield') self.enter_step('get_domain_yield')
yield ZonePath(domain.parts) yield ZonePath(domain.path)
if dic.match_hostname.active(): if dic.match_hostname.active():
self.enter_step('get_domain_yield') self.enter_step('get_domain_yield')
yield HostnamePath(domain.parts) yield HostnamePath(domain.path)
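
A self-contained sketch of the lookup pattern used by `get_domain` above: labels are reversed (`track.example.com` becomes `['com', 'example', 'track']`), the tree is walked label by label, every zone rule passed on the way is yielded, and an exact hostname rule only at the end. The classes and names below are simplified stand-ins, not the module's API:

```python
import typing

class DNode:
    def __init__(self) -> None:
        self.children: typing.Dict[str, "DNode"] = {}
        self.zone = False        # stands in for match_zone.active()
        self.hostname = False    # stands in for match_hostname.active()

def set_zone(root: DNode, zone: str) -> None:
    dic = root
    for part in zone.split('.')[::-1]:
        dic = dic.children.setdefault(part, DNode())
    dic.zone = True

def get_matches(root: DNode, domain: str) -> typing.Iterator[str]:
    parts = domain.split('.')[::-1]
    dic = root
    for depth, part in enumerate(parts):
        if dic.zone:
            yield '*.' + '.'.join(parts[:depth][::-1])
        if part not in dic.children:
            return
        dic = dic.children[part]
    if dic.zone:
        yield '*.' + domain
    if dic.hostname:
        yield domain

root = DNode()
set_zone(root, 'example.com')
print(list(get_matches(root, 'track.example.com')))  # ['*.example.com']
```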
def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]: def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]:
self.enter_step('get_ip4_pack') self.enter_step('get_ip4_pack')
ip4 = self.pack_ip4address(ip4_str) ip4 = self.pack_ip4address(ip4_str)
self.enter_step('get_ip4_brws') self.enter_step('get_ip4_brws')
dic = self.ip4tree dic = self.ip4tree
for i in range(31, 31-ip4.prefixlen, -1): for i in reversed(range(ip4.prefixlen)):
bit = (ip4.value >> i) & 0b1 part = (ip4.value >> i) & 0b1
if dic.active(): if dic.match.active():
self.enter_step('get_ip4_yield') self.enter_step('get_ip4_yield')
yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i) yield Ip4Path(ip4.value, 32-i)
self.enter_step('get_ip4_brws') self.enter_step('get_ip4_brws')
next_dic = dic.one if bit else dic.zero next_dic = dic.children[part]
if next_dic is None: if next_dic is None:
return return
dic = next_dic dic = next_dic
if dic.active(): if dic.match.active():
self.enter_step('get_ip4_yield') self.enter_step('get_ip4_yield')
yield ip4 yield ip4
def _set_match(self, def list_asn(self) -> typing.Iterable[AsnPath]:
match: Match, for asn in self.asns:
updated: int, yield AsnPath(asn)
source: Path,
source_match: Match = None,
dupplicate: bool = False,
) -> None:
# source_match is in parameters because most of the time
# its parent function needs it too,
# so it can pass it to save a traversal
source_match = source_match or self.get_match(source)
new_level = source_match.level + 1
if updated > match.updated or new_level < match.level \
or source_match.first_party > match.first_party:
# NOTE FP and level of matches referencing this one
# won't be updated until run or prune
if match.source:
old_source = self.get_match(match.source)
old_source.references -= 1
match.updated = updated
match.level = new_level
match.first_party = source_match.first_party
match.source = source
source_match.references += 1
match.dupplicate = dupplicate
def _set_domain(self, def _set_domain(self,
hostname: bool, hostname: bool,
domain_str: str, domain_str: str,
updated: int, updated: int,
source: Path) -> None: is_first_party: bool = None,
source: Path = None) -> None:
self.enter_step('set_domain_pack') self.enter_step('set_domain_pack')
if is_first_party:
raise NotImplementedError
domain = self.pack_domain(domain_str) domain = self.pack_domain(domain_str)
self.enter_step('set_domain_fp')
source_match = self.get_match(source)
is_first_party = source_match.first_party
self.enter_step('set_domain_brws') self.enter_step('set_domain_brws')
dic = self.domtree dic = self.domtree
dupplicate = False for part in domain.path:
for part in domain.parts: if dic.match_zone.active():
# Refuse to add domain whose zone is already matching
return
if part not in dic.children: if part not in dic.children:
dic.children[part] = DomainTreeNode() dic.children[part] = DomainTreeNode()
dic = dic.children[part] dic = dic.children[part]
if dic.match_zone.active(is_first_party):
dupplicate = True
if hostname: if hostname:
match = dic.match_hostname match = dic.match_hostname
else: else:
match = dic.match_zone match = dic.match_zone
self._set_match( match.set(
match,
updated, updated,
source, 0, # TODO Level
source_match=source_match, source or RulePath(),
dupplicate=dupplicate,
) )
def set_hostname(self, def set_hostname(self,
@ -588,48 +405,42 @@ class Database(Profiler):
def set_asn(self, def set_asn(self,
asn_str: str, asn_str: str,
updated: int, updated: int,
source: Path) -> None: is_first_party: bool = None,
source: Path = None) -> None:
self.enter_step('set_asn') self.enter_step('set_asn')
if is_first_party:
raise NotImplementedError
path = self.pack_asn(asn_str) path = self.pack_asn(asn_str)
if path.asn in self.asns:
match = self.asns[path.asn]
else:
match = AsnNode() match = AsnNode()
self.asns[path.asn] = match match.set(
self._set_match(
match,
updated, updated,
source, 0,
source or RulePath()
) )
self.asns[path.asn] = match
def _set_ip4(self, def _set_ip4(self,
ip4: Ip4Path, ip4: Ip4Path,
updated: int, updated: int,
source: Path) -> None: is_first_party: bool = None,
self.enter_step('set_ip4_fp') source: Path = None) -> None:
source_match = self.get_match(source) if is_first_party:
is_first_party = source_match.first_party raise NotImplementedError
self.enter_step('set_ip4_brws')
dic = self.ip4tree dic = self.ip4tree
dupplicate = False for i in reversed(range(ip4.prefixlen)):
for i in range(31, 31-ip4.prefixlen, -1): part = (ip4.value >> i) & 0b1
bit = (ip4.value >> i) & 0b1 if dic.match.active():
next_dic = dic.one if bit else dic.zero # Refuse to add ip4* whose network is already matching
return
next_dic = dic.children[part]
if next_dic is None: if next_dic is None:
next_dic = IpTreeNode() next_dic = IpTreeNode()
if bit: dic.children[part] = next_dic
dic.one = next_dic
else:
dic.zero = next_dic
dic = next_dic dic = next_dic
if dic.active(is_first_party): dic.match.set(
dupplicate = True
self._set_match(
dic,
updated, updated,
source, 0, # TODO Level
source_match=source_match, source or RulePath(),
dupplicate=dupplicate,
) )
def set_ip4address(self, def set_ip4address(self,
@ -638,6 +449,7 @@ class Database(Profiler):
) -> None: ) -> None:
self.enter_step('set_ip4add_pack') self.enter_step('set_ip4add_pack')
ip4 = self.pack_ip4address(ip4address_str) ip4 = self.pack_ip4address(ip4address_str)
self.enter_step('set_ip4add_brws')
self._set_ip4(ip4, *args, **kwargs) self._set_ip4(ip4, *args, **kwargs)
def set_ip4network(self, def set_ip4network(self,
@ -646,4 +458,5 @@ class Database(Profiler):
) -> None: ) -> None:
self.enter_step('set_ip4net_pack') self.enter_step('set_ip4net_pack')
ip4 = self.pack_ip4network(ip4network_str) ip4 = self.pack_ip4network(ip4network_str)
self.enter_step('set_ip4net_brws')
self._set_ip4(ip4, *args, **kwargs) self._set_ip4(ip4, *args, **kwargs)

db.py (deleted, 44 lines)

@ -1,44 +0,0 @@
#!/usr/bin/env python3
import argparse
import database
import time
import os
if __name__ == '__main__':
# Parsing arguments
parser = argparse.ArgumentParser(
description="Database operations")
parser.add_argument(
'-i', '--initialize', action='store_true',
help="Reconstruct the whole database")
parser.add_argument(
'-p', '--prune', action='store_true',
help="Remove old entries from database")
parser.add_argument(
'-b', '--prune-base', action='store_true',
help="TODO")
parser.add_argument(
'-s', '--prune-before', type=int,
default=(int(time.time()) - 60*60*24*31*6),
help="TODO")
parser.add_argument(
'-r', '--references', action='store_true',
help="Update the reference count")
args = parser.parse_args()
if not args.initialize:
DB = database.Database()
else:
if os.path.isfile(database.Database.PATH):
os.unlink(database.Database.PATH)
DB = database.Database()
DB.enter_step('main')
if args.prune:
DB.prune(before=args.prune_before, base_only=args.prune_base)
if args.references:
DB.update_references()
DB.save()


@ -25,9 +25,6 @@ if __name__ == '__main__':
parser.add_argument( parser.add_argument(
'-r', '--rules', action='store_true', '-r', '--rules', action='store_true',
help="TODO") help="TODO")
parser.add_argument(
'-d', '--no-dupplicates', action='store_true',
help="TODO")
parser.add_argument( parser.add_argument(
'-c', '--count', action='store_true', '-c', '--count', action='store_true',
help="TODO") help="TODO")
@ -35,20 +32,16 @@ if __name__ == '__main__':
DB = database.Database() DB = database.Database()
if args.count:
print(DB.count_records(
first_party_only=args.first_party,
rules_only=args.rules,
no_dupplicates=args.no_dupplicates,
))
else:
if args.rules: if args.rules:
for line in DB.list_rules(): if not args.count:
print(line) raise NotImplementedError
print(DB.count_rules(first_party_only=args.first_party))
else:
if args.count:
raise NotImplementedError
for domain in DB.export( for domain in DB.export(
first_party_only=args.first_party, first_party_only=args.first_party,
end_chain_only=args.end_chain, end_chain_only=args.end_chain,
no_dupplicates=args.no_dupplicates,
explain=args.explain, explain=args.explain,
): ):
print(domain, file=args.output) print(domain, file=args.output)


@ -21,15 +21,6 @@ def get_ranges(asn: str) -> typing.Iterable[str]:
yield pref['prefix'] yield pref['prefix']
def get_name(asn: str) -> str:
req = requests.get(
'https://stat.ripe.net/data/as-overview/data.json',
params={'resource': asn}
)
data = req.json()
return data['data']['holder']
if __name__ == '__main__': if __name__ == '__main__':
log = logging.getLogger('feed_asn') log = logging.getLogger('feed_asn')
@ -41,15 +32,8 @@ if __name__ == '__main__':
DB = database.Database() DB = database.Database()
def add_ranges(path: database.Path, for path in DB.list_asn():
match: database.Match,
) -> None:
assert isinstance(path, database.AsnPath)
assert isinstance(match, database.AsnNode)
asn_str = database.Database.unpack_asn(path) asn_str = database.Database.unpack_asn(path)
DB.enter_step('asn_get_name')
name = get_name(asn_str)
match.name = name
DB.enter_step('asn_get_ranges') DB.enter_step('asn_get_ranges')
for prefix in get_ranges(asn_str): for prefix in get_ranges(asn_str):
parsed_prefix: IPNetwork = ipaddress.ip_network(prefix) parsed_prefix: IPNetwork = ipaddress.ip_network(prefix)
@ -59,13 +43,10 @@ if __name__ == '__main__':
source=path, source=path,
updated=int(time.time()) updated=int(time.time())
) )
log.info('Added %s from %s (%s)', prefix, path, name) log.info('Added %s from %s (%s)', prefix, asn_str, path)
elif parsed_prefix.version == 6: elif parsed_prefix.version == 6:
log.warning('Unimplemented prefix version: %s', prefix) log.warning('Unimplemented prefix version: %s', prefix)
else: else:
log.error('Unknown prefix version: %s', prefix) log.error('Unknown prefix version: %s', prefix)
for _ in DB.exec_each_asn(add_ranges):
pass
DB.save() DB.save()
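
The right-hand side drops the `get_name()` helper and the AS holder name from the log line. For reference, a standalone copy of the removed helper (it queries the RIPE stat as-overview endpoint, so it needs network access):

```python
import requests

def get_name(asn: str) -> str:
    req = requests.get(
        'https://stat.ripe.net/data/as-overview/data.json',
        params={'resource': asn},
    )
    data = req.json()
    return data['data']['holder']

# e.g. get_name('AS3333') returns the holder name RIPE records for that AS.
```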

feed_dns.old.py (new executable file, 147 lines)

@ -0,0 +1,147 @@
#!/usr/bin/env python3
import argparse
import database
import logging
import sys
import typing
import enum
RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR')
Record = typing.Tuple[RecordType, int, str, str]
# select, write
FUNCTION_MAP: typing.Any = {
RecordType.A: (
database.Database.get_ip4,
database.Database.set_hostname,
),
RecordType.CNAME: (
database.Database.get_domain,
database.Database.set_hostname,
),
RecordType.PTR: (
database.Database.get_domain,
database.Database.set_ip4address,
),
}
class Parser():
def __init__(self, buf: typing.Any) -> None:
self.buf = buf
self.log = logging.getLogger('parser')
self.db = database.Database()
def end(self) -> None:
self.db.save()
def register(self,
rtype: RecordType,
updated: int,
name: str,
value: str
) -> None:
self.db.enter_step('register')
select, write = FUNCTION_MAP[rtype]
for source in select(self.db, value):
# write(self.db, name, updated, source=source)
write(self.db, name, updated)
def consume(self) -> None:
raise NotImplementedError
class Rapid7Parser(Parser):
TYPES = {
'a': RecordType.A,
'aaaa': RecordType.AAAA,
'cname': RecordType.CNAME,
'ptr': RecordType.PTR,
}
def consume(self) -> None:
data = dict()
for line in self.buf:
self.db.enter_step('parse_rapid7')
split = line.split('"')
for k in range(1, 14, 4):
key = split[k]
val = split[k+2]
data[key] = val
self.register(
Rapid7Parser.TYPES[data['type']],
int(data['timestamp']),
data['name'],
data['value']
)
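
A worked example of the quote-splitting in `Rapid7Parser.consume` above, using an illustrative line in the flat JSON shape the parser assumes (four key/value pairs in a fixed order):

```python
line = ('{"timestamp":"1570000000","name":"track.example.com",'
        '"type":"a","value":"93.184.216.34"}')
split = line.split('"')
data = {}
for k in range(1, 14, 4):
    data[split[k]] = split[k + 2]
print(data)
# {'timestamp': '1570000000', 'name': 'track.example.com',
#  'type': 'a', 'value': '93.184.216.34'}
```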
class DnsMassParser(Parser):
# dnsmass --output Snrql
# --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
TYPES = {
'A': (RecordType.A, -1, None),
'AAAA': (RecordType.AAAA, -1, None),
'CNAME': (RecordType.CNAME, -1, -1),
}
def consume(self) -> None:
self.db.enter_step('parse_dnsmass')
timestamp = 0
header = True
for line in self.buf:
line = line[:-1]
if not line:
header = True
continue
split = line.split(' ')
try:
if header:
timestamp = int(split[1])
header = False
else:
dtype, name_offset, value_offset = \
DnsMassParser.TYPES[split[1]]
self.register(
dtype,
timestamp,
split[0][:name_offset],
split[2][:value_offset],
)
self.db.enter_step('parse_dnsmass')
except KeyError:
continue
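
A sketch of the input shape `DnsMassParser.consume` appears to expect: blank-line separated blocks, a header line whose second space-separated token is a timestamp, then `name. TYPE value` records whose trailing dots are trimmed by the per-type offsets. The sample block below is illustrative, not actual massdns output:

```python
TYPES = {
    'A': (-1, None),    # strip trailing dot from the name, keep the value
    'CNAME': (-1, -1),  # strip trailing dot from both
}

block = """;; 1570000000
track.example.com. A 93.184.216.34
cdn.example.com. CNAME tracker.example.net.
"""

timestamp = 0
header = True
for line in block.splitlines():
    if not line:
        header = True
        continue
    split = line.split(' ')
    if header:
        timestamp = int(split[1])
        header = False
        continue
    name_offset, value_offset = TYPES[split[1]]
    print(timestamp, split[0][:name_offset], split[2][:value_offset])
# 1570000000 track.example.com 93.184.216.34
# 1570000000 cdn.example.com tracker.example.net
```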
PARSERS = {
'rapid7': Rapid7Parser,
'dnsmass': DnsMassParser,
}
if __name__ == '__main__':
# Parsing arguments
log = logging.getLogger('feed_dns')
args_parser = argparse.ArgumentParser(
description="TODO")
args_parser.add_argument(
'parser',
choices=PARSERS.keys(),
help="TODO")
args_parser.add_argument(
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
help="TODO")
args = args_parser.parse_args()
parser = PARSERS[args.parser](args.input)
try:
parser.consume()
except KeyboardInterrupt:
pass
parser.end()


@ -51,7 +51,8 @@ class Writer(multiprocessing.Process):
try: try:
for source in select(self.db, value): for source in select(self.db, value):
write(self.db, name, updated, source=source) # write(self.db, name, updated, source=source)
write(self.db, name, updated)
except ValueError: except ValueError:
self.log.exception("Cannot execute: %s", record) self.log.exception("Cannot execute: %s", record)
@ -181,10 +182,10 @@ if __name__ == '__main__':
'-j', '--workers', type=int, default=4, '-j', '--workers', type=int, default=4,
help="TODO") help="TODO")
args_parser.add_argument( args_parser.add_argument(
'-b', '--block-size', type=int, default=1024, '-b', '--block-size', type=int, default=100,
help="TODO") help="TODO")
args_parser.add_argument( args_parser.add_argument(
'-q', '--queue-size', type=int, default=128, '-q', '--queue-size', type=int, default=10,
help="TODO") help="TODO")
args = args_parser.parse_args() args = args_parser.parse_args()


@ -32,16 +32,10 @@ if __name__ == '__main__':
fun = FUNCTION_MAP[args.type] fun = FUNCTION_MAP[args.type]
source: database.RulePath
if args.first_party:
source = database.RuleFirstPath()
else:
source = database.RuleMultiPath()
for rule in args.input: for rule in args.input:
fun(DB, fun(DB,
rule.strip(), rule.strip(),
source=source, # is_first_party=args.first_party,
updated=int(time.time()), updated=int(time.time()),
) )


@@ -4,25 +4,21 @@ function log() {
 echo -e "\033[33m$@\033[0m"
 }
-log "Pruning old data…"
-./database.py --prune
-log "Recounting references…"
-./database.py --references
 log "Exporting lists…"
 ./export.py --first-party --output dist/firstparty-trackers.txt
-./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt
+./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt
 ./export.py --output dist/multiparty-trackers.txt
-./export.py --end-chain --output --no-dupplicates dist/multiparty-only-trackers.txt
+./export.py --end-chain --output dist/multiparty-only-trackers.txt
-log "Generating statistics…"
-./export.py --count --first-party > temp/count_recs_firstparty.txt
-./export.py --count > temp/count_recs_multiparty.txt
-./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
-./export.py --rules --count > temp/count_rules_multiparty.txt
-log "Sorting lists…"
-sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt
-sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt
-sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt
-sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt
 log "Generating hosts lists…"
+./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
+./export.py --rules --count > temp/count_rules_multiparty.txt
 function generate_hosts {
 basename="$1"
 description="$2"
@@ -50,15 +46,13 @@ function generate_hosts {
 echo "# Generation software: eulaurarien $(git describe --tags)"
 echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
 echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
-echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO
+echo "# Number of source DNS records: ~2M + $(wc -l temp/all_resolved.json | cut -d' ' -f1)"
 echo "#"
 echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
-echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)"
 echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
 echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
 echo "#"
 echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
-echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)"
 echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
 echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
 echo


@@ -6,11 +6,11 @@ function log() {
 log "Importing rules…"
 BEFORE="$(date +%s)"
-cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
-cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
-cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone
-cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
-cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn
+# cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
+# cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
+# cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone
+# cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
+# cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn
 cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party
 cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party