Compare commits

...

8 commits

Author SHA1 Message Date
Geoffrey Frogeye e882e09b37
Added outdated documentation warning in README 2019-12-17 14:27:43 +01:00
Geoffrey Frogeye d65107f849
Save dupplicates too
Maybe I won't publish them but this will help me for tracking trackers.
2019-12-17 14:10:41 +01:00
Geoffrey Frogeye ea0855bd00
Forgot to push this little guy
Good thing I cleaned up my working directory.
It only exists because pickles created from database.py itself
won't be openable from a file simply importing database.py.
So we create it when in 'imported state'.
2019-12-17 13:50:39 +01:00
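For context on the commit message above, here is a minimal, hypothetical sketch (not taken from this changeset) of the pickle behaviour it describes: a class defined in a file that is run directly gets `__module__ == '__main__'`, and pickle records classes by module and name, so a pickle written that way cannot be loaded by a script that merely imports the module.

```python
#!/usr/bin/env python3
# Hypothetical, standalone illustration — not part of this repository.
import pickle


class Thing:
    pass


if __name__ == '__main__':
    print(Thing.__module__)       # '__main__' when this file is run directly
    blob = pickle.dumps(Thing())  # the stream references __main__.Thing
    # A different script that only imports this module has its own __main__,
    # so pickle.loads(blob) there raises
    # AttributeError: Can't get attribute 'Thing' on <module '__main__' ...>.
    print(len(blob))
```

Presumably this is why the database is meant to be built from "imported state" (as the new db.py below does with `import database`), so the pickled classes are recorded under the `database` module rather than `__main__`.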
Geoffrey Frogeye 7851b038f5
Reworked rule export 2019-12-17 13:30:24 +01:00
Geoffrey Frogeye 8f6e01c857
Added first_party tracking
Well, tracking whether a rule comes from a first-party or a multi-party rule...
Hope I did not make any mistakes
2019-12-16 19:09:02 +01:00
Geoffrey Frogeye c3bf102289
Made references work 2019-12-16 14:18:03 +01:00
Geoffrey Frogeye 03a4042238
Added level
Also fixed the IP logic because it was really messed up
2019-12-16 09:31:29 +01:00
Geoffrey Frogeye 3197fa1663
Remove list usage for IpTreeNode 2019-12-16 06:54:18 +01:00
10 changed files with 419 additions and 292 deletions

README.md

@ -26,6 +26,8 @@ That's where this scripts comes in, to generate a list of such subdomains.

## How does this script work

> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.

It takes an input a list of websites with trackers included.
So far, this list is manually-generated from the list of clients of such first-party trackers
(latter we should use a general list of websites to be more exhaustive).

@ -38,6 +40,8 @@ It finally outputs the matching ones.

## Requirements

> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.

Just to build the list, you can find an already-built list in the releases.

- Bash

@ -54,6 +58,8 @@ Just to build the list, you can find an already-built list in the releases.

## Usage

> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.

This is only if you want to build the list yourself.
If you just want to use the list, the latest build is available here: <https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt>
It was build using additional sources not included in this repository for privacy reasons.
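Going back to the "How does this script work" section above, here is a small, hypothetical illustration of the matching idea it describes — a candidate subdomain is kept when the record it resolves to falls under a known tracker zone. The zone names and helper below are made up for the example, not taken from the rule files:

```python
# Illustrative only: the zone set and function name are not from this repository.
TRACKER_ZONES = {"tracker.example", "telemetry.example"}


def resolves_to_tracker(cname_target: str) -> bool:
    """True if the CNAME target lies inside a known tracker zone."""
    parts = cname_target.rstrip(".").split(".")
    # Test every suffix: a.b.tracker.example is covered by the zone tracker.example.
    return any(".".join(parts[i:]) in TRACKER_ZONES for i in range(len(parts)))


assert resolves_to_tracker("cdn.a.tracker.example")
assert not resolves_to_tracker("www.example.org")
```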

database.py

@ -26,57 +26,76 @@ class Path():
class RulePath(Path): class RulePath(Path):
pass def __str__(self) -> str:
return '(rule)'
class RuleFirstPath(RulePath):
def __str__(self) -> str:
return '(first-party rule)'
class RuleMultiPath(RulePath):
def __str__(self) -> str:
return '(multi-party rule)'
class DomainPath(Path): class DomainPath(Path):
def __init__(self, path: typing.List[str]): def __init__(self, parts: typing.List[str]):
self.path = path self.parts = parts
def __str__(self) -> str:
return '?.' + Database.unpack_domain(self)
class HostnamePath(DomainPath): class HostnamePath(DomainPath):
pass def __str__(self) -> str:
return Database.unpack_domain(self)
class ZonePath(DomainPath): class ZonePath(DomainPath):
pass def __str__(self) -> str:
return '*.' + Database.unpack_domain(self)
class AsnPath(Path): class AsnPath(Path):
def __init__(self, asn: Asn): def __init__(self, asn: Asn):
self.asn = asn self.asn = asn
def __str__(self) -> str:
return Database.unpack_asn(self)
class Ip4Path(Path): class Ip4Path(Path):
def __init__(self, value: int, prefixlen: int): def __init__(self, value: int, prefixlen: int):
self.value = value self.value = value
self.prefixlen = prefixlen self.prefixlen = prefixlen
def __str__(self) -> str:
return Database.unpack_ip4network(self)
class Match(): class Match():
def __init__(self) -> None: def __init__(self) -> None:
self.source: typing.Optional[Path] = None
self.updated: int = 0 self.updated: int = 0
self.dupplicate: bool = False
# Cache
self.level: int = 0 self.level: int = 0
self.source: Path = RulePath() self.first_party: bool = False
# FP dupplicate args self.references: int = 0
def set(self, def active(self, first_party: bool = None) -> bool:
updated: int, if self.updated == 0 or (first_party and not self.first_party):
level: int, return False
source: Path, return True
) -> None:
if updated > self.updated or level > self.level:
self.updated = updated
self.level = level
self.source = source
# FP dupplicate function
def active(self) -> bool:
return self.updated > 0
class AsnNode(Match): class AsnNode(Match):
pass def __init__(self) -> None:
Match.__init__(self)
self.name = ''
class DomainTreeNode(): class DomainTreeNode():
@ -86,16 +105,16 @@ class DomainTreeNode():
self.match_hostname = Match() self.match_hostname = Match()
class IpTreeNode(): class IpTreeNode(Match):
def __init__(self) -> None: def __init__(self) -> None:
self.children: typing.List[typing.Optional[IpTreeNode]] = [None, None] Match.__init__(self)
self.match = Match() self.zero: typing.Optional[IpTreeNode] = None
self.one: typing.Optional[IpTreeNode] = None
Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode] Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode]
NodeCallable = typing.Callable[[Path, MatchCallable = typing.Callable[[Path,
Node, Match],
typing.Optional[typing.Any]],
typing.Any] typing.Any]
@ -108,7 +127,6 @@ class Profiler():
self.step_dict: typing.Dict[str, int] = dict() self.step_dict: typing.Dict[str, int] = dict()
def enter_step(self, name: str) -> None: def enter_step(self, name: str) -> None:
return
now = time.perf_counter() now = time.perf_counter()
try: try:
self.time_dict[self.time_step] += now - self.time_last self.time_dict[self.time_step] += now - self.time_last
@ -131,13 +149,21 @@ class Profiler():
class Database(Profiler): class Database(Profiler):
VERSION = 10 VERSION = 18
PATH = "blocking.p" PATH = "blocking.p"
def initialize(self) -> None: def initialize(self) -> None:
self.log.warning( self.log.warning(
"Creating database version: %d ", "Creating database version: %d ",
Database.VERSION) Database.VERSION)
# Dummy match objects that everything refer to
self.rules: typing.List[Match] = list()
for first_party in (False, True):
m = Match()
m.updated = 1
m.level = 0
m.first_party = first_party
self.rules.append(m)
self.domtree = DomainTreeNode() self.domtree = DomainTreeNode()
self.asns: typing.Dict[Asn, AsnNode] = dict() self.asns: typing.Dict[Asn, AsnNode] = dict()
self.ip4tree = IpTreeNode() self.ip4tree = IpTreeNode()
@ -148,7 +174,7 @@ class Database(Profiler):
with open(self.PATH, 'rb') as db_fdsec: with open(self.PATH, 'rb') as db_fdsec:
version, data = pickle.load(db_fdsec) version, data = pickle.load(db_fdsec)
if version == Database.VERSION: if version == Database.VERSION:
self.domtree, self.asns, self.ip4tree = data self.rules, self.domtree, self.asns, self.ip4tree = data
return return
self.log.warning( self.log.warning(
"Outdated database version found: %d, " "Outdated database version found: %d, "
@ -165,7 +191,7 @@ class Database(Profiler):
def save(self) -> None: def save(self) -> None:
self.enter_step('save') self.enter_step('save')
with open(self.PATH, 'wb') as db_fdsec: with open(self.PATH, 'wb') as db_fdsec:
data = self.domtree, self.asns, self.ip4tree data = self.rules, self.domtree, self.asns, self.ip4tree
pickle.dump((self.VERSION, data), db_fdsec) pickle.dump((self.VERSION, data), db_fdsec)
self.profile() self.profile()
@ -180,7 +206,7 @@ class Database(Profiler):
@staticmethod @staticmethod
def unpack_domain(domain: DomainPath) -> str: def unpack_domain(domain: DomainPath) -> str:
return '.'.join(domain.path[::-1]) return '.'.join(domain.parts[::-1])
@staticmethod @staticmethod
def pack_asn(asn: str) -> AsnPath: def pack_asn(asn: str) -> AsnPath:
@ -229,94 +255,227 @@ class Database(Profiler):
addr >>= 8 addr >>= 8
return '.'.join(map(str, octets)) + '/' + str(network.prefixlen) return '.'.join(map(str, octets)) + '/' + str(network.prefixlen)
def get_match(self, path: Path) -> Match:
if isinstance(path, RuleMultiPath):
return self.rules[0]
elif isinstance(path, RuleFirstPath):
return self.rules[1]
elif isinstance(path, AsnPath):
return self.asns[path.asn]
elif isinstance(path, DomainPath):
dicd = self.domtree
for part in path.parts:
dicd = dicd.children[part]
if isinstance(path, HostnamePath):
return dicd.match_hostname
elif isinstance(path, ZonePath):
return dicd.match_zone
else:
raise ValueError
elif isinstance(path, Ip4Path):
dici = self.ip4tree
for i in range(31, 31-path.prefixlen, -1):
bit = (path.value >> i) & 0b1
dici_next = dici.one if bit else dici.zero
if not dici_next:
raise IndexError
dici = dici_next
return dici
else:
raise ValueError
def exec_each_asn(self,
callback: MatchCallable,
) -> typing.Any:
for asn in self.asns:
match = self.asns[asn]
if match.active():
c = callback(
AsnPath(asn),
match,
)
try:
yield from c
except TypeError: # not iterable
pass
def exec_each_domain(self, def exec_each_domain(self,
callback: NodeCallable, callback: MatchCallable,
arg: typing.Any = None,
_dic: DomainTreeNode = None, _dic: DomainTreeNode = None,
_par: DomainPath = None, _par: DomainPath = None,
) -> typing.Any: ) -> typing.Any:
_dic = _dic or self.domtree _dic = _dic or self.domtree
_par = _par or DomainPath([]) _par = _par or DomainPath([])
yield from callback(_par, _dic, arg) if _dic.match_hostname.active():
c = callback(
HostnamePath(_par.parts),
_dic.match_hostname,
)
try:
yield from c
except TypeError: # not iterable
pass
if _dic.match_zone.active():
c = callback(
ZonePath(_par.parts),
_dic.match_zone,
)
try:
yield from c
except TypeError: # not iterable
pass
for part in _dic.children: for part in _dic.children:
dic = _dic.children[part] dic = _dic.children[part]
yield from self.exec_each_domain( yield from self.exec_each_domain(
callback, callback,
arg,
_dic=dic, _dic=dic,
_par=DomainPath(_par.path + [part]) _par=DomainPath(_par.parts + [part])
) )
def exec_each_ip4(self, def exec_each_ip4(self,
callback: NodeCallable, callback: MatchCallable,
arg: typing.Any = None,
_dic: IpTreeNode = None, _dic: IpTreeNode = None,
_par: Ip4Path = None, _par: Ip4Path = None,
) -> typing.Any: ) -> typing.Any:
_dic = _dic or self.ip4tree _dic = _dic or self.ip4tree
_par = _par or Ip4Path(0, 0) _par = _par or Ip4Path(0, 0)
callback(_par, _dic, arg) if _dic.active():
c = callback(
_par,
_dic,
)
try:
yield from c
except TypeError: # not iterable
pass
# 0 # 0
dic = _dic.children[0] pref = _par.prefixlen + 1
dic = _dic.zero
if dic: if dic:
addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-_par.prefixlen))) addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-pref)))
assert addr0 == _par.value assert addr0 == _par.value
yield from self.exec_each_ip4( yield from self.exec_each_ip4(
callback, callback,
arg,
_dic=dic, _dic=dic,
_par=Ip4Path(addr0, _par.prefixlen+1) _par=Ip4Path(addr0, pref)
) )
# 1 # 1
dic = _dic.children[1] dic = _dic.one
if dic: if dic:
addr1 = _par.value | (1 << (32-_par.prefixlen)) addr1 = _par.value | (1 << (32-pref))
yield from self.exec_each_ip4( yield from self.exec_each_ip4(
callback, callback,
arg,
_dic=dic, _dic=dic,
_par=Ip4Path(addr1, _par.prefixlen+1) _par=Ip4Path(addr1, pref)
) )
def exec_each(self, def exec_each(self,
callback: NodeCallable, callback: MatchCallable,
arg: typing.Any = None,
) -> typing.Any: ) -> typing.Any:
yield from self.exec_each_domain(callback) yield from self.exec_each_domain(callback)
yield from self.exec_each_ip4(callback) yield from self.exec_each_ip4(callback)
yield from self.exec_each_asn(callback)
def update_references(self) -> None: def update_references(self) -> None:
raise NotImplementedError # Should be correctly calculated normally,
# keeping this just in case
def reset_references_cb(path: Path,
match: Match
) -> None:
match.references = 0
for _ in self.exec_each(reset_references_cb):
pass
def increment_references_cb(path: Path,
match: Match
) -> None:
if match.source:
source = self.get_match(match.source)
source.references += 1
for _ in self.exec_each(increment_references_cb):
pass
def prune(self, before: int, base_only: bool = False) -> None: def prune(self, before: int, base_only: bool = False) -> None:
raise NotImplementedError raise NotImplementedError
def explain(self, entry: int) -> str: def explain(self, path: Path) -> str:
raise NotImplementedError match = self.get_match(path)
if isinstance(match, AsnNode):
string = f'{path} ({match.name}) #{match.references}'
else:
string = f'{path} #{match.references}'
if match.source:
string += f'{self.explain(match.source)}'
return string
def export(self, def export(self,
first_party_only: bool = False, first_party_only: bool = False,
end_chain_only: bool = False, end_chain_only: bool = False,
no_dupplicates: bool = False,
explain: bool = False, explain: bool = False,
) -> typing.Iterable[str]: ) -> typing.Iterable[str]:
if first_party_only or end_chain_only or explain:
raise NotImplementedError
def export_cb(path: Path, node: Node, _: typing.Any def export_cb(path: Path, match: Match
) -> typing.Iterable[str]: ) -> typing.Iterable[str]:
assert isinstance(path, DomainPath) assert isinstance(path, DomainPath)
assert isinstance(node, DomainTreeNode) if not isinstance(path, HostnamePath):
if node.match_hostname: return
a = self.unpack_domain(path) if first_party_only and not match.first_party:
yield a return
if end_chain_only and match.references > 0:
return
if no_dupplicates and match.dupplicate:
return
if explain:
yield self.explain(path)
else:
yield self.unpack_domain(path)
yield from self.exec_each_domain(export_cb, None) yield from self.exec_each_domain(export_cb)
def count_rules(self, def list_rules(self,
first_party_only: bool = False, first_party_only: bool = False,
) -> typing.Iterable[str]:
def list_rules_cb(path: Path, match: Match
) -> typing.Iterable[str]:
if first_party_only and not match.first_party:
return
if isinstance(path, ZonePath) \
or (isinstance(path, Ip4Path) and path.prefixlen < 32):
# if match.level == 1:
# It should be the latter condition but it is more
# useful when using the former
yield self.explain(path)
yield from self.exec_each(list_rules_cb)
def count_records(self,
first_party_only: bool = False,
rules_only: bool = False,
no_dupplicates: bool = False,
) -> str: ) -> str:
raise NotImplementedError memo: typing.Dict[str, int] = dict()
def count_records_cb(path: Path, match: Match) -> None:
if first_party_only and not match.first_party:
return
if rules_only and match.level > 1:
return
if no_dupplicates and match.dupplicate:
return
try:
memo[path.__class__.__name__] += 1
except KeyError:
memo[path.__class__.__name__] = 1
for _ in self.exec_each(count_records_cb):
pass
split: typing.List[str] = list()
for key, value in sorted(memo.items(), key=lambda s: s[0]):
split.append(f'{key[:-4]}: {value}')
return ', '.join(split)
def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]: def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:
self.enter_step('get_domain_pack') self.enter_step('get_domain_pack')
@ -324,10 +483,10 @@ class Database(Profiler):
self.enter_step('get_domain_brws') self.enter_step('get_domain_brws')
dic = self.domtree dic = self.domtree
depth = 0 depth = 0
for part in domain.path: for part in domain.parts:
if dic.match_zone.active(): if dic.match_zone.active():
self.enter_step('get_domain_yield') self.enter_step('get_domain_yield')
yield ZonePath(domain.path[:depth]) yield ZonePath(domain.parts[:depth])
self.enter_step('get_domain_brws') self.enter_step('get_domain_brws')
if part not in dic.children: if part not in dic.children:
return return
@ -335,61 +494,85 @@ class Database(Profiler):
depth += 1 depth += 1
if dic.match_zone.active(): if dic.match_zone.active():
self.enter_step('get_domain_yield') self.enter_step('get_domain_yield')
yield ZonePath(domain.path) yield ZonePath(domain.parts)
if dic.match_hostname.active(): if dic.match_hostname.active():
self.enter_step('get_domain_yield') self.enter_step('get_domain_yield')
yield HostnamePath(domain.path) yield HostnamePath(domain.parts)
def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]: def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]:
self.enter_step('get_ip4_pack') self.enter_step('get_ip4_pack')
ip4 = self.pack_ip4address(ip4_str) ip4 = self.pack_ip4address(ip4_str)
self.enter_step('get_ip4_brws') self.enter_step('get_ip4_brws')
dic = self.ip4tree dic = self.ip4tree
for i in reversed(range(ip4.prefixlen)): for i in range(31, 31-ip4.prefixlen, -1):
part = (ip4.value >> i) & 0b1 bit = (ip4.value >> i) & 0b1
if dic.match.active(): if dic.active():
self.enter_step('get_ip4_yield') self.enter_step('get_ip4_yield')
yield Ip4Path(ip4.value, 32-i) yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i)
self.enter_step('get_ip4_brws') self.enter_step('get_ip4_brws')
next_dic = dic.children[part] next_dic = dic.one if bit else dic.zero
if next_dic is None: if next_dic is None:
return return
dic = next_dic dic = next_dic
if dic.match.active(): if dic.active():
self.enter_step('get_ip4_yield') self.enter_step('get_ip4_yield')
yield ip4 yield ip4
def list_asn(self) -> typing.Iterable[AsnPath]: def _set_match(self,
for asn in self.asns: match: Match,
yield AsnPath(asn) updated: int,
source: Path,
source_match: Match = None,
dupplicate: bool = False,
) -> None:
# source_match is in parameters because most of the time
# its parent function needs it too,
# so it can pass it to save a traversal
source_match = source_match or self.get_match(source)
new_level = source_match.level + 1
if updated > match.updated or new_level < match.level \
or source_match.first_party > match.first_party:
# NOTE FP and level of matches referencing this one
# won't be updated until run or prune
if match.source:
old_source = self.get_match(match.source)
old_source.references -= 1
match.updated = updated
match.level = new_level
match.first_party = source_match.first_party
match.source = source
source_match.references += 1
match.dupplicate = dupplicate
def _set_domain(self, def _set_domain(self,
hostname: bool, hostname: bool,
domain_str: str, domain_str: str,
updated: int, updated: int,
is_first_party: bool = None, source: Path) -> None:
source: Path = None) -> None:
self.enter_step('set_domain_pack') self.enter_step('set_domain_pack')
if is_first_party:
raise NotImplementedError
domain = self.pack_domain(domain_str) domain = self.pack_domain(domain_str)
self.enter_step('set_domain_fp')
source_match = self.get_match(source)
is_first_party = source_match.first_party
self.enter_step('set_domain_brws') self.enter_step('set_domain_brws')
dic = self.domtree dic = self.domtree
for part in domain.path: dupplicate = False
if dic.match_zone.active(): for part in domain.parts:
# Refuse to add domain whose zone is already matching
return
if part not in dic.children: if part not in dic.children:
dic.children[part] = DomainTreeNode() dic.children[part] = DomainTreeNode()
dic = dic.children[part] dic = dic.children[part]
if dic.match_zone.active(is_first_party):
dupplicate = True
if hostname: if hostname:
match = dic.match_hostname match = dic.match_hostname
else: else:
match = dic.match_zone match = dic.match_zone
match.set( self._set_match(
match,
updated, updated,
0, # TODO Level source,
source or RulePath(), source_match=source_match,
dupplicate=dupplicate,
) )
def set_hostname(self, def set_hostname(self,
@ -405,42 +588,48 @@ class Database(Profiler):
def set_asn(self, def set_asn(self,
asn_str: str, asn_str: str,
updated: int, updated: int,
is_first_party: bool = None, source: Path) -> None:
source: Path = None) -> None:
self.enter_step('set_asn') self.enter_step('set_asn')
if is_first_party:
raise NotImplementedError
path = self.pack_asn(asn_str) path = self.pack_asn(asn_str)
if path.asn in self.asns:
match = self.asns[path.asn]
else:
match = AsnNode() match = AsnNode()
match.set(
updated,
0,
source or RulePath()
)
self.asns[path.asn] = match self.asns[path.asn] = match
self._set_match(
match,
updated,
source,
)
def _set_ip4(self, def _set_ip4(self,
ip4: Ip4Path, ip4: Ip4Path,
updated: int, updated: int,
is_first_party: bool = None, source: Path) -> None:
source: Path = None) -> None: self.enter_step('set_ip4_fp')
if is_first_party: source_match = self.get_match(source)
raise NotImplementedError is_first_party = source_match.first_party
self.enter_step('set_ip4_brws')
dic = self.ip4tree dic = self.ip4tree
for i in reversed(range(ip4.prefixlen)): dupplicate = False
part = (ip4.value >> i) & 0b1 for i in range(31, 31-ip4.prefixlen, -1):
if dic.match.active(): bit = (ip4.value >> i) & 0b1
# Refuse to add ip4* whose network is already matching next_dic = dic.one if bit else dic.zero
return
next_dic = dic.children[part]
if next_dic is None: if next_dic is None:
next_dic = IpTreeNode() next_dic = IpTreeNode()
dic.children[part] = next_dic if bit:
dic.one = next_dic
else:
dic.zero = next_dic
dic = next_dic dic = next_dic
dic.match.set( if dic.active(is_first_party):
dupplicate = True
self._set_match(
dic,
updated, updated,
0, # TODO Level source,
source or RulePath(), source_match=source_match,
dupplicate=dupplicate,
) )
def set_ip4address(self, def set_ip4address(self,
@ -449,7 +638,6 @@ class Database(Profiler):
) -> None: ) -> None:
self.enter_step('set_ip4add_pack') self.enter_step('set_ip4add_pack')
ip4 = self.pack_ip4address(ip4address_str) ip4 = self.pack_ip4address(ip4address_str)
self.enter_step('set_ip4add_brws')
self._set_ip4(ip4, *args, **kwargs) self._set_ip4(ip4, *args, **kwargs)
def set_ip4network(self, def set_ip4network(self,
@ -458,5 +646,4 @@ class Database(Profiler):
) -> None: ) -> None:
self.enter_step('set_ip4net_pack') self.enter_step('set_ip4net_pack')
ip4 = self.pack_ip4network(ip4network_str) ip4 = self.pack_ip4network(ip4network_str)
self.enter_step('set_ip4net_brws')
self._set_ip4(ip4, *args, **kwargs) self._set_ip4(ip4, *args, **kwargs)
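A note on the reworked `IpTreeNode` above (explicit `zero`/`one` children instead of a two-element list): `get_ip4()` and `_set_ip4()` walk the tree one bit at a time, most-significant bit first, over the packed 32-bit address. Here is a small sketch of that bit walk, with an illustrative `pack()` helper that is not the repository's `pack_ip4address()`:

```python
# Illustrative only: mirrors the MSB-first bit walk used by get_ip4()/_set_ip4().
def pack(addr: str) -> int:
    value = 0
    for octet in addr.split("."):
        value = (value << 8) | int(octet)
    return value


value = pack("198.51.100.7")             # 0xC6336407
prefixlen = 24                           # walk the first 24 bits, as for a /24
path_bits = []
for i in range(31, 31 - prefixlen, -1):  # same loop bounds as in the diff above
    bit = (value >> i) & 0b1
    path_bits.append(bit)                # descend into node.one if bit else node.zero

print("".join(map(str, path_bits)))      # 110001100011001101100100
```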

db.py (new executable file, +44 lines)

@ -0,0 +1,44 @@
#!/usr/bin/env python3
import argparse
import database
import time
import os
if __name__ == '__main__':
# Parsing arguments
parser = argparse.ArgumentParser(
description="Database operations")
parser.add_argument(
'-i', '--initialize', action='store_true',
help="Reconstruct the whole database")
parser.add_argument(
'-p', '--prune', action='store_true',
help="Remove old entries from database")
parser.add_argument(
'-b', '--prune-base', action='store_true',
help="TODO")
parser.add_argument(
'-s', '--prune-before', type=int,
default=(int(time.time()) - 60*60*24*31*6),
help="TODO")
parser.add_argument(
'-r', '--references', action='store_true',
help="Update the reference count")
args = parser.parse_args()
if not args.initialize:
DB = database.Database()
else:
if os.path.isfile(database.Database.PATH):
os.unlink(database.Database.PATH)
DB = database.Database()
DB.enter_step('main')
if args.prune:
DB.prune(before=args.prune_before, base_only=args.prune_base)
if args.references:
DB.update_references()
DB.save()

export.py

@ -25,6 +25,9 @@ if __name__ == '__main__':
parser.add_argument( parser.add_argument(
'-r', '--rules', action='store_true', '-r', '--rules', action='store_true',
help="TODO") help="TODO")
parser.add_argument(
'-d', '--no-dupplicates', action='store_true',
help="TODO")
parser.add_argument( parser.add_argument(
'-c', '--count', action='store_true', '-c', '--count', action='store_true',
help="TODO") help="TODO")
@ -32,16 +35,20 @@ if __name__ == '__main__':
DB = database.Database() DB = database.Database()
if args.rules:
if not args.count:
raise NotImplementedError
print(DB.count_rules(first_party_only=args.first_party))
else:
if args.count: if args.count:
raise NotImplementedError print(DB.count_records(
first_party_only=args.first_party,
rules_only=args.rules,
no_dupplicates=args.no_dupplicates,
))
else:
if args.rules:
for line in DB.list_rules():
print(line)
for domain in DB.export( for domain in DB.export(
first_party_only=args.first_party, first_party_only=args.first_party,
end_chain_only=args.end_chain, end_chain_only=args.end_chain,
no_dupplicates=args.no_dupplicates,
explain=args.explain, explain=args.explain,
): ):
print(domain, file=args.output) print(domain, file=args.output)


@ -4,21 +4,25 @@ function log() {
echo -e "\033[33m$@\033[0m" echo -e "\033[33m$@\033[0m"
} }
log "Pruning old data…"
./database.py --prune
log "Recounting references…"
./database.py --references
log "Exporting lists…" log "Exporting lists…"
./export.py --first-party --output dist/firstparty-trackers.txt ./export.py --first-party --output dist/firstparty-trackers.txt
./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt ./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt
./export.py --output dist/multiparty-trackers.txt ./export.py --output dist/multiparty-trackers.txt
./export.py --end-chain --output dist/multiparty-only-trackers.txt ./export.py --end-chain --output --no-dupplicates dist/multiparty-only-trackers.txt
log "Generating hosts lists…" log "Generating statistics…"
./export.py --count --first-party > temp/count_recs_firstparty.txt
./export.py --count > temp/count_recs_multiparty.txt
./export.py --rules --count --first-party > temp/count_rules_firstparty.txt ./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
./export.py --rules --count > temp/count_rules_multiparty.txt ./export.py --rules --count > temp/count_rules_multiparty.txt
log "Sorting lists…"
sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt
sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt
sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt
sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt
log "Generating hosts lists…"
function generate_hosts { function generate_hosts {
basename="$1" basename="$1"
description="$2" description="$2"
@ -46,13 +50,15 @@ function generate_hosts {
echo "# Generation software: eulaurarien $(git describe --tags)" echo "# Generation software: eulaurarien $(git describe --tags)"
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of source DNS records: ~2M + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO
echo "#" echo "#"
echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)" echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)"
echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
echo "#" echo "#"
echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)" echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)"
echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
echo echo

feed_asn.py

@ -21,6 +21,15 @@ def get_ranges(asn: str) -> typing.Iterable[str]:
yield pref['prefix'] yield pref['prefix']
def get_name(asn: str) -> str:
req = requests.get(
'https://stat.ripe.net/data/as-overview/data.json',
params={'resource': asn}
)
data = req.json()
return data['data']['holder']
if __name__ == '__main__': if __name__ == '__main__':
log = logging.getLogger('feed_asn') log = logging.getLogger('feed_asn')
@ -32,8 +41,15 @@ if __name__ == '__main__':
DB = database.Database() DB = database.Database()
for path in DB.list_asn(): def add_ranges(path: database.Path,
match: database.Match,
) -> None:
assert isinstance(path, database.AsnPath)
assert isinstance(match, database.AsnNode)
asn_str = database.Database.unpack_asn(path) asn_str = database.Database.unpack_asn(path)
DB.enter_step('asn_get_name')
name = get_name(asn_str)
match.name = name
DB.enter_step('asn_get_ranges') DB.enter_step('asn_get_ranges')
for prefix in get_ranges(asn_str): for prefix in get_ranges(asn_str):
parsed_prefix: IPNetwork = ipaddress.ip_network(prefix) parsed_prefix: IPNetwork = ipaddress.ip_network(prefix)
@ -43,10 +59,13 @@ if __name__ == '__main__':
source=path, source=path,
updated=int(time.time()) updated=int(time.time())
) )
log.info('Added %s from %s (%s)', prefix, asn_str, path) log.info('Added %s from %s (%s)', prefix, path, name)
elif parsed_prefix.version == 6: elif parsed_prefix.version == 6:
log.warning('Unimplemented prefix version: %s', prefix) log.warning('Unimplemented prefix version: %s', prefix)
else: else:
log.error('Unknown prefix version: %s', prefix) log.error('Unknown prefix version: %s', prefix)
for _ in DB.exec_each_asn(add_ranges):
pass
DB.save() DB.save()


@ -1,147 +0,0 @@
#!/usr/bin/env python3
import argparse
import database
import logging
import sys
import typing
import enum
RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR')
Record = typing.Tuple[RecordType, int, str, str]
# select, write
FUNCTION_MAP: typing.Any = {
RecordType.A: (
database.Database.get_ip4,
database.Database.set_hostname,
),
RecordType.CNAME: (
database.Database.get_domain,
database.Database.set_hostname,
),
RecordType.PTR: (
database.Database.get_domain,
database.Database.set_ip4address,
),
}
class Parser():
def __init__(self, buf: typing.Any) -> None:
self.buf = buf
self.log = logging.getLogger('parser')
self.db = database.Database()
def end(self) -> None:
self.db.save()
def register(self,
rtype: RecordType,
updated: int,
name: str,
value: str
) -> None:
self.db.enter_step('register')
select, write = FUNCTION_MAP[rtype]
for source in select(self.db, value):
# write(self.db, name, updated, source=source)
write(self.db, name, updated)
def consume(self) -> None:
raise NotImplementedError
class Rapid7Parser(Parser):
TYPES = {
'a': RecordType.A,
'aaaa': RecordType.AAAA,
'cname': RecordType.CNAME,
'ptr': RecordType.PTR,
}
def consume(self) -> None:
data = dict()
for line in self.buf:
self.db.enter_step('parse_rapid7')
split = line.split('"')
for k in range(1, 14, 4):
key = split[k]
val = split[k+2]
data[key] = val
self.register(
Rapid7Parser.TYPES[data['type']],
int(data['timestamp']),
data['name'],
data['value']
)
class DnsMassParser(Parser):
# dnsmass --output Snrql
# --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
TYPES = {
'A': (RecordType.A, -1, None),
'AAAA': (RecordType.AAAA, -1, None),
'CNAME': (RecordType.CNAME, -1, -1),
}
def consume(self) -> None:
self.db.enter_step('parse_dnsmass')
timestamp = 0
header = True
for line in self.buf:
line = line[:-1]
if not line:
header = True
continue
split = line.split(' ')
try:
if header:
timestamp = int(split[1])
header = False
else:
dtype, name_offset, value_offset = \
DnsMassParser.TYPES[split[1]]
self.register(
dtype,
timestamp,
split[0][:name_offset],
split[2][:value_offset],
)
self.db.enter_step('parse_dnsmass')
except KeyError:
continue
PARSERS = {
'rapid7': Rapid7Parser,
'dnsmass': DnsMassParser,
}
if __name__ == '__main__':
# Parsing arguments
log = logging.getLogger('feed_dns')
args_parser = argparse.ArgumentParser(
description="TODO")
args_parser.add_argument(
'parser',
choices=PARSERS.keys(),
help="TODO")
args_parser.add_argument(
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
help="TODO")
args = args_parser.parse_args()
parser = PARSERS[args.parser](args.input)
try:
parser.consume()
except KeyboardInterrupt:
pass
parser.end()


@ -51,8 +51,7 @@ class Writer(multiprocessing.Process):
try: try:
for source in select(self.db, value): for source in select(self.db, value):
# write(self.db, name, updated, source=source) write(self.db, name, updated, source=source)
write(self.db, name, updated)
except ValueError: except ValueError:
self.log.exception("Cannot execute: %s", record) self.log.exception("Cannot execute: %s", record)
@ -182,10 +181,10 @@ if __name__ == '__main__':
'-j', '--workers', type=int, default=4, '-j', '--workers', type=int, default=4,
help="TODO") help="TODO")
args_parser.add_argument( args_parser.add_argument(
'-b', '--block-size', type=int, default=100, '-b', '--block-size', type=int, default=1024,
help="TODO") help="TODO")
args_parser.add_argument( args_parser.add_argument(
'-q', '--queue-size', type=int, default=10, '-q', '--queue-size', type=int, default=128,
help="TODO") help="TODO")
args = args_parser.parse_args() args = args_parser.parse_args()

feed_rules.py

@ -32,10 +32,16 @@ if __name__ == '__main__':
fun = FUNCTION_MAP[args.type] fun = FUNCTION_MAP[args.type]
source: database.RulePath
if args.first_party:
source = database.RuleFirstPath()
else:
source = database.RuleMultiPath()
for rule in args.input: for rule in args.input:
fun(DB, fun(DB,
rule.strip(), rule.strip(),
# is_first_party=args.first_party, source=source,
updated=int(time.time()), updated=int(time.time()),
) )


@ -6,11 +6,11 @@ function log() {
log "Importing rules…" log "Importing rules…"
BEFORE="$(date +%s)" BEFORE="$(date +%s)"
# cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
# cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
# cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone
# cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
# cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party