Compare commits
8 commits
a0e68f0848
...
e882e09b37
Author | SHA1 | Date | |
---|---|---|---|
Geoffrey Frogeye | e882e09b37 | ||
Geoffrey Frogeye | d65107f849 | ||
Geoffrey Frogeye | ea0855bd00 | ||
Geoffrey Frogeye | 7851b038f5 | ||
Geoffrey Frogeye | 8f6e01c857 | ||
Geoffrey Frogeye | c3bf102289 | ||
Geoffrey Frogeye | 03a4042238 | ||
Geoffrey Frogeye | 3197fa1663 |
|
@ -26,6 +26,8 @@ That's where this script comes in, to generate a list of such subdomains.
|
||||||
|
|
||||||
## How does this script work
|
## How does this script work
|
||||||
|
|
||||||
|
> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.
|
||||||
|
|
||||||
It takes as input a list of websites with trackers included.
|
It takes as input a list of websites with trackers included.
|
||||||
So far, this list is manually-generated from the list of clients of such first-party trackers
|
So far, this list is manually-generated from the list of clients of such first-party trackers
|
||||||
(later we should use a general list of websites to be more exhaustive).
|
(later we should use a general list of websites to be more exhaustive).
|
||||||
|
@ -38,6 +40,8 @@ It finally outputs the matching ones.
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
|
> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.
|
||||||
|
|
||||||
Just to build the list, you can find an already-built list in the releases.
|
Just to build the list, you can find an already-built list in the releases.
|
||||||
|
|
||||||
- Bash
|
- Bash
|
||||||
|
@ -54,6 +58,8 @@ Just to build the list, you can find an already-built list in the releases.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.
|
||||||
|
|
||||||
This is only if you want to build the list yourself.
|
This is only if you want to build the list yourself.
|
||||||
If you just want to use the list, the latest build is available here: <https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt>
|
If you just want to use the list, the latest build is available here: <https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt>
|
||||||
It was built using additional sources not included in this repository for privacy reasons.
|
It was built using additional sources not included in this repository for privacy reasons.
|
||||||
|
|
411
database.py
411
database.py
|
@ -26,57 +26,76 @@ class Path():
|
||||||
|
|
||||||
|
|
||||||
class RulePath(Path):
|
class RulePath(Path):
|
||||||
pass
|
def __str__(self) -> str:
|
||||||
|
return '(rule)'
|
||||||
|
|
||||||
|
|
||||||
|
class RuleFirstPath(RulePath):
|
||||||
|
def __str__(self) -> str:
|
||||||
|
return '(first-party rule)'
|
||||||
|
|
||||||
|
|
||||||
|
class RuleMultiPath(RulePath):
|
||||||
|
def __str__(self) -> str:
|
||||||
|
return '(multi-party rule)'
|
||||||
|
|
||||||
|
|
||||||
class DomainPath(Path):
|
class DomainPath(Path):
|
||||||
def __init__(self, path: typing.List[str]):
|
def __init__(self, parts: typing.List[str]):
|
||||||
self.path = path
|
self.parts = parts
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
return '?.' + Database.unpack_domain(self)
|
||||||
|
|
||||||
|
|
||||||
class HostnamePath(DomainPath):
|
class HostnamePath(DomainPath):
|
||||||
pass
|
def __str__(self) -> str:
|
||||||
|
return Database.unpack_domain(self)
|
||||||
|
|
||||||
|
|
||||||
class ZonePath(DomainPath):
|
class ZonePath(DomainPath):
|
||||||
pass
|
def __str__(self) -> str:
|
||||||
|
return '*.' + Database.unpack_domain(self)
|
||||||
|
|
||||||
|
|
||||||
class AsnPath(Path):
|
class AsnPath(Path):
|
||||||
def __init__(self, asn: Asn):
|
def __init__(self, asn: Asn):
|
||||||
self.asn = asn
|
self.asn = asn
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
return Database.unpack_asn(self)
|
||||||
|
|
||||||
|
|
||||||
class Ip4Path(Path):
|
class Ip4Path(Path):
|
||||||
def __init__(self, value: int, prefixlen: int):
|
def __init__(self, value: int, prefixlen: int):
|
||||||
self.value = value
|
self.value = value
|
||||||
self.prefixlen = prefixlen
|
self.prefixlen = prefixlen
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
return Database.unpack_ip4network(self)
|
||||||
|
|
||||||
|
|
||||||
class Match():
|
class Match():
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
|
self.source: typing.Optional[Path] = None
|
||||||
self.updated: int = 0
|
self.updated: int = 0
|
||||||
|
self.dupplicate: bool = False
|
||||||
|
|
||||||
|
# Cache
|
||||||
self.level: int = 0
|
self.level: int = 0
|
||||||
self.source: Path = RulePath()
|
self.first_party: bool = False
|
||||||
# FP dupplicate args
|
self.references: int = 0
|
||||||
|
|
||||||
def set(self,
|
def active(self, first_party: bool = None) -> bool:
|
||||||
updated: int,
|
if self.updated == 0 or (first_party and not self.first_party):
|
||||||
level: int,
|
return False
|
||||||
source: Path,
|
return True
|
||||||
) -> None:
|
|
||||||
if updated > self.updated or level > self.level:
|
|
||||||
self.updated = updated
|
|
||||||
self.level = level
|
|
||||||
self.source = source
|
|
||||||
# FP dupplicate function
|
|
||||||
|
|
||||||
def active(self) -> bool:
|
|
||||||
return self.updated > 0
|
|
||||||
|
|
||||||
|
|
||||||
class AsnNode(Match):
|
class AsnNode(Match):
|
||||||
pass
|
def __init__(self) -> None:
|
||||||
|
Match.__init__(self)
|
||||||
|
self.name = ''
|
||||||
|
|
||||||
|
|
||||||
class DomainTreeNode():
|
class DomainTreeNode():
|
||||||
|
@ -86,16 +105,16 @@ class DomainTreeNode():
|
||||||
self.match_hostname = Match()
|
self.match_hostname = Match()
|
||||||
|
|
||||||
|
|
||||||
class IpTreeNode():
|
class IpTreeNode(Match):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.children: typing.List[typing.Optional[IpTreeNode]] = [None, None]
|
Match.__init__(self)
|
||||||
self.match = Match()
|
self.zero: typing.Optional[IpTreeNode] = None
|
||||||
|
self.one: typing.Optional[IpTreeNode] = None
|
||||||
|
|
||||||
|
|
||||||
Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode]
|
Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode]
|
||||||
NodeCallable = typing.Callable[[Path,
|
MatchCallable = typing.Callable[[Path,
|
||||||
Node,
|
Match],
|
||||||
typing.Optional[typing.Any]],
|
|
||||||
typing.Any]
|
typing.Any]
|
||||||
|
|
||||||
|
|
||||||
|
@ -108,7 +127,6 @@ class Profiler():
|
||||||
self.step_dict: typing.Dict[str, int] = dict()
|
self.step_dict: typing.Dict[str, int] = dict()
|
||||||
|
|
||||||
def enter_step(self, name: str) -> None:
|
def enter_step(self, name: str) -> None:
|
||||||
return
|
|
||||||
now = time.perf_counter()
|
now = time.perf_counter()
|
||||||
try:
|
try:
|
||||||
self.time_dict[self.time_step] += now - self.time_last
|
self.time_dict[self.time_step] += now - self.time_last
|
||||||
|
@ -131,13 +149,21 @@ class Profiler():
|
||||||
|
|
||||||
|
|
||||||
class Database(Profiler):
|
class Database(Profiler):
|
||||||
VERSION = 10
|
VERSION = 18
|
||||||
PATH = "blocking.p"
|
PATH = "blocking.p"
|
||||||
|
|
||||||
def initialize(self) -> None:
|
def initialize(self) -> None:
|
||||||
self.log.warning(
|
self.log.warning(
|
||||||
"Creating database version: %d ",
|
"Creating database version: %d ",
|
||||||
Database.VERSION)
|
Database.VERSION)
|
||||||
|
# Dummy match objects that everything refer to
|
||||||
|
self.rules: typing.List[Match] = list()
|
||||||
|
for first_party in (False, True):
|
||||||
|
m = Match()
|
||||||
|
m.updated = 1
|
||||||
|
m.level = 0
|
||||||
|
m.first_party = first_party
|
||||||
|
self.rules.append(m)
|
||||||
self.domtree = DomainTreeNode()
|
self.domtree = DomainTreeNode()
|
||||||
self.asns: typing.Dict[Asn, AsnNode] = dict()
|
self.asns: typing.Dict[Asn, AsnNode] = dict()
|
||||||
self.ip4tree = IpTreeNode()
|
self.ip4tree = IpTreeNode()
|
||||||
|
@ -148,7 +174,7 @@ class Database(Profiler):
|
||||||
with open(self.PATH, 'rb') as db_fdsec:
|
with open(self.PATH, 'rb') as db_fdsec:
|
||||||
version, data = pickle.load(db_fdsec)
|
version, data = pickle.load(db_fdsec)
|
||||||
if version == Database.VERSION:
|
if version == Database.VERSION:
|
||||||
self.domtree, self.asns, self.ip4tree = data
|
self.rules, self.domtree, self.asns, self.ip4tree = data
|
||||||
return
|
return
|
||||||
self.log.warning(
|
self.log.warning(
|
||||||
"Outdated database version found: %d, "
|
"Outdated database version found: %d, "
|
||||||
|
@ -165,7 +191,7 @@ class Database(Profiler):
|
||||||
def save(self) -> None:
|
def save(self) -> None:
|
||||||
self.enter_step('save')
|
self.enter_step('save')
|
||||||
with open(self.PATH, 'wb') as db_fdsec:
|
with open(self.PATH, 'wb') as db_fdsec:
|
||||||
data = self.domtree, self.asns, self.ip4tree
|
data = self.rules, self.domtree, self.asns, self.ip4tree
|
||||||
pickle.dump((self.VERSION, data), db_fdsec)
|
pickle.dump((self.VERSION, data), db_fdsec)
|
||||||
self.profile()
|
self.profile()
|
||||||
|
|
||||||
|
@ -180,7 +206,7 @@ class Database(Profiler):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def unpack_domain(domain: DomainPath) -> str:
|
def unpack_domain(domain: DomainPath) -> str:
|
||||||
return '.'.join(domain.path[::-1])
|
return '.'.join(domain.parts[::-1])
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def pack_asn(asn: str) -> AsnPath:
|
def pack_asn(asn: str) -> AsnPath:
|
||||||
|
@ -229,94 +255,227 @@ class Database(Profiler):
|
||||||
addr >>= 8
|
addr >>= 8
|
||||||
return '.'.join(map(str, octets)) + '/' + str(network.prefixlen)
|
return '.'.join(map(str, octets)) + '/' + str(network.prefixlen)
|
||||||
|
|
||||||
|
def get_match(self, path: Path) -> Match:
|
||||||
|
if isinstance(path, RuleMultiPath):
|
||||||
|
return self.rules[0]
|
||||||
|
elif isinstance(path, RuleFirstPath):
|
||||||
|
return self.rules[1]
|
||||||
|
elif isinstance(path, AsnPath):
|
||||||
|
return self.asns[path.asn]
|
||||||
|
elif isinstance(path, DomainPath):
|
||||||
|
dicd = self.domtree
|
||||||
|
for part in path.parts:
|
||||||
|
dicd = dicd.children[part]
|
||||||
|
if isinstance(path, HostnamePath):
|
||||||
|
return dicd.match_hostname
|
||||||
|
elif isinstance(path, ZonePath):
|
||||||
|
return dicd.match_zone
|
||||||
|
else:
|
||||||
|
raise ValueError
|
||||||
|
elif isinstance(path, Ip4Path):
|
||||||
|
dici = self.ip4tree
|
||||||
|
for i in range(31, 31-path.prefixlen, -1):
|
||||||
|
bit = (path.value >> i) & 0b1
|
||||||
|
dici_next = dici.one if bit else dici.zero
|
||||||
|
if not dici_next:
|
||||||
|
raise IndexError
|
||||||
|
dici = dici_next
|
||||||
|
return dici
|
||||||
|
else:
|
||||||
|
raise ValueError
|
||||||
|
|
||||||
|
def exec_each_asn(self,
|
||||||
|
callback: MatchCallable,
|
||||||
|
) -> typing.Any:
|
||||||
|
for asn in self.asns:
|
||||||
|
match = self.asns[asn]
|
||||||
|
if match.active():
|
||||||
|
c = callback(
|
||||||
|
AsnPath(asn),
|
||||||
|
match,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
yield from c
|
||||||
|
except TypeError: # not iterable
|
||||||
|
pass
|
||||||
|
|
||||||
def exec_each_domain(self,
|
def exec_each_domain(self,
|
||||||
callback: NodeCallable,
|
callback: MatchCallable,
|
||||||
arg: typing.Any = None,
|
|
||||||
_dic: DomainTreeNode = None,
|
_dic: DomainTreeNode = None,
|
||||||
_par: DomainPath = None,
|
_par: DomainPath = None,
|
||||||
) -> typing.Any:
|
) -> typing.Any:
|
||||||
_dic = _dic or self.domtree
|
_dic = _dic or self.domtree
|
||||||
_par = _par or DomainPath([])
|
_par = _par or DomainPath([])
|
||||||
yield from callback(_par, _dic, arg)
|
if _dic.match_hostname.active():
|
||||||
|
c = callback(
|
||||||
|
HostnamePath(_par.parts),
|
||||||
|
_dic.match_hostname,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
yield from c
|
||||||
|
except TypeError: # not iterable
|
||||||
|
pass
|
||||||
|
if _dic.match_zone.active():
|
||||||
|
c = callback(
|
||||||
|
ZonePath(_par.parts),
|
||||||
|
_dic.match_zone,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
yield from c
|
||||||
|
except TypeError: # not iterable
|
||||||
|
pass
|
||||||
for part in _dic.children:
|
for part in _dic.children:
|
||||||
dic = _dic.children[part]
|
dic = _dic.children[part]
|
||||||
yield from self.exec_each_domain(
|
yield from self.exec_each_domain(
|
||||||
callback,
|
callback,
|
||||||
arg,
|
|
||||||
_dic=dic,
|
_dic=dic,
|
||||||
_par=DomainPath(_par.path + [part])
|
_par=DomainPath(_par.parts + [part])
|
||||||
)
|
)
|
||||||
|
|
||||||
def exec_each_ip4(self,
|
def exec_each_ip4(self,
|
||||||
callback: NodeCallable,
|
callback: MatchCallable,
|
||||||
arg: typing.Any = None,
|
|
||||||
_dic: IpTreeNode = None,
|
_dic: IpTreeNode = None,
|
||||||
_par: Ip4Path = None,
|
_par: Ip4Path = None,
|
||||||
) -> typing.Any:
|
) -> typing.Any:
|
||||||
_dic = _dic or self.ip4tree
|
_dic = _dic or self.ip4tree
|
||||||
_par = _par or Ip4Path(0, 0)
|
_par = _par or Ip4Path(0, 0)
|
||||||
callback(_par, _dic, arg)
|
if _dic.active():
|
||||||
|
c = callback(
|
||||||
|
_par,
|
||||||
|
_dic,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
yield from c
|
||||||
|
except TypeError: # not iterable
|
||||||
|
pass
|
||||||
|
|
||||||
# 0
|
# 0
|
||||||
dic = _dic.children[0]
|
pref = _par.prefixlen + 1
|
||||||
|
dic = _dic.zero
|
||||||
if dic:
|
if dic:
|
||||||
addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-_par.prefixlen)))
|
addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-pref)))
|
||||||
assert addr0 == _par.value
|
assert addr0 == _par.value
|
||||||
yield from self.exec_each_ip4(
|
yield from self.exec_each_ip4(
|
||||||
callback,
|
callback,
|
||||||
arg,
|
|
||||||
_dic=dic,
|
_dic=dic,
|
||||||
_par=Ip4Path(addr0, _par.prefixlen+1)
|
_par=Ip4Path(addr0, pref)
|
||||||
)
|
)
|
||||||
# 1
|
# 1
|
||||||
dic = _dic.children[1]
|
dic = _dic.one
|
||||||
if dic:
|
if dic:
|
||||||
addr1 = _par.value | (1 << (32-_par.prefixlen))
|
addr1 = _par.value | (1 << (32-pref))
|
||||||
yield from self.exec_each_ip4(
|
yield from self.exec_each_ip4(
|
||||||
callback,
|
callback,
|
||||||
arg,
|
|
||||||
_dic=dic,
|
_dic=dic,
|
||||||
_par=Ip4Path(addr1, _par.prefixlen+1)
|
_par=Ip4Path(addr1, pref)
|
||||||
)
|
)
|
||||||
|
|
||||||
def exec_each(self,
|
def exec_each(self,
|
||||||
callback: NodeCallable,
|
callback: MatchCallable,
|
||||||
arg: typing.Any = None,
|
|
||||||
) -> typing.Any:
|
) -> typing.Any:
|
||||||
yield from self.exec_each_domain(callback)
|
yield from self.exec_each_domain(callback)
|
||||||
yield from self.exec_each_ip4(callback)
|
yield from self.exec_each_ip4(callback)
|
||||||
|
yield from self.exec_each_asn(callback)
|
||||||
|
|
||||||
def update_references(self) -> None:
|
def update_references(self) -> None:
|
||||||
raise NotImplementedError
|
# Should be correctly calculated normally,
|
||||||
|
# keeping this just in case
|
||||||
|
def reset_references_cb(path: Path,
|
||||||
|
match: Match
|
||||||
|
) -> None:
|
||||||
|
match.references = 0
|
||||||
|
for _ in self.exec_each(reset_references_cb):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def increment_references_cb(path: Path,
|
||||||
|
match: Match
|
||||||
|
) -> None:
|
||||||
|
if match.source:
|
||||||
|
source = self.get_match(match.source)
|
||||||
|
source.references += 1
|
||||||
|
for _ in self.exec_each(increment_references_cb):
|
||||||
|
pass
|
||||||
|
|
||||||
def prune(self, before: int, base_only: bool = False) -> None:
|
def prune(self, before: int, base_only: bool = False) -> None:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def explain(self, entry: int) -> str:
|
def explain(self, path: Path) -> str:
|
||||||
raise NotImplementedError
|
match = self.get_match(path)
|
||||||
|
if isinstance(match, AsnNode):
|
||||||
|
string = f'{path} ({match.name}) #{match.references}'
|
||||||
|
else:
|
||||||
|
string = f'{path} #{match.references}'
|
||||||
|
if match.source:
|
||||||
|
string += f' ← {self.explain(match.source)}'
|
||||||
|
return string
|
||||||
|
|
||||||
def export(self,
|
def export(self,
|
||||||
first_party_only: bool = False,
|
first_party_only: bool = False,
|
||||||
end_chain_only: bool = False,
|
end_chain_only: bool = False,
|
||||||
|
no_dupplicates: bool = False,
|
||||||
explain: bool = False,
|
explain: bool = False,
|
||||||
) -> typing.Iterable[str]:
|
) -> typing.Iterable[str]:
|
||||||
if first_party_only or end_chain_only or explain:
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def export_cb(path: Path, node: Node, _: typing.Any
|
def export_cb(path: Path, match: Match
|
||||||
) -> typing.Iterable[str]:
|
) -> typing.Iterable[str]:
|
||||||
assert isinstance(path, DomainPath)
|
assert isinstance(path, DomainPath)
|
||||||
assert isinstance(node, DomainTreeNode)
|
if not isinstance(path, HostnamePath):
|
||||||
if node.match_hostname:
|
return
|
||||||
a = self.unpack_domain(path)
|
if first_party_only and not match.first_party:
|
||||||
yield a
|
return
|
||||||
|
if end_chain_only and match.references > 0:
|
||||||
|
return
|
||||||
|
if no_dupplicates and match.dupplicate:
|
||||||
|
return
|
||||||
|
if explain:
|
||||||
|
yield self.explain(path)
|
||||||
|
else:
|
||||||
|
yield self.unpack_domain(path)
|
||||||
|
|
||||||
yield from self.exec_each_domain(export_cb, None)
|
yield from self.exec_each_domain(export_cb)
|
||||||
|
|
||||||
def count_rules(self,
|
def list_rules(self,
|
||||||
first_party_only: bool = False,
|
first_party_only: bool = False,
|
||||||
|
) -> typing.Iterable[str]:
|
||||||
|
|
||||||
|
def list_rules_cb(path: Path, match: Match
|
||||||
|
) -> typing.Iterable[str]:
|
||||||
|
if first_party_only and not match.first_party:
|
||||||
|
return
|
||||||
|
if isinstance(path, ZonePath) \
|
||||||
|
or (isinstance(path, Ip4Path) and path.prefixlen < 32):
|
||||||
|
# if match.level == 1:
|
||||||
|
# It should be the latter condition but it is more
|
||||||
|
# useful when using the former
|
||||||
|
yield self.explain(path)
|
||||||
|
|
||||||
|
yield from self.exec_each(list_rules_cb)
|
||||||
|
|
||||||
|
def count_records(self,
|
||||||
|
first_party_only: bool = False,
|
||||||
|
rules_only: bool = False,
|
||||||
|
no_dupplicates: bool = False,
|
||||||
) -> str:
|
) -> str:
|
||||||
raise NotImplementedError
|
memo: typing.Dict[str, int] = dict()
|
||||||
|
|
||||||
|
def count_records_cb(path: Path, match: Match) -> None:
|
||||||
|
if first_party_only and not match.first_party:
|
||||||
|
return
|
||||||
|
if rules_only and match.level > 1:
|
||||||
|
return
|
||||||
|
if no_dupplicates and match.dupplicate:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
memo[path.__class__.__name__] += 1
|
||||||
|
except KeyError:
|
||||||
|
memo[path.__class__.__name__] = 1
|
||||||
|
|
||||||
|
for _ in self.exec_each(count_records_cb):
|
||||||
|
pass
|
||||||
|
split: typing.List[str] = list()
|
||||||
|
for key, value in sorted(memo.items(), key=lambda s: s[0]):
|
||||||
|
split.append(f'{key[:-4]}: {value}')
|
||||||
|
return ', '.join(split)
|
||||||
|
|
||||||
def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:
|
def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:
|
||||||
self.enter_step('get_domain_pack')
|
self.enter_step('get_domain_pack')
|
||||||
|
@ -324,10 +483,10 @@ class Database(Profiler):
|
||||||
self.enter_step('get_domain_brws')
|
self.enter_step('get_domain_brws')
|
||||||
dic = self.domtree
|
dic = self.domtree
|
||||||
depth = 0
|
depth = 0
|
||||||
for part in domain.path:
|
for part in domain.parts:
|
||||||
if dic.match_zone.active():
|
if dic.match_zone.active():
|
||||||
self.enter_step('get_domain_yield')
|
self.enter_step('get_domain_yield')
|
||||||
yield ZonePath(domain.path[:depth])
|
yield ZonePath(domain.parts[:depth])
|
||||||
self.enter_step('get_domain_brws')
|
self.enter_step('get_domain_brws')
|
||||||
if part not in dic.children:
|
if part not in dic.children:
|
||||||
return
|
return
|
||||||
|
@ -335,61 +494,85 @@ class Database(Profiler):
|
||||||
depth += 1
|
depth += 1
|
||||||
if dic.match_zone.active():
|
if dic.match_zone.active():
|
||||||
self.enter_step('get_domain_yield')
|
self.enter_step('get_domain_yield')
|
||||||
yield ZonePath(domain.path)
|
yield ZonePath(domain.parts)
|
||||||
if dic.match_hostname.active():
|
if dic.match_hostname.active():
|
||||||
self.enter_step('get_domain_yield')
|
self.enter_step('get_domain_yield')
|
||||||
yield HostnamePath(domain.path)
|
yield HostnamePath(domain.parts)
|
||||||
|
|
||||||
def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]:
|
def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]:
|
||||||
self.enter_step('get_ip4_pack')
|
self.enter_step('get_ip4_pack')
|
||||||
ip4 = self.pack_ip4address(ip4_str)
|
ip4 = self.pack_ip4address(ip4_str)
|
||||||
self.enter_step('get_ip4_brws')
|
self.enter_step('get_ip4_brws')
|
||||||
dic = self.ip4tree
|
dic = self.ip4tree
|
||||||
for i in reversed(range(ip4.prefixlen)):
|
for i in range(31, 31-ip4.prefixlen, -1):
|
||||||
part = (ip4.value >> i) & 0b1
|
bit = (ip4.value >> i) & 0b1
|
||||||
if dic.match.active():
|
if dic.active():
|
||||||
self.enter_step('get_ip4_yield')
|
self.enter_step('get_ip4_yield')
|
||||||
yield Ip4Path(ip4.value, 32-i)
|
yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i)
|
||||||
self.enter_step('get_ip4_brws')
|
self.enter_step('get_ip4_brws')
|
||||||
next_dic = dic.children[part]
|
next_dic = dic.one if bit else dic.zero
|
||||||
if next_dic is None:
|
if next_dic is None:
|
||||||
return
|
return
|
||||||
dic = next_dic
|
dic = next_dic
|
||||||
if dic.match.active():
|
if dic.active():
|
||||||
self.enter_step('get_ip4_yield')
|
self.enter_step('get_ip4_yield')
|
||||||
yield ip4
|
yield ip4
|
||||||
|
|
||||||
def list_asn(self) -> typing.Iterable[AsnPath]:
|
def _set_match(self,
|
||||||
for asn in self.asns:
|
match: Match,
|
||||||
yield AsnPath(asn)
|
updated: int,
|
||||||
|
source: Path,
|
||||||
|
source_match: Match = None,
|
||||||
|
dupplicate: bool = False,
|
||||||
|
) -> None:
|
||||||
|
# source_match is in parameters because most of the time
|
||||||
|
# its parent function needs it too,
|
||||||
|
# so it can pass it to save a traversal
|
||||||
|
source_match = source_match or self.get_match(source)
|
||||||
|
new_level = source_match.level + 1
|
||||||
|
if updated > match.updated or new_level < match.level \
|
||||||
|
or source_match.first_party > match.first_party:
|
||||||
|
# NOTE FP and level of matches referencing this one
|
||||||
|
# won't be updated until run or prune
|
||||||
|
if match.source:
|
||||||
|
old_source = self.get_match(match.source)
|
||||||
|
old_source.references -= 1
|
||||||
|
match.updated = updated
|
||||||
|
match.level = new_level
|
||||||
|
match.first_party = source_match.first_party
|
||||||
|
match.source = source
|
||||||
|
source_match.references += 1
|
||||||
|
match.dupplicate = dupplicate
|
||||||
|
|
||||||
def _set_domain(self,
|
def _set_domain(self,
|
||||||
hostname: bool,
|
hostname: bool,
|
||||||
domain_str: str,
|
domain_str: str,
|
||||||
updated: int,
|
updated: int,
|
||||||
is_first_party: bool = None,
|
source: Path) -> None:
|
||||||
source: Path = None) -> None:
|
|
||||||
self.enter_step('set_domain_pack')
|
self.enter_step('set_domain_pack')
|
||||||
if is_first_party:
|
|
||||||
raise NotImplementedError
|
|
||||||
domain = self.pack_domain(domain_str)
|
domain = self.pack_domain(domain_str)
|
||||||
|
self.enter_step('set_domain_fp')
|
||||||
|
source_match = self.get_match(source)
|
||||||
|
is_first_party = source_match.first_party
|
||||||
self.enter_step('set_domain_brws')
|
self.enter_step('set_domain_brws')
|
||||||
dic = self.domtree
|
dic = self.domtree
|
||||||
for part in domain.path:
|
dupplicate = False
|
||||||
if dic.match_zone.active():
|
for part in domain.parts:
|
||||||
# Refuse to add domain whose zone is already matching
|
|
||||||
return
|
|
||||||
if part not in dic.children:
|
if part not in dic.children:
|
||||||
dic.children[part] = DomainTreeNode()
|
dic.children[part] = DomainTreeNode()
|
||||||
dic = dic.children[part]
|
dic = dic.children[part]
|
||||||
|
if dic.match_zone.active(is_first_party):
|
||||||
|
dupplicate = True
|
||||||
if hostname:
|
if hostname:
|
||||||
match = dic.match_hostname
|
match = dic.match_hostname
|
||||||
else:
|
else:
|
||||||
match = dic.match_zone
|
match = dic.match_zone
|
||||||
match.set(
|
self._set_match(
|
||||||
|
match,
|
||||||
updated,
|
updated,
|
||||||
0, # TODO Level
|
source,
|
||||||
source or RulePath(),
|
source_match=source_match,
|
||||||
|
dupplicate=dupplicate,
|
||||||
)
|
)
|
||||||
|
|
||||||
def set_hostname(self,
|
def set_hostname(self,
|
||||||
|
@ -405,42 +588,48 @@ class Database(Profiler):
|
||||||
def set_asn(self,
|
def set_asn(self,
|
||||||
asn_str: str,
|
asn_str: str,
|
||||||
updated: int,
|
updated: int,
|
||||||
is_first_party: bool = None,
|
source: Path) -> None:
|
||||||
source: Path = None) -> None:
|
|
||||||
self.enter_step('set_asn')
|
self.enter_step('set_asn')
|
||||||
if is_first_party:
|
|
||||||
raise NotImplementedError
|
|
||||||
path = self.pack_asn(asn_str)
|
path = self.pack_asn(asn_str)
|
||||||
|
if path.asn in self.asns:
|
||||||
|
match = self.asns[path.asn]
|
||||||
|
else:
|
||||||
match = AsnNode()
|
match = AsnNode()
|
||||||
match.set(
|
|
||||||
updated,
|
|
||||||
0,
|
|
||||||
source or RulePath()
|
|
||||||
)
|
|
||||||
self.asns[path.asn] = match
|
self.asns[path.asn] = match
|
||||||
|
self._set_match(
|
||||||
|
match,
|
||||||
|
updated,
|
||||||
|
source,
|
||||||
|
)
|
||||||
|
|
||||||
def _set_ip4(self,
|
def _set_ip4(self,
|
||||||
ip4: Ip4Path,
|
ip4: Ip4Path,
|
||||||
updated: int,
|
updated: int,
|
||||||
is_first_party: bool = None,
|
source: Path) -> None:
|
||||||
source: Path = None) -> None:
|
self.enter_step('set_ip4_fp')
|
||||||
if is_first_party:
|
source_match = self.get_match(source)
|
||||||
raise NotImplementedError
|
is_first_party = source_match.first_party
|
||||||
|
self.enter_step('set_ip4_brws')
|
||||||
dic = self.ip4tree
|
dic = self.ip4tree
|
||||||
for i in reversed(range(ip4.prefixlen)):
|
dupplicate = False
|
||||||
part = (ip4.value >> i) & 0b1
|
for i in range(31, 31-ip4.prefixlen, -1):
|
||||||
if dic.match.active():
|
bit = (ip4.value >> i) & 0b1
|
||||||
# Refuse to add ip4* whose network is already matching
|
next_dic = dic.one if bit else dic.zero
|
||||||
return
|
|
||||||
next_dic = dic.children[part]
|
|
||||||
if next_dic is None:
|
if next_dic is None:
|
||||||
next_dic = IpTreeNode()
|
next_dic = IpTreeNode()
|
||||||
dic.children[part] = next_dic
|
if bit:
|
||||||
|
dic.one = next_dic
|
||||||
|
else:
|
||||||
|
dic.zero = next_dic
|
||||||
dic = next_dic
|
dic = next_dic
|
||||||
dic.match.set(
|
if dic.active(is_first_party):
|
||||||
|
dupplicate = True
|
||||||
|
self._set_match(
|
||||||
|
dic,
|
||||||
updated,
|
updated,
|
||||||
0, # TODO Level
|
source,
|
||||||
source or RulePath(),
|
source_match=source_match,
|
||||||
|
dupplicate=dupplicate,
|
||||||
)
|
)
|
||||||
|
|
||||||
def set_ip4address(self,
|
def set_ip4address(self,
|
||||||
|
@ -449,7 +638,6 @@ class Database(Profiler):
|
||||||
) -> None:
|
) -> None:
|
||||||
self.enter_step('set_ip4add_pack')
|
self.enter_step('set_ip4add_pack')
|
||||||
ip4 = self.pack_ip4address(ip4address_str)
|
ip4 = self.pack_ip4address(ip4address_str)
|
||||||
self.enter_step('set_ip4add_brws')
|
|
||||||
self._set_ip4(ip4, *args, **kwargs)
|
self._set_ip4(ip4, *args, **kwargs)
|
||||||
|
|
||||||
def set_ip4network(self,
|
def set_ip4network(self,
|
||||||
|
@ -458,5 +646,4 @@ class Database(Profiler):
|
||||||
) -> None:
|
) -> None:
|
||||||
self.enter_step('set_ip4net_pack')
|
self.enter_step('set_ip4net_pack')
|
||||||
ip4 = self.pack_ip4network(ip4network_str)
|
ip4 = self.pack_ip4network(ip4network_str)
|
||||||
self.enter_step('set_ip4net_brws')
|
|
||||||
self._set_ip4(ip4, *args, **kwargs)
|
self._set_ip4(ip4, *args, **kwargs)
|
||||||
|
|
44
db.py
Executable file
44
db.py
Executable file
|
@ -0,0 +1,44 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import database
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
# Parsing arguments
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Database operations")
|
||||||
|
parser.add_argument(
|
||||||
|
'-i', '--initialize', action='store_true',
|
||||||
|
help="Reconstruct the whole database")
|
||||||
|
parser.add_argument(
|
||||||
|
'-p', '--prune', action='store_true',
|
||||||
|
help="Remove old entries from database")
|
||||||
|
parser.add_argument(
|
||||||
|
'-b', '--prune-base', action='store_true',
|
||||||
|
help="TODO")
|
||||||
|
parser.add_argument(
|
||||||
|
'-s', '--prune-before', type=int,
|
||||||
|
default=(int(time.time()) - 60*60*24*31*6),
|
||||||
|
help="TODO")
|
||||||
|
parser.add_argument(
|
||||||
|
'-r', '--references', action='store_true',
|
||||||
|
help="Update the reference count")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if not args.initialize:
|
||||||
|
DB = database.Database()
|
||||||
|
else:
|
||||||
|
if os.path.isfile(database.Database.PATH):
|
||||||
|
os.unlink(database.Database.PATH)
|
||||||
|
DB = database.Database()
|
||||||
|
|
||||||
|
DB.enter_step('main')
|
||||||
|
if args.prune:
|
||||||
|
DB.prune(before=args.prune_before, base_only=args.prune_base)
|
||||||
|
if args.references:
|
||||||
|
DB.update_references()
|
||||||
|
|
||||||
|
DB.save()
|
19
export.py
19
export.py
|
@ -25,6 +25,9 @@ if __name__ == '__main__':
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-r', '--rules', action='store_true',
|
'-r', '--rules', action='store_true',
|
||||||
help="TODO")
|
help="TODO")
|
||||||
|
parser.add_argument(
|
||||||
|
'-d', '--no-dupplicates', action='store_true',
|
||||||
|
help="TODO")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-c', '--count', action='store_true',
|
'-c', '--count', action='store_true',
|
||||||
help="TODO")
|
help="TODO")
|
||||||
|
@ -32,16 +35,20 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
DB = database.Database()
|
DB = database.Database()
|
||||||
|
|
||||||
if args.rules:
|
|
||||||
if not args.count:
|
|
||||||
raise NotImplementedError
|
|
||||||
print(DB.count_rules(first_party_only=args.first_party))
|
|
||||||
else:
|
|
||||||
if args.count:
|
if args.count:
|
||||||
raise NotImplementedError
|
print(DB.count_records(
|
||||||
|
first_party_only=args.first_party,
|
||||||
|
rules_only=args.rules,
|
||||||
|
no_dupplicates=args.no_dupplicates,
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
if args.rules:
|
||||||
|
for line in DB.list_rules():
|
||||||
|
print(line)
|
||||||
for domain in DB.export(
|
for domain in DB.export(
|
||||||
first_party_only=args.first_party,
|
first_party_only=args.first_party,
|
||||||
end_chain_only=args.end_chain,
|
end_chain_only=args.end_chain,
|
||||||
|
no_dupplicates=args.no_dupplicates,
|
||||||
explain=args.explain,
|
explain=args.explain,
|
||||||
):
|
):
|
||||||
print(domain, file=args.output)
|
print(domain, file=args.output)
|
||||||
|
|
|
@ -4,21 +4,25 @@ function log() {
|
||||||
echo -e "\033[33m$@\033[0m"
|
echo -e "\033[33m$@\033[0m"
|
||||||
}
|
}
|
||||||
|
|
||||||
log "Pruning old data…"
|
|
||||||
./database.py --prune
|
|
||||||
|
|
||||||
log "Recounting references…"
|
|
||||||
./database.py --references
|
|
||||||
|
|
||||||
log "Exporting lists…"
|
log "Exporting lists…"
|
||||||
./export.py --first-party --output dist/firstparty-trackers.txt
|
./export.py --first-party --output dist/firstparty-trackers.txt
|
||||||
./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt
|
./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt
|
||||||
./export.py --output dist/multiparty-trackers.txt
|
./export.py --output dist/multiparty-trackers.txt
|
||||||
./export.py --end-chain --output dist/multiparty-only-trackers.txt
|
./export.py --end-chain --output --no-dupplicates dist/multiparty-only-trackers.txt
|
||||||
|
|
||||||
log "Generating hosts lists…"
|
log "Generating statistics…"
|
||||||
|
./export.py --count --first-party > temp/count_recs_firstparty.txt
|
||||||
|
./export.py --count > temp/count_recs_multiparty.txt
|
||||||
./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
|
./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
|
||||||
./export.py --rules --count > temp/count_rules_multiparty.txt
|
./export.py --rules --count > temp/count_rules_multiparty.txt
|
||||||
|
|
||||||
|
log "Sorting lists…"
|
||||||
|
sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt
|
||||||
|
sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt
|
||||||
|
sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt
|
||||||
|
sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt
|
||||||
|
|
||||||
|
log "Generating hosts lists…"
|
||||||
function generate_hosts {
|
function generate_hosts {
|
||||||
basename="$1"
|
basename="$1"
|
||||||
description="$2"
|
description="$2"
|
||||||
|
@ -46,13 +50,15 @@ function generate_hosts {
|
||||||
echo "# Generation software: eulaurarien $(git describe --tags)"
|
echo "# Generation software: eulaurarien $(git describe --tags)"
|
||||||
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
|
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
|
||||||
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
|
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
|
||||||
echo "# Number of source DNS records: ~2M + $(wc -l temp/all_resolved.json | cut -d' ' -f1)"
|
echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO
|
||||||
echo "#"
|
echo "#"
|
||||||
echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
|
echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
|
||||||
|
echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)"
|
||||||
echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
|
echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
|
||||||
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
|
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
|
||||||
echo "#"
|
echo "#"
|
||||||
echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
|
echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
|
||||||
|
echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)"
|
||||||
echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
|
echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
|
||||||
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
|
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
|
||||||
echo
|
echo
|
23
feed_asn.py
23
feed_asn.py
|
@ -21,6 +21,15 @@ def get_ranges(asn: str) -> typing.Iterable[str]:
|
||||||
yield pref['prefix']
|
yield pref['prefix']
|
||||||
|
|
||||||
|
|
||||||
|
def get_name(asn: str) -> str:
|
||||||
|
req = requests.get(
|
||||||
|
'https://stat.ripe.net/data/as-overview/data.json',
|
||||||
|
params={'resource': asn}
|
||||||
|
)
|
||||||
|
data = req.json()
|
||||||
|
return data['data']['holder']
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
log = logging.getLogger('feed_asn')
|
log = logging.getLogger('feed_asn')
|
||||||
|
@ -32,8 +41,15 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
DB = database.Database()
|
DB = database.Database()
|
||||||
|
|
||||||
for path in DB.list_asn():
|
def add_ranges(path: database.Path,
|
||||||
|
match: database.Match,
|
||||||
|
) -> None:
|
||||||
|
assert isinstance(path, database.AsnPath)
|
||||||
|
assert isinstance(match, database.AsnNode)
|
||||||
asn_str = database.Database.unpack_asn(path)
|
asn_str = database.Database.unpack_asn(path)
|
||||||
|
DB.enter_step('asn_get_name')
|
||||||
|
name = get_name(asn_str)
|
||||||
|
match.name = name
|
||||||
DB.enter_step('asn_get_ranges')
|
DB.enter_step('asn_get_ranges')
|
||||||
for prefix in get_ranges(asn_str):
|
for prefix in get_ranges(asn_str):
|
||||||
parsed_prefix: IPNetwork = ipaddress.ip_network(prefix)
|
parsed_prefix: IPNetwork = ipaddress.ip_network(prefix)
|
||||||
|
@ -43,10 +59,13 @@ if __name__ == '__main__':
|
||||||
source=path,
|
source=path,
|
||||||
updated=int(time.time())
|
updated=int(time.time())
|
||||||
)
|
)
|
||||||
log.info('Added %s from %s (%s)', prefix, asn_str, path)
|
log.info('Added %s from %s (%s)', prefix, path, name)
|
||||||
elif parsed_prefix.version == 6:
|
elif parsed_prefix.version == 6:
|
||||||
log.warning('Unimplemented prefix version: %s', prefix)
|
log.warning('Unimplemented prefix version: %s', prefix)
|
||||||
else:
|
else:
|
||||||
log.error('Unknown prefix version: %s', prefix)
|
log.error('Unknown prefix version: %s', prefix)
|
||||||
|
|
||||||
|
for _ in DB.exec_each_asn(add_ranges):
|
||||||
|
pass
|
||||||
|
|
||||||
DB.save()
|
DB.save()
|
||||||
|
|
147
feed_dns.old.py
147
feed_dns.old.py
|
@ -1,147 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import database
|
|
||||||
import logging
|
|
||||||
import sys
|
|
||||||
import typing
|
|
||||||
import enum
|
|
||||||
|
|
||||||
RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR')
|
|
||||||
Record = typing.Tuple[RecordType, int, str, str]
|
|
||||||
|
|
||||||
# select, write
|
|
||||||
FUNCTION_MAP: typing.Any = {
|
|
||||||
RecordType.A: (
|
|
||||||
database.Database.get_ip4,
|
|
||||||
database.Database.set_hostname,
|
|
||||||
),
|
|
||||||
RecordType.CNAME: (
|
|
||||||
database.Database.get_domain,
|
|
||||||
database.Database.set_hostname,
|
|
||||||
),
|
|
||||||
RecordType.PTR: (
|
|
||||||
database.Database.get_domain,
|
|
||||||
database.Database.set_ip4address,
|
|
||||||
),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class Parser():
|
|
||||||
def __init__(self, buf: typing.Any) -> None:
|
|
||||||
self.buf = buf
|
|
||||||
self.log = logging.getLogger('parser')
|
|
||||||
self.db = database.Database()
|
|
||||||
|
|
||||||
def end(self) -> None:
|
|
||||||
self.db.save()
|
|
||||||
|
|
||||||
def register(self,
|
|
||||||
rtype: RecordType,
|
|
||||||
updated: int,
|
|
||||||
name: str,
|
|
||||||
value: str
|
|
||||||
) -> None:
|
|
||||||
|
|
||||||
self.db.enter_step('register')
|
|
||||||
select, write = FUNCTION_MAP[rtype]
|
|
||||||
for source in select(self.db, value):
|
|
||||||
# write(self.db, name, updated, source=source)
|
|
||||||
write(self.db, name, updated)
|
|
||||||
|
|
||||||
def consume(self) -> None:
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
|
|
||||||
class Rapid7Parser(Parser):
|
|
||||||
TYPES = {
|
|
||||||
'a': RecordType.A,
|
|
||||||
'aaaa': RecordType.AAAA,
|
|
||||||
'cname': RecordType.CNAME,
|
|
||||||
'ptr': RecordType.PTR,
|
|
||||||
}
|
|
||||||
|
|
||||||
def consume(self) -> None:
|
|
||||||
data = dict()
|
|
||||||
for line in self.buf:
|
|
||||||
self.db.enter_step('parse_rapid7')
|
|
||||||
split = line.split('"')
|
|
||||||
|
|
||||||
for k in range(1, 14, 4):
|
|
||||||
key = split[k]
|
|
||||||
val = split[k+2]
|
|
||||||
data[key] = val
|
|
||||||
|
|
||||||
self.register(
|
|
||||||
Rapid7Parser.TYPES[data['type']],
|
|
||||||
int(data['timestamp']),
|
|
||||||
data['name'],
|
|
||||||
data['value']
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class DnsMassParser(Parser):
|
|
||||||
# dnsmass --output Snrql
|
|
||||||
# --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
|
|
||||||
TYPES = {
|
|
||||||
'A': (RecordType.A, -1, None),
|
|
||||||
'AAAA': (RecordType.AAAA, -1, None),
|
|
||||||
'CNAME': (RecordType.CNAME, -1, -1),
|
|
||||||
}
|
|
||||||
|
|
||||||
def consume(self) -> None:
|
|
||||||
self.db.enter_step('parse_dnsmass')
|
|
||||||
timestamp = 0
|
|
||||||
header = True
|
|
||||||
for line in self.buf:
|
|
||||||
line = line[:-1]
|
|
||||||
if not line:
|
|
||||||
header = True
|
|
||||||
continue
|
|
||||||
|
|
||||||
split = line.split(' ')
|
|
||||||
try:
|
|
||||||
if header:
|
|
||||||
timestamp = int(split[1])
|
|
||||||
header = False
|
|
||||||
else:
|
|
||||||
dtype, name_offset, value_offset = \
|
|
||||||
DnsMassParser.TYPES[split[1]]
|
|
||||||
self.register(
|
|
||||||
dtype,
|
|
||||||
timestamp,
|
|
||||||
split[0][:name_offset],
|
|
||||||
split[2][:value_offset],
|
|
||||||
)
|
|
||||||
self.db.enter_step('parse_dnsmass')
|
|
||||||
except KeyError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
|
||||||
PARSERS = {
|
|
||||||
'rapid7': Rapid7Parser,
|
|
||||||
'dnsmass': DnsMassParser,
|
|
||||||
}
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
|
|
||||||
# Parsing arguments
|
|
||||||
log = logging.getLogger('feed_dns')
|
|
||||||
args_parser = argparse.ArgumentParser(
|
|
||||||
description="TODO")
|
|
||||||
args_parser.add_argument(
|
|
||||||
'parser',
|
|
||||||
choices=PARSERS.keys(),
|
|
||||||
help="TODO")
|
|
||||||
args_parser.add_argument(
|
|
||||||
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
|
|
||||||
help="TODO")
|
|
||||||
args = args_parser.parse_args()
|
|
||||||
|
|
||||||
parser = PARSERS[args.parser](args.input)
|
|
||||||
try:
|
|
||||||
parser.consume()
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
pass
|
|
||||||
parser.end()
|
|
||||||
|
|
|
@ -51,8 +51,7 @@ class Writer(multiprocessing.Process):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for source in select(self.db, value):
|
for source in select(self.db, value):
|
||||||
# write(self.db, name, updated, source=source)
|
write(self.db, name, updated, source=source)
|
||||||
write(self.db, name, updated)
|
|
||||||
except ValueError:
|
except ValueError:
|
||||||
self.log.exception("Cannot execute: %s", record)
|
self.log.exception("Cannot execute: %s", record)
|
||||||
|
|
||||||
|
@ -182,10 +181,10 @@ if __name__ == '__main__':
|
||||||
'-j', '--workers', type=int, default=4,
|
'-j', '--workers', type=int, default=4,
|
||||||
help="TODO")
|
help="TODO")
|
||||||
args_parser.add_argument(
|
args_parser.add_argument(
|
||||||
'-b', '--block-size', type=int, default=100,
|
'-b', '--block-size', type=int, default=1024,
|
||||||
help="TODO")
|
help="TODO")
|
||||||
args_parser.add_argument(
|
args_parser.add_argument(
|
||||||
'-q', '--queue-size', type=int, default=10,
|
'-q', '--queue-size', type=int, default=128,
|
||||||
help="TODO")
|
help="TODO")
|
||||||
args = args_parser.parse_args()
|
args = args_parser.parse_args()
|
||||||
|
|
||||||
|
|
|
@ -32,10 +32,16 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
fun = FUNCTION_MAP[args.type]
|
fun = FUNCTION_MAP[args.type]
|
||||||
|
|
||||||
|
source: database.RulePath
|
||||||
|
if args.first_party:
|
||||||
|
source = database.RuleFirstPath()
|
||||||
|
else:
|
||||||
|
source = database.RuleMultiPath()
|
||||||
|
|
||||||
for rule in args.input:
|
for rule in args.input:
|
||||||
fun(DB,
|
fun(DB,
|
||||||
rule.strip(),
|
rule.strip(),
|
||||||
# is_first_party=args.first_party,
|
source=source,
|
||||||
updated=int(time.time()),
|
updated=int(time.time()),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -6,11 +6,11 @@ function log() {
|
||||||
|
|
||||||
log "Importing rules…"
|
log "Importing rules…"
|
||||||
BEFORE="$(date +%s)"
|
BEFORE="$(date +%s)"
|
||||||
# cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
|
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
|
||||||
# cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
|
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
|
||||||
# cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone
|
cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone
|
||||||
# cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
|
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
|
||||||
# cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn
|
cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn
|
||||||
|
|
||||||
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party
|
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party
|
||||||
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party
|
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party
|
||||||
|
|
Loading…
Reference in a new issue