Compare commits

..

8 commits

Author SHA1 Message Date
Geoffrey Frogeye e882e09b37
Added outdated documentation warning in README 2019-12-17 14:27:43 +01:00
Geoffrey Frogeye d65107f849
Save dupplicates too
Maybe I won't publish them but this will help me for tracking trackers.
2019-12-17 14:10:41 +01:00
Geoffrey Frogeye ea0855bd00
Forgot to push this little guy
Good thing I cleaned up my working directory.
It only exists because pickles created from database.py itself
won't be openable from a file simply importing database.py.
So we create it when in 'imported state'.
2019-12-17 13:50:39 +01:00
Geoffrey Frogeye 7851b038f5
Reworked rule export 2019-12-17 13:30:24 +01:00
Geoffrey Frogeye 8f6e01c857
Added first_party tracking
Well, tracking if a rule is from a first or a multi rule...
Hope I did not do any mistake
2019-12-16 19:09:02 +01:00
Geoffrey Frogeye c3bf102289
Made references work 2019-12-16 14:18:03 +01:00
Geoffrey Frogeye 03a4042238
Added level
Also fixed IP logic because this was real messed up
2019-12-16 09:31:29 +01:00
Geoffrey Frogeye 3197fa1663
Remove list usage for IpTreeNode 2019-12-16 06:54:18 +01:00
10 changed files with 419 additions and 292 deletions

View file

@ -26,6 +26,8 @@ That's where this scripts comes in, to generate a list of such subdomains.
## How does this script work
> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.
It takes as input a list of websites with trackers included.
So far, this list is manually-generated from the list of clients of such first-party trackers
(later we should use a general list of websites to be more exhaustive).
@ -38,6 +40,8 @@ It finally outputs the matching ones.
## Requirements
> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.
Just to build the list, you can find an already-built list in the releases.
- Bash
@ -54,6 +58,8 @@ Just to build the list, you can find an already-built list in the releases.
## Usage
> **Notice:** This section is a tad outdated. I'm still experimenting to make the generation process better. I'll update this once I'm done with this.
This is only if you want to build the list yourself.
If you just want to use the list, the latest build is available here: <https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt>
It was built using additional sources not included in this repository for privacy reasons.

View file

@ -26,57 +26,76 @@ class Path():
class RulePath(Path):
pass
def __str__(self) -> str:
return '(rule)'
class RuleFirstPath(RulePath):
def __str__(self) -> str:
return '(first-party rule)'
class RuleMultiPath(RulePath):
def __str__(self) -> str:
return '(multi-party rule)'
class DomainPath(Path):
def __init__(self, path: typing.List[str]):
self.path = path
def __init__(self, parts: typing.List[str]):
self.parts = parts
def __str__(self) -> str:
return '?.' + Database.unpack_domain(self)
class HostnamePath(DomainPath):
pass
def __str__(self) -> str:
return Database.unpack_domain(self)
class ZonePath(DomainPath):
pass
def __str__(self) -> str:
return '*.' + Database.unpack_domain(self)
class AsnPath(Path):
def __init__(self, asn: Asn):
self.asn = asn
def __str__(self) -> str:
return Database.unpack_asn(self)
class Ip4Path(Path):
def __init__(self, value: int, prefixlen: int):
self.value = value
self.prefixlen = prefixlen
def __str__(self) -> str:
return Database.unpack_ip4network(self)
class Match():
def __init__(self) -> None:
self.source: typing.Optional[Path] = None
self.updated: int = 0
self.dupplicate: bool = False
# Cache
self.level: int = 0
self.source: Path = RulePath()
# FP dupplicate args
self.first_party: bool = False
self.references: int = 0
def set(self,
updated: int,
level: int,
source: Path,
) -> None:
if updated > self.updated or level > self.level:
self.updated = updated
self.level = level
self.source = source
# FP dupplicate function
def active(self) -> bool:
return self.updated > 0
def active(self, first_party: bool = None) -> bool:
if self.updated == 0 or (first_party and not self.first_party):
return False
return True
class AsnNode(Match):
pass
def __init__(self) -> None:
Match.__init__(self)
self.name = ''
class DomainTreeNode():
@ -86,17 +105,17 @@ class DomainTreeNode():
self.match_hostname = Match()
class IpTreeNode():
class IpTreeNode(Match):
def __init__(self) -> None:
self.children: typing.List[typing.Optional[IpTreeNode]] = [None, None]
self.match = Match()
Match.__init__(self)
self.zero: typing.Optional[IpTreeNode] = None
self.one: typing.Optional[IpTreeNode] = None
Node = typing.Union[DomainTreeNode, IpTreeNode, AsnNode]
NodeCallable = typing.Callable[[Path,
Node,
typing.Optional[typing.Any]],
typing.Any]
MatchCallable = typing.Callable[[Path,
Match],
typing.Any]
class Profiler():
@ -108,7 +127,6 @@ class Profiler():
self.step_dict: typing.Dict[str, int] = dict()
def enter_step(self, name: str) -> None:
return
now = time.perf_counter()
try:
self.time_dict[self.time_step] += now - self.time_last
@ -131,13 +149,21 @@ class Profiler():
class Database(Profiler):
VERSION = 10
VERSION = 18
PATH = "blocking.p"
def initialize(self) -> None:
self.log.warning(
"Creating database version: %d ",
Database.VERSION)
# Dummy match objects that everything refer to
self.rules: typing.List[Match] = list()
for first_party in (False, True):
m = Match()
m.updated = 1
m.level = 0
m.first_party = first_party
self.rules.append(m)
self.domtree = DomainTreeNode()
self.asns: typing.Dict[Asn, AsnNode] = dict()
self.ip4tree = IpTreeNode()
@ -148,7 +174,7 @@ class Database(Profiler):
with open(self.PATH, 'rb') as db_fdsec:
version, data = pickle.load(db_fdsec)
if version == Database.VERSION:
self.domtree, self.asns, self.ip4tree = data
self.rules, self.domtree, self.asns, self.ip4tree = data
return
self.log.warning(
"Outdated database version found: %d, "
@ -165,7 +191,7 @@ class Database(Profiler):
def save(self) -> None:
self.enter_step('save')
with open(self.PATH, 'wb') as db_fdsec:
data = self.domtree, self.asns, self.ip4tree
data = self.rules, self.domtree, self.asns, self.ip4tree
pickle.dump((self.VERSION, data), db_fdsec)
self.profile()
@ -180,7 +206,7 @@ class Database(Profiler):
@staticmethod
def unpack_domain(domain: DomainPath) -> str:
return '.'.join(domain.path[::-1])
return '.'.join(domain.parts[::-1])
@staticmethod
def pack_asn(asn: str) -> AsnPath:
@ -229,94 +255,227 @@ class Database(Profiler):
addr >>= 8
return '.'.join(map(str, octets)) + '/' + str(network.prefixlen)
def get_match(self, path: Path) -> Match:
if isinstance(path, RuleMultiPath):
return self.rules[0]
elif isinstance(path, RuleFirstPath):
return self.rules[1]
elif isinstance(path, AsnPath):
return self.asns[path.asn]
elif isinstance(path, DomainPath):
dicd = self.domtree
for part in path.parts:
dicd = dicd.children[part]
if isinstance(path, HostnamePath):
return dicd.match_hostname
elif isinstance(path, ZonePath):
return dicd.match_zone
else:
raise ValueError
elif isinstance(path, Ip4Path):
dici = self.ip4tree
for i in range(31, 31-path.prefixlen, -1):
bit = (path.value >> i) & 0b1
dici_next = dici.one if bit else dici.zero
if not dici_next:
raise IndexError
dici = dici_next
return dici
else:
raise ValueError
def exec_each_asn(self,
callback: MatchCallable,
) -> typing.Any:
for asn in self.asns:
match = self.asns[asn]
if match.active():
c = callback(
AsnPath(asn),
match,
)
try:
yield from c
except TypeError: # not iterable
pass
def exec_each_domain(self,
callback: NodeCallable,
arg: typing.Any = None,
callback: MatchCallable,
_dic: DomainTreeNode = None,
_par: DomainPath = None,
) -> typing.Any:
_dic = _dic or self.domtree
_par = _par or DomainPath([])
yield from callback(_par, _dic, arg)
if _dic.match_hostname.active():
c = callback(
HostnamePath(_par.parts),
_dic.match_hostname,
)
try:
yield from c
except TypeError: # not iterable
pass
if _dic.match_zone.active():
c = callback(
ZonePath(_par.parts),
_dic.match_zone,
)
try:
yield from c
except TypeError: # not iterable
pass
for part in _dic.children:
dic = _dic.children[part]
yield from self.exec_each_domain(
callback,
arg,
_dic=dic,
_par=DomainPath(_par.path + [part])
_par=DomainPath(_par.parts + [part])
)
def exec_each_ip4(self,
callback: NodeCallable,
arg: typing.Any = None,
callback: MatchCallable,
_dic: IpTreeNode = None,
_par: Ip4Path = None,
) -> typing.Any:
_dic = _dic or self.ip4tree
_par = _par or Ip4Path(0, 0)
callback(_par, _dic, arg)
if _dic.active():
c = callback(
_par,
_dic,
)
try:
yield from c
except TypeError: # not iterable
pass
# 0
dic = _dic.children[0]
pref = _par.prefixlen + 1
dic = _dic.zero
if dic:
addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-_par.prefixlen)))
addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-pref)))
assert addr0 == _par.value
yield from self.exec_each_ip4(
callback,
arg,
_dic=dic,
_par=Ip4Path(addr0, _par.prefixlen+1)
_par=Ip4Path(addr0, pref)
)
# 1
dic = _dic.children[1]
dic = _dic.one
if dic:
addr1 = _par.value | (1 << (32-_par.prefixlen))
addr1 = _par.value | (1 << (32-pref))
yield from self.exec_each_ip4(
callback,
arg,
_dic=dic,
_par=Ip4Path(addr1, _par.prefixlen+1)
_par=Ip4Path(addr1, pref)
)
def exec_each(self,
callback: NodeCallable,
arg: typing.Any = None,
callback: MatchCallable,
) -> typing.Any:
yield from self.exec_each_domain(callback)
yield from self.exec_each_ip4(callback)
yield from self.exec_each_asn(callback)
def update_references(self) -> None:
raise NotImplementedError
# Should be correctly calculated normally,
# keeping this just in case
def reset_references_cb(path: Path,
match: Match
) -> None:
match.references = 0
for _ in self.exec_each(reset_references_cb):
pass
def increment_references_cb(path: Path,
match: Match
) -> None:
if match.source:
source = self.get_match(match.source)
source.references += 1
for _ in self.exec_each(increment_references_cb):
pass
def prune(self, before: int, base_only: bool = False) -> None:
raise NotImplementedError
def explain(self, entry: int) -> str:
raise NotImplementedError
def explain(self, path: Path) -> str:
match = self.get_match(path)
if isinstance(match, AsnNode):
string = f'{path} ({match.name}) #{match.references}'
else:
string = f'{path} #{match.references}'
if match.source:
string += f'{self.explain(match.source)}'
return string
def export(self,
first_party_only: bool = False,
end_chain_only: bool = False,
no_dupplicates: bool = False,
explain: bool = False,
) -> typing.Iterable[str]:
if first_party_only or end_chain_only or explain:
raise NotImplementedError
def export_cb(path: Path, node: Node, _: typing.Any
def export_cb(path: Path, match: Match
) -> typing.Iterable[str]:
assert isinstance(path, DomainPath)
assert isinstance(node, DomainTreeNode)
if node.match_hostname:
a = self.unpack_domain(path)
yield a
if not isinstance(path, HostnamePath):
return
if first_party_only and not match.first_party:
return
if end_chain_only and match.references > 0:
return
if no_dupplicates and match.dupplicate:
return
if explain:
yield self.explain(path)
else:
yield self.unpack_domain(path)
yield from self.exec_each_domain(export_cb, None)
yield from self.exec_each_domain(export_cb)
def count_rules(self,
first_party_only: bool = False,
) -> str:
raise NotImplementedError
def list_rules(self,
first_party_only: bool = False,
) -> typing.Iterable[str]:
def list_rules_cb(path: Path, match: Match
) -> typing.Iterable[str]:
if first_party_only and not match.first_party:
return
if isinstance(path, ZonePath) \
or (isinstance(path, Ip4Path) and path.prefixlen < 32):
# if match.level == 1:
# It should be the latter condition but it is more
# useful when using the former
yield self.explain(path)
yield from self.exec_each(list_rules_cb)
def count_records(self,
first_party_only: bool = False,
rules_only: bool = False,
no_dupplicates: bool = False,
) -> str:
memo: typing.Dict[str, int] = dict()
def count_records_cb(path: Path, match: Match) -> None:
if first_party_only and not match.first_party:
return
if rules_only and match.level > 1:
return
if no_dupplicates and match.dupplicate:
return
try:
memo[path.__class__.__name__] += 1
except KeyError:
memo[path.__class__.__name__] = 1
for _ in self.exec_each(count_records_cb):
pass
split: typing.List[str] = list()
for key, value in sorted(memo.items(), key=lambda s: s[0]):
split.append(f'{key[:-4]}: {value}')
return ', '.join(split)
def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:
self.enter_step('get_domain_pack')
@ -324,10 +483,10 @@ class Database(Profiler):
self.enter_step('get_domain_brws')
dic = self.domtree
depth = 0
for part in domain.path:
for part in domain.parts:
if dic.match_zone.active():
self.enter_step('get_domain_yield')
yield ZonePath(domain.path[:depth])
yield ZonePath(domain.parts[:depth])
self.enter_step('get_domain_brws')
if part not in dic.children:
return
@ -335,61 +494,85 @@ class Database(Profiler):
depth += 1
if dic.match_zone.active():
self.enter_step('get_domain_yield')
yield ZonePath(domain.path)
yield ZonePath(domain.parts)
if dic.match_hostname.active():
self.enter_step('get_domain_yield')
yield HostnamePath(domain.path)
yield HostnamePath(domain.parts)
def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]:
self.enter_step('get_ip4_pack')
ip4 = self.pack_ip4address(ip4_str)
self.enter_step('get_ip4_brws')
dic = self.ip4tree
for i in reversed(range(ip4.prefixlen)):
part = (ip4.value >> i) & 0b1
if dic.match.active():
for i in range(31, 31-ip4.prefixlen, -1):
bit = (ip4.value >> i) & 0b1
if dic.active():
self.enter_step('get_ip4_yield')
yield Ip4Path(ip4.value, 32-i)
self.enter_step('get_ip4_brws')
next_dic = dic.children[part]
yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i)
self.enter_step('get_ip4_brws')
next_dic = dic.one if bit else dic.zero
if next_dic is None:
return
dic = next_dic
if dic.match.active():
if dic.active():
self.enter_step('get_ip4_yield')
yield ip4
def list_asn(self) -> typing.Iterable[AsnPath]:
for asn in self.asns:
yield AsnPath(asn)
def _set_match(self,
match: Match,
updated: int,
source: Path,
source_match: Match = None,
dupplicate: bool = False,
) -> None:
# source_match is in parameters because most of the time
# its parent function needs it too,
# so it can pass it to save a traversal
source_match = source_match or self.get_match(source)
new_level = source_match.level + 1
if updated > match.updated or new_level < match.level \
or source_match.first_party > match.first_party:
# NOTE FP and level of matches referencing this one
# won't be updated until run or prune
if match.source:
old_source = self.get_match(match.source)
old_source.references -= 1
match.updated = updated
match.level = new_level
match.first_party = source_match.first_party
match.source = source
source_match.references += 1
match.dupplicate = dupplicate
def _set_domain(self,
hostname: bool,
domain_str: str,
updated: int,
is_first_party: bool = None,
source: Path = None) -> None:
source: Path) -> None:
self.enter_step('set_domain_pack')
if is_first_party:
raise NotImplementedError
domain = self.pack_domain(domain_str)
self.enter_step('set_domain_fp')
source_match = self.get_match(source)
is_first_party = source_match.first_party
self.enter_step('set_domain_brws')
dic = self.domtree
for part in domain.path:
if dic.match_zone.active():
# Refuse to add domain whose zone is already matching
return
dupplicate = False
for part in domain.parts:
if part not in dic.children:
dic.children[part] = DomainTreeNode()
dic = dic.children[part]
if dic.match_zone.active(is_first_party):
dupplicate = True
if hostname:
match = dic.match_hostname
else:
match = dic.match_zone
match.set(
self._set_match(
match,
updated,
0, # TODO Level
source or RulePath(),
source,
source_match=source_match,
dupplicate=dupplicate,
)
def set_hostname(self,
@ -405,42 +588,48 @@ class Database(Profiler):
def set_asn(self,
asn_str: str,
updated: int,
is_first_party: bool = None,
source: Path = None) -> None:
source: Path) -> None:
self.enter_step('set_asn')
if is_first_party:
raise NotImplementedError
path = self.pack_asn(asn_str)
match = AsnNode()
match.set(
updated,
0,
source or RulePath()
if path.asn in self.asns:
match = self.asns[path.asn]
else:
match = AsnNode()
self.asns[path.asn] = match
self._set_match(
match,
updated,
source,
)
self.asns[path.asn] = match
def _set_ip4(self,
ip4: Ip4Path,
updated: int,
is_first_party: bool = None,
source: Path = None) -> None:
if is_first_party:
raise NotImplementedError
source: Path) -> None:
self.enter_step('set_ip4_fp')
source_match = self.get_match(source)
is_first_party = source_match.first_party
self.enter_step('set_ip4_brws')
dic = self.ip4tree
for i in reversed(range(ip4.prefixlen)):
part = (ip4.value >> i) & 0b1
if dic.match.active():
# Refuse to add ip4* whose network is already matching
return
next_dic = dic.children[part]
dupplicate = False
for i in range(31, 31-ip4.prefixlen, -1):
bit = (ip4.value >> i) & 0b1
next_dic = dic.one if bit else dic.zero
if next_dic is None:
next_dic = IpTreeNode()
dic.children[part] = next_dic
if bit:
dic.one = next_dic
else:
dic.zero = next_dic
dic = next_dic
dic.match.set(
if dic.active(is_first_party):
dupplicate = True
self._set_match(
dic,
updated,
0, # TODO Level
source or RulePath(),
source,
source_match=source_match,
dupplicate=dupplicate,
)
def set_ip4address(self,
@ -449,7 +638,6 @@ class Database(Profiler):
) -> None:
self.enter_step('set_ip4add_pack')
ip4 = self.pack_ip4address(ip4address_str)
self.enter_step('set_ip4add_brws')
self._set_ip4(ip4, *args, **kwargs)
def set_ip4network(self,
@ -458,5 +646,4 @@ class Database(Profiler):
) -> None:
self.enter_step('set_ip4net_pack')
ip4 = self.pack_ip4network(ip4network_str)
self.enter_step('set_ip4net_brws')
self._set_ip4(ip4, *args, **kwargs)

44
db.py Executable file
View file

@ -0,0 +1,44 @@
#!/usr/bin/env python3

"""Command-line maintenance operations for the blocking database."""

import argparse
import os
import time

import database

if __name__ == '__main__':
    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Database operations")
    parser.add_argument(
        '-i', '--initialize', action='store_true',
        help="Reconstruct the whole database")
    parser.add_argument(
        '-p', '--prune', action='store_true',
        help="Remove old entries from database")
    parser.add_argument(
        '-b', '--prune-base', action='store_true',
        help="With --prune, only prune the base entries")
    parser.add_argument(
        '-s', '--prune-before', type=int,
        default=(int(time.time()) - 60*60*24*31*6),
        help="With --prune, remove entries updated before this "
             "UNIX timestamp (default: roughly six months ago)")
    parser.add_argument(
        '-r', '--references', action='store_true',
        help="Update the reference count")
    args = parser.parse_args()

    # --initialize: remove any existing database file so that the
    # Database constructor starts from a fresh, empty state
    if not args.initialize:
        DB = database.Database()
    else:
        if os.path.isfile(database.Database.PATH):
            os.unlink(database.Database.PATH)
        DB = database.Database()

    DB.enter_step('main')
    if args.prune:
        DB.prune(before=args.prune_before, base_only=args.prune_base)
    if args.references:
        DB.update_references()

    DB.save()

View file

@ -25,6 +25,9 @@ if __name__ == '__main__':
parser.add_argument(
'-r', '--rules', action='store_true',
help="TODO")
parser.add_argument(
'-d', '--no-dupplicates', action='store_true',
help="TODO")
parser.add_argument(
'-c', '--count', action='store_true',
help="TODO")
@ -32,16 +35,20 @@ if __name__ == '__main__':
DB = database.Database()
if args.rules:
if not args.count:
raise NotImplementedError
print(DB.count_rules(first_party_only=args.first_party))
if args.count:
print(DB.count_records(
first_party_only=args.first_party,
rules_only=args.rules,
no_dupplicates=args.no_dupplicates,
))
else:
if args.count:
raise NotImplementedError
if args.rules:
for line in DB.list_rules():
print(line)
for domain in DB.export(
first_party_only=args.first_party,
end_chain_only=args.end_chain,
no_dupplicates=args.no_dupplicates,
explain=args.explain,
):
print(domain, file=args.output)

View file

@ -4,21 +4,25 @@ function log() {
echo -e "\033[33m$@\033[0m"
}
log "Pruning old data…"
./database.py --prune
log "Recounting references…"
./database.py --references
log "Exporting lists…"
./export.py --first-party --output dist/firstparty-trackers.txt
./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt
./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt
./export.py --output dist/multiparty-trackers.txt
./export.py --end-chain --output dist/multiparty-only-trackers.txt
./export.py --end-chain --output --no-dupplicates dist/multiparty-only-trackers.txt
log "Generating hosts lists…"
log "Generating statistics…"
./export.py --count --first-party > temp/count_recs_firstparty.txt
./export.py --count > temp/count_recs_multiparty.txt
./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
./export.py --rules --count > temp/count_rules_multiparty.txt
log "Sorting lists…"
sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt
sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt
sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt
sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt
log "Generating hosts lists…"
function generate_hosts {
basename="$1"
description="$2"
@ -46,13 +50,15 @@ function generate_hosts {
echo "# Generation software: eulaurarien $(git describe --tags)"
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of source DNS records: ~2M + $(wc -l temp/all_resolved.json | cut -d' ' -f1)"
echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO
echo "#"
echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)"
echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
echo "#"
echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)"
echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
echo

View file

@ -21,6 +21,15 @@ def get_ranges(asn: str) -> typing.Iterable[str]:
yield pref['prefix']
def get_name(asn: str) -> str:
req = requests.get(
'https://stat.ripe.net/data/as-overview/data.json',
params={'resource': asn}
)
data = req.json()
return data['data']['holder']
if __name__ == '__main__':
log = logging.getLogger('feed_asn')
@ -32,8 +41,15 @@ if __name__ == '__main__':
DB = database.Database()
for path in DB.list_asn():
def add_ranges(path: database.Path,
match: database.Match,
) -> None:
assert isinstance(path, database.AsnPath)
assert isinstance(match, database.AsnNode)
asn_str = database.Database.unpack_asn(path)
DB.enter_step('asn_get_name')
name = get_name(asn_str)
match.name = name
DB.enter_step('asn_get_ranges')
for prefix in get_ranges(asn_str):
parsed_prefix: IPNetwork = ipaddress.ip_network(prefix)
@ -43,10 +59,13 @@ if __name__ == '__main__':
source=path,
updated=int(time.time())
)
log.info('Added %s from %s (%s)', prefix, asn_str, path)
log.info('Added %s from %s (%s)', prefix, path, name)
elif parsed_prefix.version == 6:
log.warning('Unimplemented prefix version: %s', prefix)
else:
log.error('Unknown prefix version: %s', prefix)
for _ in DB.exec_each_asn(add_ranges):
pass
DB.save()

View file

@ -1,147 +0,0 @@
#!/usr/bin/env python3
import argparse
import database
import logging
import sys
import typing
import enum
# DNS record kinds understood by the feeders. Note that AAAA is declared
# here but has no FUNCTION_MAP entry below, so registering an AAAA
# record raises KeyError (DnsMassParser catches it and skips the record).
RecordType = enum.Enum('RecordType', 'A AAAA CNAME PTR')
# (record type, updated timestamp, name, value)
Record = typing.Tuple[RecordType, int, str, str]

# select, write
# For each record type: `select` enumerates the database paths matching
# the record's value, `write` stores the record's name under each of
# those matches.
FUNCTION_MAP: typing.Any = {
    RecordType.A: (
        database.Database.get_ip4,
        database.Database.set_hostname,
    ),
    RecordType.CNAME: (
        database.Database.get_domain,
        database.Database.set_hostname,
    ),
    RecordType.PTR: (
        database.Database.get_domain,
        database.Database.set_ip4address,
    ),
}
class Parser():
    """Base class for DNS record feeders.

    Subclasses implement `consume` to read records from `buf` in their
    own format and call `register` for each one.
    """

    def __init__(self, buf: typing.Any) -> None:
        self.buf = buf
        self.log = logging.getLogger('parser')
        self.db = database.Database()

    def end(self) -> None:
        """Persist the database once parsing is finished."""
        self.db.save()

    def register(self,
                 rtype: RecordType,
                 updated: int,
                 name: str,
                 value: str
                 ) -> None:
        """Store `name` under every database path matching `value`.

        Raises KeyError if `rtype` has no FUNCTION_MAP entry (AAAA).
        """
        self.db.enter_step('register')
        select, write = FUNCTION_MAP[rtype]
        for source in select(self.db, value):
            # Pass the matching path as the source so reference chains
            # can be tracked and explained later.
            write(self.db, name, updated, source=source)

    def consume(self) -> None:
        """Read and register every record from the input buffer."""
        raise NotImplementedError
class Rapid7Parser(Parser):
    """Parser for Rapid7 Open Data DNS JSON dumps (one object per line)."""

    # JSON "type" field value -> record type
    TYPES = {
        'a': RecordType.A,
        'aaaa': RecordType.AAAA,
        'cname': RecordType.CNAME,
        'ptr': RecordType.PTR,
    }

    def consume(self) -> None:
        # Each line is a flat JSON object whose keys sit at quote-split
        # offsets 1, 5, 9 and 13, with the corresponding value two
        # tokens later — splitting on '"' avoids a full JSON parse.
        fields = dict()
        for line in self.buf:
            self.db.enter_step('parse_rapid7')
            tokens = line.split('"')
            for index in (1, 5, 9, 13):
                fields[tokens[index]] = tokens[index + 2]
            self.register(
                Rapid7Parser.TYPES[fields['type']],
                int(fields['timestamp']),
                fields['name'],
                fields['value'],
            )
class DnsMassParser(Parser):
    """Parser for massdns output.

    Expected invocation:
    dnsmass --output Snrql --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
    """

    # record type -> (RecordType, name slice end, value slice end);
    # the slices strip the trailing dot appended to domain names
    TYPES = {
        'A': (RecordType.A, -1, None),
        'AAAA': (RecordType.AAAA, -1, None),
        'CNAME': (RecordType.CNAME, -1, -1),
    }

    def consume(self) -> None:
        self.db.enter_step('parse_dnsmass')
        timestamp = 0
        header = True
        for raw in self.buf:
            raw = raw[:-1]
            if not raw:
                # A blank line separates batches; the next non-empty
                # line is a header carrying the batch timestamp.
                header = True
                continue
            fields = raw.split(' ')
            try:
                if header:
                    timestamp = int(fields[1])
                    header = False
                else:
                    dtype, name_end, value_end = \
                        DnsMassParser.TYPES[fields[1]]
                    self.register(
                        dtype,
                        timestamp,
                        fields[0][:name_end],
                        fields[2][:value_end],
                    )
                    self.db.enter_step('parse_dnsmass')
            except KeyError:
                # Unknown record type: skip the line
                continue
# Input-format name (CLI choice) -> parser implementation
PARSERS = {
    'rapid7': Rapid7Parser,
    'dnsmass': DnsMassParser,
}

if __name__ == '__main__':
    # Parsing arguments
    log = logging.getLogger('feed_dns')
    args_parser = argparse.ArgumentParser(
        description="Feed DNS records into the blocking database")
    args_parser.add_argument(
        'parser',
        choices=PARSERS.keys(),
        help="Format of the input data")
    args_parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="File to read records from (default: standard input)")
    args = args_parser.parse_args()

    parser = PARSERS[args.parser](args.input)
    try:
        # Ctrl-C stops consumption but still saves what was parsed
        parser.consume()
    except KeyboardInterrupt:
        pass
    parser.end()

View file

@ -51,8 +51,7 @@ class Writer(multiprocessing.Process):
try:
for source in select(self.db, value):
# write(self.db, name, updated, source=source)
write(self.db, name, updated)
write(self.db, name, updated, source=source)
except ValueError:
self.log.exception("Cannot execute: %s", record)
@ -182,10 +181,10 @@ if __name__ == '__main__':
'-j', '--workers', type=int, default=4,
help="TODO")
args_parser.add_argument(
'-b', '--block-size', type=int, default=100,
'-b', '--block-size', type=int, default=1024,
help="TODO")
args_parser.add_argument(
'-q', '--queue-size', type=int, default=10,
'-q', '--queue-size', type=int, default=128,
help="TODO")
args = args_parser.parse_args()

View file

@ -32,10 +32,16 @@ if __name__ == '__main__':
fun = FUNCTION_MAP[args.type]
source: database.RulePath
if args.first_party:
source = database.RuleFirstPath()
else:
source = database.RuleMultiPath()
for rule in args.input:
fun(DB,
rule.strip(),
# is_first_party=args.first_party,
source=source,
updated=int(time.time()),
)

View file

@ -6,11 +6,11 @@ function log() {
log "Importing rules…"
BEFORE="$(date +%s)"
# cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
# cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
# cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone
# cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
# cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn
cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone
cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party