Added first_party tracking

Well, tracking whether a rule is a first-party or a multi-party rule...
Hope I did not make any mistakes.
This commit is contained in:
Geoffrey Frogeye 2019-12-16 19:07:35 +01:00
parent c3bf102289
commit 8f6e01c857
Signed by: geoffrey
GPG key ID: D8A7ECA00A8CD3DD
3 changed files with 93 additions and 60 deletions

View file

@ -27,7 +27,17 @@ class Path():
class RulePath(Path): class RulePath(Path):
def __str__(self) -> str: def __str__(self) -> str:
return '(rules)' return '(rule)'
class RuleFirstPath(RulePath):
def __str__(self) -> str:
return '(first-party rule)'
class RuleMultiPath(RulePath):
def __str__(self) -> str:
return '(multi-party rule)'
class DomainPath(Path): class DomainPath(Path):
@ -67,14 +77,18 @@ class Ip4Path(Path):
class Match(): class Match():
def __init__(self) -> None: def __init__(self) -> None:
self.updated: int = 0
self.level: int = 0
self.source: typing.Optional[Path] = None self.source: typing.Optional[Path] = None
self.references: int = 0 self.updated: int = 0
# FP duplicate args
def active(self) -> bool: # Cache
return self.updated > 0 self.level: int = 0
self.first_party: bool = False
self.references: int = 0
def active(self, first_party: bool = None) -> bool:
if self.updated == 0 or (first_party and not self.first_party):
return False
return True
class AsnNode(Match): class AsnNode(Match):
@ -133,13 +147,21 @@ class Profiler():
class Database(Profiler): class Database(Profiler):
VERSION = 14 VERSION = 17
PATH = "blocking.p" PATH = "blocking.p"
def initialize(self) -> None: def initialize(self) -> None:
self.log.warning( self.log.warning(
"Creating database version: %d ", "Creating database version: %d ",
Database.VERSION) Database.VERSION)
# Dummy match objects that everything refer to
self.rules: typing.List[Match] = list()
for first_party in (False, True):
m = Match()
m.updated = 1
m.level = 0
m.first_party = first_party
self.rules.append(m)
self.domtree = DomainTreeNode() self.domtree = DomainTreeNode()
self.asns: typing.Dict[Asn, AsnNode] = dict() self.asns: typing.Dict[Asn, AsnNode] = dict()
self.ip4tree = IpTreeNode() self.ip4tree = IpTreeNode()
@ -150,7 +172,7 @@ class Database(Profiler):
with open(self.PATH, 'rb') as db_fdsec: with open(self.PATH, 'rb') as db_fdsec:
version, data = pickle.load(db_fdsec) version, data = pickle.load(db_fdsec)
if version == Database.VERSION: if version == Database.VERSION:
self.domtree, self.asns, self.ip4tree = data self.rules, self.domtree, self.asns, self.ip4tree = data
return return
self.log.warning( self.log.warning(
"Outdated database version found: %d, " "Outdated database version found: %d, "
@ -167,7 +189,7 @@ class Database(Profiler):
def save(self) -> None: def save(self) -> None:
self.enter_step('save') self.enter_step('save')
with open(self.PATH, 'wb') as db_fdsec: with open(self.PATH, 'wb') as db_fdsec:
data = self.domtree, self.asns, self.ip4tree data = self.rules, self.domtree, self.asns, self.ip4tree
pickle.dump((self.VERSION, data), db_fdsec) pickle.dump((self.VERSION, data), db_fdsec)
self.profile() self.profile()
@ -232,8 +254,10 @@ class Database(Profiler):
return '.'.join(map(str, octets)) + '/' + str(network.prefixlen) return '.'.join(map(str, octets)) + '/' + str(network.prefixlen)
def get_match(self, path: Path) -> Match: def get_match(self, path: Path) -> Match:
if isinstance(path, RulePath): if isinstance(path, RuleMultiPath):
return Match() return self.rules[0]
elif isinstance(path, RuleFirstPath):
return self.rules[1]
elif isinstance(path, AsnPath): elif isinstance(path, AsnPath):
return self.asns[path.asn] return self.asns[path.asn]
elif isinstance(path, DomainPath): elif isinstance(path, DomainPath):
@ -275,7 +299,6 @@ class Database(Profiler):
except TypeError: # not iterable except TypeError: # not iterable
pass pass
def exec_each_domain(self, def exec_each_domain(self,
callback: MatchCallable, callback: MatchCallable,
arg: typing.Any = None, arg: typing.Any = None,
@ -387,9 +410,7 @@ class Database(Profiler):
def explain(self, path: Path) -> str: def explain(self, path: Path) -> str:
match = self.get_match(path) match = self.get_match(path)
string = f'{path}' string = f'{path} #{match.references}'
if not isinstance(path, RulePath):
string += f' #{match.references}'
if match.source: if match.source:
string += f'{self.explain(match.source)}' string += f'{self.explain(match.source)}'
return string return string
@ -399,14 +420,14 @@ class Database(Profiler):
end_chain_only: bool = False, end_chain_only: bool = False,
explain: bool = False, explain: bool = False,
) -> typing.Iterable[str]: ) -> typing.Iterable[str]:
if first_party_only:
raise NotImplementedError
def export_cb(path: Path, match: Match, _: typing.Any def export_cb(path: Path, match: Match, _: typing.Any
) -> typing.Iterable[str]: ) -> typing.Iterable[str]:
assert isinstance(path, DomainPath) assert isinstance(path, DomainPath)
if not isinstance(path, HostnamePath): if not isinstance(path, HostnamePath):
return return
if first_party_only and not match.first_party:
return
if end_chain_only and match.references > 0: if end_chain_only and match.references > 0:
return return
if explain: if explain:
@ -419,11 +440,11 @@ class Database(Profiler):
def list_rules(self, def list_rules(self,
first_party_only: bool = False, first_party_only: bool = False,
) -> typing.Iterable[str]: ) -> typing.Iterable[str]:
if first_party_only:
raise NotImplementedError
def list_rules_cb(path: Path, match: Match, _: typing.Any def list_rules_cb(path: Path, match: Match, _: typing.Any
) -> typing.Iterable[str]: ) -> typing.Iterable[str]:
if first_party_only and not match.first_party:
return
if isinstance(path, ZonePath) \ if isinstance(path, ZonePath) \
or (isinstance(path, Ip4Path) and path.prefixlen < 32): or (isinstance(path, Ip4Path) and path.prefixlen < 32):
# if match.level == 0: # if match.level == 0:
@ -465,10 +486,10 @@ class Database(Profiler):
dic = self.ip4tree dic = self.ip4tree
for i in range(31, 31-ip4.prefixlen, -1): for i in range(31, 31-ip4.prefixlen, -1):
bit = (ip4.value >> i) & 0b1 bit = (ip4.value >> i) & 0b1
# TODO PERF copy value and slide once every loop
if dic.active(): if dic.active():
self.enter_step('get_ip4_yield') self.enter_step('get_ip4_yield')
a = Ip4Path(ip4.value >> (i+1) << (i+1), 31-i) yield Ip4Path(ip4.value >> (i+1) << (i+1), 31-i)
yield a
self.enter_step('get_ip4_brws') self.enter_step('get_ip4_brws')
next_dic = dic.one if bit else dic.zero next_dic = dic.one if bit else dic.zero
if next_dic is None: if next_dic is None:
@ -478,50 +499,58 @@ class Database(Profiler):
self.enter_step('get_ip4_yield') self.enter_step('get_ip4_yield')
yield ip4 yield ip4
def set_match(self, def _set_match(self,
match: Match, match: Match,
updated: int, updated: int,
source: Path, source: Path,
source_match: Match = None,
) -> None: ) -> None:
new_source = self.get_match(source) # source_match is in parameters because most of the time
new_level = new_source.level + 1 # its parent function needs it too,
if updated > match.updated or new_level > match.level: # so it can pass it to save a traversal
source_match = source_match or self.get_match(source)
new_level = source_match.level + 1
if updated > match.updated or new_level < match.level \
or source_match.first_party > match.first_party:
# NOTE FP and level of matches referencing this one
# won't be updated until run or prune
if match.source: if match.source:
old_source = self.get_match(match.source) old_source = self.get_match(match.source)
old_source.references -= 1 old_source.references -= 1
match.updated = updated match.updated = updated
match.level = new_level match.level = new_level
match.first_party = source_match.first_party
match.source = source match.source = source
new_source.references += 1 source_match.references += 1
# FP duplicate function
def _set_domain(self, def _set_domain(self,
hostname: bool, hostname: bool,
domain_str: str, domain_str: str,
updated: int, updated: int,
is_first_party: bool = None, source: Path) -> None:
source: Path = None) -> None:
self.enter_step('set_domain_pack') self.enter_step('set_domain_pack')
if is_first_party:
raise NotImplementedError
domain = self.pack_domain(domain_str) domain = self.pack_domain(domain_str)
self.enter_step('set_domain_fp')
source_match = self.get_match(source)
is_first_party = source_match.first_party
self.enter_step('set_domain_brws') self.enter_step('set_domain_brws')
dic = self.domtree dic = self.domtree
for part in domain.parts: for part in domain.parts:
if part not in dic.children: if part not in dic.children:
dic.children[part] = DomainTreeNode() dic.children[part] = DomainTreeNode()
dic = dic.children[part] dic = dic.children[part]
if dic.match_zone.active(): if dic.match_zone.active(is_first_party):
# Refuse to add domain whose zone is already matching # Refuse to add domain whose zone is already matching
return return
if hostname: if hostname:
match = dic.match_hostname match = dic.match_hostname
else: else:
match = dic.match_zone match = dic.match_zone
self.set_match( self._set_match(
match, match,
updated, updated,
source or RulePath(), source,
source_match=source_match,
) )
def set_hostname(self, def set_hostname(self,
@ -537,30 +566,27 @@ class Database(Profiler):
def set_asn(self, def set_asn(self,
asn_str: str, asn_str: str,
updated: int, updated: int,
is_first_party: bool = None, source: Path) -> None:
source: Path = None) -> None:
self.enter_step('set_asn') self.enter_step('set_asn')
if is_first_party:
raise NotImplementedError
path = self.pack_asn(asn_str) path = self.pack_asn(asn_str)
if path.asn in self.asns: if path.asn in self.asns:
match = self.asns[path.asn] match = self.asns[path.asn]
else: else:
match = AsnNode() match = AsnNode()
self.asns[path.asn] = match self.asns[path.asn] = match
self.set_match( self._set_match(
match, match,
updated, updated,
source or RulePath(), source,
) )
def _set_ip4(self, def _set_ip4(self,
ip4: Ip4Path, ip4: Ip4Path,
updated: int, updated: int,
is_first_party: bool = None, source: Path) -> None:
source: Path = None) -> None: self.enter_step('set_ip4_fp')
if is_first_party: source_match = self.get_match(source)
raise NotImplementedError is_first_party = source_match.first_party
self.enter_step('set_ip4_brws') self.enter_step('set_ip4_brws')
dic = self.ip4tree dic = self.ip4tree
for i in range(31, 31-ip4.prefixlen, -1): for i in range(31, 31-ip4.prefixlen, -1):
@ -573,13 +599,14 @@ class Database(Profiler):
else: else:
dic.zero = next_dic dic.zero = next_dic
dic = next_dic dic = next_dic
if dic.active(): if dic.active(is_first_party):
# Refuse to add ip4* whose network is already matching # Refuse to add ip4* whose network is already matching
return return
self.set_match( self._set_match(
dic, dic,
updated, updated,
source or RulePath(), source,
source_match=source_match,
) )
def set_ip4address(self, def set_ip4address(self,

View file

@ -32,10 +32,16 @@ if __name__ == '__main__':
fun = FUNCTION_MAP[args.type] fun = FUNCTION_MAP[args.type]
source: database.RulePath
if args.first_party:
source = database.RuleFirstPath()
else:
source = database.RuleMultiPath()
for rule in args.input: for rule in args.input:
fun(DB, fun(DB,
rule.strip(), rule.strip(),
# is_first_party=args.first_party, source=source,
updated=int(time.time()), updated=int(time.time()),
) )

View file

@ -6,11 +6,11 @@ function log() {
log "Importing rules…" log "Importing rules…"
BEFORE="$(date +%s)" BEFORE="$(date +%s)"
# cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
# cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
# cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone
# cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network
# cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn
cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party
cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party