Implement pruning

This commit is contained in:
Geoffrey Frogeye 2019-12-21 19:38:20 +01:00
parent 1a6e64da3d
commit 7d1c1a1d54
Signed by: geoffrey
GPG key ID: D8A7ECA00A8CD3DD
4 changed files with 68 additions and 8 deletions

View file

@ -47,7 +47,9 @@ Depending on the sources you'll be using to generate the list, you'll need to in
### Create a new database
The so-called database (in the form of `blocking.p`) is a file storing all the matching entities (ASN, IPs, hostnames, zones…) and every entity leading to it.
To start over from an empty database, recreate it with `./db.py --initialize`.
The database exists because the list cannot be generated in one pass: the links of a DNS redirection chain do not have to be input in order.
You can purge old data from the database using `./db.py --prune --prune-before TIMESTAMP`;
`TIMESTAMP` is a Unix timestamp (in seconds), which can be generated using `date +%s`.
### Gather external sources
@ -79,6 +81,7 @@ In each folder:
- `*.custom.ext` are for sources that you don't want committed
Then, run `./import_rules.sh`.
Note that removed rules and every record depending on them will be automatically pruned.
### Add subdomains

View file

@ -95,6 +95,9 @@ class Match():
return False
return True
def disable(self) -> None:
    """Mark this match as disabled by resetting ``updated`` to 0."""
    self.updated = 0
class AsnNode(Match):
def __init__(self) -> None:
@ -478,15 +481,61 @@ class Database(Profiler):
for _ in self.exec_each(increment_references_cb):
pass
def _clean_deps(self) -> None:
# Disable the matches that depends on the targeted
# matches until all disabled matches reference count = 0
did_something = True
def clean_deps_cb(path: Path,
match: Match
) -> None:
nonlocal did_something
if not match.source:
return
source = self.get_match(match.source)
if not source.active():
self._unset_match(match)
elif match.first_party > source.first_party:
match.first_party = source.first_party
else:
return
did_something = True
while did_something:
did_something = False
self.enter_step('pass_clean_deps')
for _ in self.exec_each(clean_deps_cb):
pass
def prune(self, before: int, base_only: bool = False) -> None:
    """Disable stale matches, then everything that depended on them.

    A match is considered stale when its ``updated`` value is not
    strictly greater than ``before``.

    Args:
        before: Cut-off timestamp; matches updated after it are kept.
        base_only: When true, matches whose ``level`` is above 1 are not
            examined here; they can still be disabled by the dependency
            clean-up that follows.
    """
    # Disable the matches targeted
    def prune_cb(path: Path,
                 match: Match
                 ) -> None:
        if base_only and match.level > 1:
            return
        if match.updated > before:
            return
        self._unset_match(match)
        self.log.debug("Print: disabled %s", path)

    self.enter_step('pass_prune')
    for _ in self.exec_each(prune_cb):
        pass

    # Cascade to the matches whose source has just been disabled.
    self._clean_deps()

    # Remove branches with no match
    # TODO
def explain(self, path: Path) -> str:
    """Return a one-line description of ``path`` and the chain behind it.

    Each hop is rendered as::

        <path>[ (<name>)] <level><F|M><D|_><references>

    where the name is only included for ASN nodes, ``F`` is used when
    ``first_party`` is truthy (``M`` otherwise) and ``D`` when
    ``dupplicate`` is truthy (``_`` otherwise). While a match has a
    source, the source's own description is appended after `` ← ``.
    """
    match = self.get_match(path)
    string = str(path)
    # The name is only read for ASN nodes.
    if isinstance(match, AsnNode):
        string += f' ({match.name})'
    party_char = 'F' if match.first_party else 'M'
    dup_char = 'D' if match.dupplicate else '_'
    string += f' {match.level}{party_char}{dup_char}{match.references}'
    if match.source:
        # Separate hops so the reference count does not run into the
        # next path.
        string += f' ← {self.explain(match.source)}'
    return string
@ -598,6 +647,14 @@ class Database(Profiler):
self.enter_step('get_ip4_yield')
yield ip4
def _unset_match(self,
match: Match,
) -> None:
match.disable()
if match.source:
source_match = self.get_match(match.source)
source_match.references -= 1
def _set_match(self,
match: Match,
updated: int,

View file

@ -18,5 +18,5 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as
./feed_asn.py
# Prune base rules not updated after $BEFORE (set outside this excerpt).
log "Pruning old rules…"
./db.py --prune --prune-before "$BEFORE" --prune-base

View file

@ -7,7 +7,7 @@ function log() {
log "Compiling nameservers…"
pv nameservers/*.list | ./validate_list.py --ip4 | sort -u > temp/all_nameservers_ip4.list
log "Compiling subdomains…"
# Sort by last character to utilize the DNS server caching mechanism
# (not as efficient with massdns but it's almost free so why not)
pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list