Implement pruning
This commit is contained in:
parent
1a6e64da3d
commit
7d1c1a1d54
|
@ -47,7 +47,9 @@ Depending on the sources you'll be using to generate the list, you'll need to in
|
||||||
### Create a new database
|
### Create a new database
|
||||||
|
|
||||||
The so-called database (in the form of `blocking.p`) is a file storing all the matching entities (ASN, IPs, hostnames, zones…) and every entity leading to it.
|
The so-called database (in the form of `blocking.p`) is a file storing all the matching entities (ASN, IPs, hostnames, zones…) and every entity leading to it.
|
||||||
For now there's no way to remove data from it, so here's the command to recreate it: `./db.py --initialize`.
|
It exists because the list cannot be generated in one pass, as the links of a DNS redirection chain do not have to be input in order.
|
||||||
|
You can purge old data from the database using `./db.py --prune --prune-before TIMESTAMP`;
|
||||||
|
`TIMESTAMP` can be generated using `date +%s`.
|
||||||
|
|
||||||
### Gather external sources
|
### Gather external sources
|
||||||
|
|
||||||
|
@ -79,6 +81,7 @@ In each folder:
|
||||||
- `*.custom.ext` are for sources that you don't want committed
|
- `*.custom.ext` are for sources that you don't want committed
|
||||||
|
|
||||||
Then, run `./import_rules.sh`.
|
Then, run `./import_rules.sh`.
|
||||||
|
Note that removed rules and every record depending on them will be automatically pruned.
|
||||||
|
|
||||||
### Add subdomains
|
### Add subdomains
|
||||||
|
|
||||||
|
|
65
database.py
65
database.py
|
@ -95,6 +95,9 @@ class Match():
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def disable(self) -> None:
|
||||||
|
self.updated = 0
|
||||||
|
|
||||||
|
|
||||||
class AsnNode(Match):
|
class AsnNode(Match):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
|
@ -478,15 +481,61 @@ class Database(Profiler):
|
||||||
for _ in self.exec_each(increment_references_cb):
|
for _ in self.exec_each(increment_references_cb):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def _clean_deps(self) -> None:
|
||||||
|
# Disable the matches that depend on the targeted
|
||||||
|
# matches until every disabled match has a reference count of 0
|
||||||
|
did_something = True
|
||||||
|
|
||||||
|
def clean_deps_cb(path: Path,
|
||||||
|
match: Match
|
||||||
|
) -> None:
|
||||||
|
nonlocal did_something
|
||||||
|
if not match.source:
|
||||||
|
return
|
||||||
|
source = self.get_match(match.source)
|
||||||
|
if not source.active():
|
||||||
|
self._unset_match(match)
|
||||||
|
elif match.first_party > source.first_party:
|
||||||
|
match.first_party = source.first_party
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
did_something = True
|
||||||
|
|
||||||
|
while did_something:
|
||||||
|
did_something = False
|
||||||
|
self.enter_step('pass_clean_deps')
|
||||||
|
for _ in self.exec_each(clean_deps_cb):
|
||||||
|
pass
|
||||||
|
|
||||||
def prune(self, before: int, base_only: bool = False) -> None:
|
def prune(self, before: int, base_only: bool = False) -> None:
|
||||||
raise NotImplementedError
|
# Disable the matches targeted
|
||||||
|
def prune_cb(path: Path,
|
||||||
|
match: Match
|
||||||
|
) -> None:
|
||||||
|
if base_only and match.level > 1:
|
||||||
|
return
|
||||||
|
if match.updated > before:
|
||||||
|
return
|
||||||
|
self._unset_match(match)
|
||||||
|
self.log.debug("Print: disabled %s", path)
|
||||||
|
|
||||||
|
self.enter_step('pass_prune')
|
||||||
|
for _ in self.exec_each(prune_cb):
|
||||||
|
pass
|
||||||
|
|
||||||
|
self._clean_deps()
|
||||||
|
|
||||||
|
# Remove branches with no match
|
||||||
|
# TODO
|
||||||
|
|
||||||
def explain(self, path: Path) -> str:
|
def explain(self, path: Path) -> str:
|
||||||
match = self.get_match(path)
|
match = self.get_match(path)
|
||||||
|
string = str(path)
|
||||||
if isinstance(match, AsnNode):
|
if isinstance(match, AsnNode):
|
||||||
string = f'{path} ({match.name}) #{match.references}'
|
string += f' ({match.name})'
|
||||||
else:
|
party_char = 'F' if match.first_party else 'M'
|
||||||
string = f'{path} #{match.references}'
|
dup_char = 'D' if match.dupplicate else '_'
|
||||||
|
string += f' {match.level}{party_char}{dup_char}{match.references}'
|
||||||
if match.source:
|
if match.source:
|
||||||
string += f' ← {self.explain(match.source)}'
|
string += f' ← {self.explain(match.source)}'
|
||||||
return string
|
return string
|
||||||
|
@ -598,6 +647,14 @@ class Database(Profiler):
|
||||||
self.enter_step('get_ip4_yield')
|
self.enter_step('get_ip4_yield')
|
||||||
yield ip4
|
yield ip4
|
||||||
|
|
||||||
|
def _unset_match(self,
|
||||||
|
match: Match,
|
||||||
|
) -> None:
|
||||||
|
match.disable()
|
||||||
|
if match.source:
|
||||||
|
source_match = self.get_match(match.source)
|
||||||
|
source_match.references -= 1
|
||||||
|
|
||||||
def _set_match(self,
|
def _set_match(self,
|
||||||
match: Match,
|
match: Match,
|
||||||
updated: int,
|
updated: int,
|
||||||
|
|
|
@ -18,5 +18,5 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as
|
||||||
|
|
||||||
./feed_asn.py
|
./feed_asn.py
|
||||||
|
|
||||||
# log "Pruning old rules…"
|
log "Pruning old rules…"
|
||||||
# ./db.py --prune --prune-before "$BEFORE" --prune-base
|
./db.py --prune --prune-before "$BEFORE" --prune-base
|
||||||
|
|
|
@ -7,7 +7,7 @@ function log() {
|
||||||
log "Compiling nameservers…"
|
log "Compiling nameservers…"
|
||||||
pv nameservers/*.list | ./validate_list.py --ip4 | sort -u > temp/all_nameservers_ip4.list
|
pv nameservers/*.list | ./validate_list.py --ip4 | sort -u > temp/all_nameservers_ip4.list
|
||||||
|
|
||||||
log "Compiling subdomain…"
|
log "Compiling subdomains…"
|
||||||
# Sort by last character to utilize the DNS server caching mechanism
|
# Sort by last character to utilize the DNS server caching mechanism
|
||||||
# (not as efficient with massdns but it's almost free so why not)
|
# (not as efficient with massdns but it's almost free so why not)
|
||||||
pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list
|
pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list
|
||||||
|
|
Loading…
Reference in a new issue