diff --git a/README.md b/README.md index 70c4c5a..5776bb4 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,9 @@ Depending on the sources you'll be using to generate the list, you'll need to in ### Create a new database The so-called database (in the form of `blocking.p`) is a file storing all the matching entities (ASN, IPs, hostnames, zones…) and every entity leading to it. -For now there's no way to remove data from it, so here's the command to recreate it: `./db.py --initialize`. +It exists because the list cannot be generated in one pass, as the links of a DNS redirection chain do not have to be input in order. +You can purge the database by removing old data using `./db.py --prune --prune-before TIMESTAMP`; +`TIMESTAMP` (a Unix timestamp) can be generated using `date +%s`. ### Gather external sources @@ -79,6 +81,7 @@ In each folder: - `*.custom.ext` are for sources that you don't want commited Then, run `./import_rules.sh`. +Note that removed rules and every record depending on them will be automatically pruned. 
### Add subdomains diff --git a/database.py b/database.py index c37369f..866eb66 100644 --- a/database.py +++ b/database.py @@ -95,6 +95,9 @@ class Match(): return False return True + def disable(self) -> None: + self.updated = 0 + class AsnNode(Match): def __init__(self) -> None: @@ -478,15 +481,61 @@ class Database(Profiler): for _ in self.exec_each(increment_references_cb): pass + def _clean_deps(self) -> None: + # Disable the matches that depend on the targeted + # matches until every disabled match has a reference count of 0 + did_something = True + + def clean_deps_cb(path: Path, + match: Match + ) -> None: + nonlocal did_something + if not match.source: + return + source = self.get_match(match.source) + if not source.active(): + self._unset_match(match) + elif match.first_party > source.first_party: + match.first_party = source.first_party + else: + return + did_something = True + + while did_something: + did_something = False + self.enter_step('pass_clean_deps') + for _ in self.exec_each(clean_deps_cb): + pass + def prune(self, before: int, base_only: bool = False) -> None: - raise NotImplementedError + # Disable the targeted matches (last updated at or before `before`) + def prune_cb(path: Path, + match: Match + ) -> None: + if base_only and match.level > 1: + return + if match.updated > before: + return + self._unset_match(match) + self.log.debug("Prune: disabled %s", path) + + self.enter_step('pass_prune') + for _ in self.exec_each(prune_cb): + pass + + self._clean_deps() + + # Remove branches with no match + # TODO def explain(self, path: Path) -> str: match = self.get_match(path) + string = str(path) if isinstance(match, AsnNode): - string = f'{path} ({match.name}) #{match.references}' - else: - string = f'{path} #{match.references}' + string += f' ({match.name})' + party_char = 'F' if match.first_party else 'M' + dup_char = 'D' if match.dupplicate else '_' + string += f' {match.level}{party_char}{dup_char}{match.references}' if match.source: string += f' ← {self.explain(match.source)}' return 
string @@ -598,6 +647,14 @@ class Database(Profiler): self.enter_step('get_ip4_yield') yield ip4 + def _unset_match(self, + match: Match, + ) -> None: + match.disable() + if match.source: + source_match = self.get_match(match.source) + source_match.references -= 1 + def _set_match(self, match: Match, updated: int, diff --git a/import_rules.sh b/import_rules.sh index cbcfbd8..14c8c78 100755 --- a/import_rules.sh +++ b/import_rules.sh @@ -18,5 +18,5 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as ./feed_asn.py -# log "Pruning old rules…" -# ./db.py --prune --prune-before "$BEFORE" --prune-base +log "Pruning old rules…" +./db.py --prune --prune-before "$BEFORE" --prune-base diff --git a/resolve_subdomains.sh b/resolve_subdomains.sh index 7a91337..b5b2079 100755 --- a/resolve_subdomains.sh +++ b/resolve_subdomains.sh @@ -7,7 +7,7 @@ function log() { log "Compiling nameservers…" pv nameservers/*.list | ./validate_list.py --ip4 | sort -u > temp/all_nameservers_ip4.list -log "Compiling subdomain…" +log "Compiling subdomains…" # Sort by last character to utilize the DNS server caching mechanism # (not as efficient with massdns but it's almost free so why not) pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list