diff --git a/.gitignore b/.gitignore index aa3f3eb..188051c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,3 @@ *.db-journal nameservers nameservers.head -*.o -*.so diff --git a/database.py b/database.py index 1e8c4da..ee51829 100755 --- a/database.py +++ b/database.py @@ -149,6 +149,8 @@ class Database(): total = 0 for i, octet in enumerate(address.split('.')): total += int(octet) << (3-i)*8 + if total > 0xFFFFFFFF: + raise ValueError return total # return '{:02x}{:02x}{:02x}{:02x}'.format( # *[int(c) for c in address.split('.')]) @@ -192,10 +194,13 @@ class Database(): '(SELECT count(*) FROM rules ' 'WHERE source=r.id)') - def prune(self, before: int) -> None: + def prune(self, before: int, base_only: bool = False) -> None: self.enter_step('prune') cursor = self.conn.cursor() - cursor.execute('DELETE FROM rules WHERE updated str: # Format current @@ -541,7 +546,14 @@ if __name__ == '__main__': help="Reconstruct the whole database") parser.add_argument( '-p', '--prune', action='store_true', - help="Remove old (+6 months) entries from database") + help="Remove old entries from database") + parser.add_argument( + '-b', '--prune-base', action='store_true', + help="TODO") + parser.add_argument( + '-s', '--prune-before', type=int, + default=(int(time.time()) - 60*60*24*31*6), + help="TODO") parser.add_argument( '-r', '--references', action='store_true', help="Update the reference count") @@ -552,8 +564,8 @@ if __name__ == '__main__': if args.initialize: DB.initialize() if args.prune: - DB.prune(before=int(time.time()) - 60*60*24*31*6) - if args.references and not args.prune: + DB.prune(before=args.prune_before, base_only=args.prune_base) + if args.references: DB.update_references() DB.close() diff --git a/feed_dns.py b/feed_dns.py index cb996e9..87a4fb6 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -37,20 +37,21 @@ if __name__ == '__main__': DB.enter_step('feed_switch') if dtype == 'a': for rule in DB.get_ip4(value): - if not list(DB.get_domain_in_zone(name)): + if not any(DB.get_domain_in_zone(name)): DB.set_hostname(name, source=rule, updated=int(timestamp)) # updated=int(data['timestamp'])) elif dtype == 'c': for rule in DB.get_domain(value): - if not list(DB.get_domain_in_zone(name)): + if not any(DB.get_domain_in_zone(name)): DB.set_hostname(name, source=rule, updated=int(timestamp)) # updated=int(data['timestamp'])) elif dtype == 'p': for rule in DB.get_domain(value): - if not list(DB.get_ip4_in_network(name)): + if not any(DB.get_ip4_in_network(name)): + log.debug('%s matched by %d: add %s', value, rule, name) DB.set_ip4address(name, source=rule, updated=int(timestamp)) # updated=int(data['timestamp'])) diff --git a/fetch_resources.sh b/fetch_resources.sh index 01121d8..e799729 100755 --- a/fetch_resources.sh +++ b/fetch_resources.sh @@ -18,7 +18,7 @@ log "Retrieving rules…" rm -f rules*/*.cache.* dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt # From firebog.net Tracking & Telemetry Lists -dl https://v.firebog.net/hosts/Prigent-Ads.txt rules/prigent-ads.cache.list +# dl https://v.firebog.net/hosts/Prigent-Ads.txt rules/prigent-ads.cache.list # dl https://gitlab.com/quidsup/notrack-blocklists/raw/master/notrack-blocklist.txt rules/notrack-blocklist.cache.list # False positives: https://github.com/WaLLy3K/wally3k.github.io/issues/73 -> 69.media.tumblr.com chicdn.net dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt diff --git a/import_rules.sh b/import_rules.sh index 358155c..33c4fbd 100755 --- a/import_rules.sh +++ b/import_rules.sh @@ -5,6 +5,7 @@ function log() { } log "Importing rules…" +BEFORE="$(date +%s)" cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone @@ -17,3 +18,5 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as ./feed_asn.py +log "Pruning old rules…" +./database.py --prune --prune-before "$BEFORE" --prune-base diff --git a/json_to_csv.py b/json_to_csv.py index 11a3600..39ca1b7 100755 --- a/json_to_csv.py +++ b/json_to_csv.py @@ -27,10 +27,10 @@ if __name__ == '__main__': data = json.loads(line) try: writer.writerow([ - data['type'][0], + data['type'][0], # First letter, will need to do something special for AAAA data['timestamp'], data['name'], data['value']]) - except IndexError: + except (KeyError, json.decoder.JSONDecodeError): log.error('Could not parse line: %s', line) pass