Workflow: Some modifications

2019-12-14 16:04:19 +01:00 · 2019-12-14 16:04:19 +01:00 · d7c239a6f6
parent 5023b85d7c
commit d7c239a6f6
6 changed files with 27 additions and 13 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,5 +3,3 @@
 *.db-journal
 nameservers
 nameservers.head
-*.o
-*.so
--- a/database.py
+++ b/database.py
@ -149,6 +149,8 @@ class Database():
        total = 0
        for i, octet in enumerate(address.split('.')):
            total += int(octet) << (3-i)*8
+        if total > 0xFFFFFFFF:
+            raise ValueError
        return total
        # return '{:02x}{:02x}{:02x}{:02x}'.format(
        #     *[int(c) for c in address.split('.')])
@ -192,10 +194,13 @@ class Database():
                       '(SELECT count(*) FROM rules '
                       'WHERE source=r.id)')

-    def prune(self, before: int) -> None:
+    def prune(self, before: int, base_only: bool = False) -> None:
        self.enter_step('prune')
        cursor = self.conn.cursor()
-        cursor.execute('DELETE FROM rules WHERE updated<?', (before,))
+        cmd = 'DELETE FROM rules WHERE updated<?'
+        if base_only:
+            cmd += ' AND level=0'
+        cursor.execute(cmd, (before,))

    def explain(self, entry: int) -> str:
        # Format current
@ -541,7 +546,14 @@ if __name__ == '__main__':
        help="Reconstruct the whole database")
    parser.add_argument(
        '-p', '--prune', action='store_true',
-        help="Remove old (+6 months) entries from database")
+        help="Remove old entries from database")
+    parser.add_argument(
+        '-b', '--prune-base', action='store_true',
+        help="TODO")
+    parser.add_argument(
+        '-s', '--prune-before', type=int,
+        default=(int(time.time()) - 60*60*24*31*6),
+        help="TODO")
    parser.add_argument(
        '-r', '--references', action='store_true',
        help="Update the reference count")
@ -552,8 +564,8 @@ if __name__ == '__main__':
    if args.initialize:
        DB.initialize()
    if args.prune:
-        DB.prune(before=int(time.time()) - 60*60*24*31*6)
-    if args.references and not args.prune:
+        DB.prune(before=args.prune_before, base_only=args.prune_base)
+    if args.references:
        DB.update_references()

    DB.close()
--- a/feed_dns.py
+++ b/feed_dns.py
@ -37,20 +37,21 @@ if __name__ == '__main__':
            DB.enter_step('feed_switch')
            if dtype == 'a':
                for rule in DB.get_ip4(value):
-                    if not list(DB.get_domain_in_zone(name)):
+                    if not any(DB.get_domain_in_zone(name)):

                        DB.set_hostname(name, source=rule,
                                        updated=int(timestamp))
                                        # updated=int(data['timestamp']))
            elif dtype == 'c':
                for rule in DB.get_domain(value):
-                    if not list(DB.get_domain_in_zone(name)):
+                    if not any(DB.get_domain_in_zone(name)):
                        DB.set_hostname(name, source=rule,
                                        updated=int(timestamp))
                                        # updated=int(data['timestamp']))
            elif dtype == 'p':
                for rule in DB.get_domain(value):
-                    if not list(DB.get_ip4_in_network(name)):
+                    if not any(DB.get_ip4_in_network(name)):
+                        log.debug('%s matched by %d: add %s', value, rule, name)
                        DB.set_ip4address(name, source=rule,
                                          updated=int(timestamp))
                                          # updated=int(data['timestamp']))
--- a/fetch_resources.sh
+++ b/fetch_resources.sh
@ -18,7 +18,7 @@ log "Retrieving rules…"
 rm -f rules*/*.cache.*
 dl https://easylist.to/easylist/easyprivacy.txt rules_adblock/easyprivacy.cache.txt
 # From firebog.net Tracking & Telemetry Lists
-dl https://v.firebog.net/hosts/Prigent-Ads.txt rules/prigent-ads.cache.list
+# dl https://v.firebog.net/hosts/Prigent-Ads.txt rules/prigent-ads.cache.list
 # dl https://gitlab.com/quidsup/notrack-blocklists/raw/master/notrack-blocklist.txt rules/notrack-blocklist.cache.list
 # False positives: https://github.com/WaLLy3K/wally3k.github.io/issues/73 -> 69.media.tumblr.com chicdn.net
 dl https://raw.githubusercontent.com/StevenBlack/hosts/master/data/add.2o7Net/hosts rules_hosts/add2o7.cache.txt
--- a/import_rules.sh
+++ b/import_rules.sh
@ -5,6 +5,7 @@ function log() {
 }

 log "Importing rules…"
+BEFORE="$(date +%s)"
 cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_domain_list.py | ./feed_rules.py zone
 cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone
 cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone
@ -17,3 +18,5 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as

 ./feed_asn.py

+log "Pruning old rules…"
+./database.py --prune --prune-before "$BEFORE" --prune-base
--- a/json_to_csv.py
+++ b/json_to_csv.py
@ -27,10 +27,10 @@ if __name__ == '__main__':
        data = json.loads(line)
        try:
            writer.writerow([
-                data['type'][0],
+                data['type'][0],  # First letter of the record type; AAAA will need special handling since it shares its first letter with A
                data['timestamp'],
                data['name'],
                data['value']])
-        except IndexError:
+        except (KeyError, json.decoder.JSONDecodeError):
            log.error('Could not parse line: %s', line)
            pass