21 changed files with 341 additions and 451 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,4 @@
 *.log
 *.p
+nameservers
+nameservers.head
--- a/database.py
+++ b/database.py
@ -9,10 +9,6 @@ import time
 import logging
 import coloredlogs
 import pickle
-import numpy
-import math
-
-TLD_LIST: typing.Set[str] = set()

 coloredlogs.install(
    level='DEBUG',
@ -203,54 +199,6 @@ class Database(Profiler):
        Profiler.__init__(self)
        self.log = logging.getLogger('db')
        self.load()
-        self.ip4cache_shift: int = 32
-        self.ip4cache = numpy.ones(1)
-
-    def _set_ip4cache(self, path: Path, _: Match) -> None:
-        assert isinstance(path, Ip4Path)
-        self.enter_step('set_ip4cache')
-        mini = path.value >> self.ip4cache_shift
-        maxi = (path.value + 2**(32-path.prefixlen)) >> self.ip4cache_shift
-        if mini == maxi:
-            self.ip4cache[mini] = True
-        else:
-            self.ip4cache[mini:maxi] = True
-
-    def fill_ip4cache(self, max_size: int = 512*1024**2) -> None:
-        """
-        Size in bytes
-        """
-        if max_size > 2**32/8:
-            self.log.warning("Allocating more than 512 MiB of RAM for "
-                             "the Ip4 cache is not necessary.")
-        max_cache_width = int(math.log2(max(1, max_size*8)))
-        cache_width = min(2**32, max_cache_width)
-        self.ip4cache_shift = 32-cache_width
-        cache_size = 2**cache_width
-        self.ip4cache = numpy.zeros(cache_size, dtype=numpy.bool)
-        for _ in self.exec_each_ip4(self._set_ip4cache):
-            pass
-
-    @staticmethod
-    def populate_tld_list() -> None:
-        with open('temp/all_tld.list', 'r') as tld_fdesc:
-            for tld in tld_fdesc:
-                tld = tld.strip()
-                TLD_LIST.add(tld)
-
-    @staticmethod
-    def validate_domain(path: str) -> bool:
-        if len(path) > 255:
-            return False
-        splits = path.split('.')
-        if not TLD_LIST:
-            Database.populate_tld_list()
-        if splits[-1] not in TLD_LIST:
-            return False
-        for split in splits:
-            if not 1 <= len(split) <= 63:
-                return False
-        return True

    @staticmethod
    def pack_domain(domain: str) -> DomainPath:
@ -271,19 +219,6 @@ class Database(Profiler):
    def unpack_asn(asn: AsnPath) -> str:
        return f'AS{asn.asn}'

-    @staticmethod
-    def validate_ip4address(path: str) -> bool:
-        splits = path.split('.')
-        if len(splits) != 4:
-            return False
-        for split in splits:
-            try:
-                if not 0 <= int(split) <= 255:
-                    return False
-            except ValueError:
-                return False
-        return True
-
    @staticmethod
    def pack_ip4address(address: str) -> Ip4Path:
        addr = 0
@ -302,21 +237,6 @@ class Database(Profiler):
            addr >>= 8
        return '.'.join(map(str, octets))

-    @staticmethod
-    def validate_ip4network(path: str) -> bool:
-        # A bit generous but ok for our usage
-        splits = path.split('/')
-        if len(splits) != 2:
-            return False
-        if not Database.validate_ip4address(splits[0]):
-            return False
-        try:
-            if not 0 <= int(splits[1]) <= 32:
-                return False
-        except ValueError:
-            return False
-        return True
-
    @staticmethod
    def pack_ip4network(network: str) -> Ip4Path:
        address, prefixlen_str = network.split('/')
@ -433,9 +353,8 @@ class Database(Profiler):
        pref = _par.prefixlen + 1
        dic = _dic.zero
        if dic:
-            # addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-pref)))
-            # assert addr0 == _par.value
-            addr0 = _par.value
+            addr0 = _par.value & (0xFFFFFFFF ^ (1 << (32-pref)))
+            assert addr0 == _par.value
            yield from self.exec_each_ip4(
                callback,
                _dic=dic,
@ -445,7 +364,6 @@ class Database(Profiler):
        dic = _dic.one
        if dic:
            addr1 = _par.value | (1 << (32-pref))
-            # assert addr1 != _par.value
            yield from self.exec_each_ip4(
                callback,
                _dic=dic,
@ -491,56 +409,62 @@ class Database(Profiler):
            string += f' ← {self.explain(match.source)}'
        return string

-    def list_records(self,
+    def export(self,
               first_party_only: bool = False,
               end_chain_only: bool = False,
               no_dupplicates: bool = False,
-                     rules_only: bool = False,
-                     hostnames_only: bool = False,
               explain: bool = False,
               ) -> typing.Iterable[str]:

        def export_cb(path: Path, match: Match
                      ) -> typing.Iterable[str]:
+            assert isinstance(path, DomainPath)
+            if not isinstance(path, HostnamePath):
+                return
            if first_party_only and not match.first_party:
                return
            if end_chain_only and match.references > 0:
                return
            if no_dupplicates and match.dupplicate:
                return
-            if rules_only and match.level > 1:
-                return
-            if hostnames_only and not isinstance(path, HostnamePath):
-                return
-
            if explain:
                yield self.explain(path)
            else:
-                yield str(path)
+                yield self.unpack_domain(path)

-        yield from self.exec_each(export_cb)
+        yield from self.exec_each_domain(export_cb)
+
+    def list_rules(self,
+                   first_party_only: bool = False,
+                   ) -> typing.Iterable[str]:
+
+        def list_rules_cb(path: Path, match: Match
+                          ) -> typing.Iterable[str]:
+            if first_party_only and not match.first_party:
+                return
+            if isinstance(path, ZonePath) \
+                    or (isinstance(path, Ip4Path) and path.prefixlen < 32):
+                # if match.level == 1:
+                # It should be the latter condition but it is more
+                # useful when using the former
+                yield self.explain(path)
+
+        yield from self.exec_each(list_rules_cb)

    def count_records(self,
                      first_party_only: bool = False,
-                      end_chain_only: bool = False,
-                      no_dupplicates: bool = False,
                      rules_only: bool = False,
-                      hostnames_only: bool = False,
+                      no_dupplicates: bool = False,
                      ) -> str:
        memo: typing.Dict[str, int] = dict()

        def count_records_cb(path: Path, match: Match) -> None:
            if first_party_only and not match.first_party:
                return
-            if end_chain_only and match.references > 0:
+            if rules_only and match.level > 1:
                return
            if no_dupplicates and match.dupplicate:
                return
-            if rules_only and match.level > 1:
-                return
-            if hostnames_only and not isinstance(path, HostnamePath):
-                return
-
            try:
                memo[path.__class__.__name__] += 1
            except KeyError:
@ -548,10 +472,9 @@ class Database(Profiler):

        for _ in self.exec_each(count_records_cb):
            pass
-
        split: typing.List[str] = list()
        for key, value in sorted(memo.items(), key=lambda s: s[0]):
-            split.append(f'{key[:-4].lower()}s: {value}')
+            split.append(f'{key[:-4]}: {value}')
        return ', '.join(split)

    def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:
@ -579,9 +502,6 @@ class Database(Profiler):
    def get_ip4(self, ip4_str: str) -> typing.Iterable[Path]:
        self.enter_step('get_ip4_pack')
        ip4 = self.pack_ip4address(ip4_str)
-        self.enter_step('get_ip4_cache')
-        if not self.ip4cache[ip4.value >> self.ip4cache_shift]:
-            return
        self.enter_step('get_ip4_brws')
        dic = self.ip4tree
        for i in range(31, 31-ip4.prefixlen, -1):
@ -629,9 +549,6 @@ class Database(Profiler):
                    domain_str: str,
                    updated: int,
                    source: Path) -> None:
-        self.enter_step('set_domain_val')
-        if not Database.validate_domain(domain_str):
-            raise ValueError(f"Invalid domain: {domain_str}")
        self.enter_step('set_domain_pack')
        domain = self.pack_domain(domain_str)
        self.enter_step('set_domain_fp')
@ -714,15 +631,11 @@ class Database(Profiler):
            source_match=source_match,
            dupplicate=dupplicate,
        )
-        self._set_ip4cache(ip4, dic)

    def set_ip4address(self,
                       ip4address_str: str,
                       *args: typing.Any, **kwargs: typing.Any
                       ) -> None:
-        self.enter_step('set_ip4add_val')
-        if not Database.validate_ip4address(ip4address_str):
-            raise ValueError(f"Invalid ip4address: {ip4address_str}")
        self.enter_step('set_ip4add_pack')
        ip4 = self.pack_ip4address(ip4address_str)
        self._set_ip4(ip4, *args, **kwargs)
@ -731,9 +644,6 @@ class Database(Profiler):
                       ip4network_str: str,
                       *args: typing.Any, **kwargs: typing.Any
                       ) -> None:
-        self.enter_step('set_ip4net_val')
-        if not Database.validate_ip4network(ip4network_str):
-            raise ValueError(f"Invalid ip4network: {ip4network_str}")
        self.enter_step('set_ip4net_pack')
        ip4 = self.pack_ip4network(ip4network_str)
        self._set_ip4(ip4, *args, **kwargs)
--- a/db.py
+++ b/db.py
@ -18,16 +18,14 @@ if __name__ == '__main__':
        help="Remove old entries from database")
    parser.add_argument(
        '-b', '--prune-base', action='store_true',
-        help="With --prune, only prune base rules "
-        "(the ones added by ./feed_rules.py)")
+        help="TODO")
    parser.add_argument(
        '-s', '--prune-before', type=int,
        default=(int(time.time()) - 60*60*24*31*6),
-        help="With --prune, only rules updated before "
-        "this UNIX timestamp will be deleted")
+        help="TODO")
    parser.add_argument(
        '-r', '--references', action='store_true',
-        help="DEBUG: Update the reference count")
+        help="Update the reference count")
    args = parser.parse_args()

    if not args.initialize:
--- a/export.py
+++ b/export.py
@ -9,56 +9,46 @@ if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
-        description="Export the hostnames rules stored "
-        "in the Database as plain text")
+        description="TODO")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
-        help="Output file, one rule per line")
+        help="TODO")
    parser.add_argument(
        '-f', '--first-party', action='store_true',
-        help="Only output rules issued from first-party sources")
+        help="TODO")
    parser.add_argument(
        '-e', '--end-chain', action='store_true',
-        help="Only output rules that are not referenced by any other")
-    parser.add_argument(
-        '-r', '--rules', action='store_true',
-        help="Output all kinds of rules, not just hostnames")
-    parser.add_argument(
-        '-b', '--base-rules', action='store_true',
-        help="Output base rules "
-        "(the ones added by ./feed_rules.py) "
-        "(implies --rules)")
-    parser.add_argument(
-        '-d', '--no-dupplicates', action='store_true',
-        help="Do not output rules that already match a zone/network rule "
-        "(e.g. dummy.example.com when there's a zone example.com rule)")
+        help="TODO")
    parser.add_argument(
        '-x', '--explain', action='store_true',
-        help="Show the chain of rules leading to one "
-        "(and the number of references they have)")
+        help="TODO")
+    parser.add_argument(
+        '-r', '--rules', action='store_true',
+        help="TODO")
+    parser.add_argument(
+        '-d', '--no-dupplicates', action='store_true',
+        help="TODO")
    parser.add_argument(
        '-c', '--count', action='store_true',
-        help="Show the number of rules per type instead of listing them")
+        help="TODO")
    args = parser.parse_args()

    DB = database.Database()

    if args.count:
-        assert not args.explain
        print(DB.count_records(
            first_party_only=args.first_party,
-            end_chain_only=args.end_chain,
+            rules_only=args.rules,
            no_dupplicates=args.no_dupplicates,
-            rules_only=args.base_rules,
-            hostnames_only=not (args.rules or args.base_rules),
            ))
    else:
-        for domain in DB.list_records(
+        if args.rules:
+            for line in DB.list_rules():
+                print(line)
+        for domain in DB.export(
            first_party_only=args.first_party,
            end_chain_only=args.end_chain,
            no_dupplicates=args.no_dupplicates,
-            rules_only=args.base_rules,
-            hostnames_only=not (args.rules or args.base_rules),
            explain=args.explain,
        ):
            print(domain, file=args.output)
--- a/export_lists.sh
+++ b/export_lists.sh
@ -4,67 +4,34 @@ function log() {
    echo -e "\033[33m$@\033[0m"
 }

-log "Calculating statistics…"
-gen_date=$(date -Isec)
-gen_software=$(git describe --tags)
-number_websites=$(wc -l < temp/all_websites.list)
-number_subdomains=$(wc -l < temp/all_subdomains.list)
-number_dns=$(grep '^$' temp/all_resolved.txt | wc -l)
+log "Exporting lists…"
+./export.py --first-party --output dist/firstparty-trackers.txt
+./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt
+./export.py --output dist/multiparty-trackers.txt
+# --output takes the file name as its argument, so it must come right before it
+./export.py --end-chain --no-dupplicates --output dist/multiparty-only-trackers.txt

-for partyness in {first,multi}
-do
-    if [ $partyness = "first" ]
-    then
-        partyness_flags="--first-party"
-    else
-        partyness_flags=""
-    fi
+log "Generating statistics…"
+./export.py --count --first-party > temp/count_recs_firstparty.txt
+./export.py --count > temp/count_recs_multiparty.txt
+./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
+./export.py --rules --count > temp/count_rules_multiparty.txt

-    echo "Statistics for ${partyness}-party trackers"
-    echo "Input rules: $(./export.py --count --base-rules $partyness_flags)"
-    echo "Subsequent rules: $(./export.py --count --rules $partyness_flags)"
-    echo "Subsequent rules (no dupplicate): $(./export.py --count --rules --no-dupplicates $partyness_flags)"
-    echo "Output hostnames: $(./export.py --count $partyness_flags)"
-    echo "Output hostnames (no dupplicate): $(./export.py --count --no-dupplicates $partyness_flags)"
-    echo "Output hostnames (end-chain only): $(./export.py --count --end-chain $partyness_flags)"
-    echo "Output hostnames (no dupplicate, end-chain only): $(./export.py --count --no-dupplicates --end-chain $partyness_flags)"
-    echo
+log "Sorting lists…"
+sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt
+sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt
+sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt
+sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt

-    for trackerness in {trackers,only-trackers}
-    do
-        if [ $trackerness = "trackers" ]
-        then
-            trackerness_flags=""
-        else
-            trackerness_flags="--end-chain --no-dupplicates"
-        fi
-        file_list="dist/${partyness}party-${trackerness}.txt"
-        file_host="dist/${partyness}party-${trackerness}-hosts.txt"
-
-        log "Generating lists for variant ${partyness}-party ${trackerness}…"
-
-        # Real export heeere
-        ./export.py $partyness_flags $trackerness_flags > $file_list
-        # Sometimes a bit heavy to have the DB open and sort the output
-        # so this is done in two steps
-        sort -u $file_list -o $file_list
-
-        rules_input=$(./export.py --count --base-rules $partyness_flags)
-        rules_found=$(./export.py --count --rules $partyness_flags)
-        rules_output=$(./export.py --count $partyness_flags $trackerness_flags)
-
-        function link() { # link partyness, link trackerness
-            url="https://hostfiles.frogeye.fr/${partyness}party-${trackerness}-hosts.txt"
-            if [ "$1" = "$partyness" ] && [ "$2" = "$trackerness" ]
-            then
-                url="$url (this one)"
-            fi
-            echo $url
-        }
+log "Generating hosts lists…"
+function generate_hosts {
+    basename="$1"
+    description="$2"
+    description2="$3"

    (
        echo "# First-party trackers host list"
-            echo "# Variant: ${partyness}-party ${trackerness}"
+        echo "# $description"
+        echo "# $description2"
        echo "#"
        echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
        echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
@ -72,26 +39,34 @@ do
        echo "# In case of false positives/negatives, or any other question,"
        echo "# contact me the way you like: https://geoffrey.frogeye.fr"
        echo "#"
-            echo "# Latest versions:"
-            echo "# - First-party trackers  : $(link first trackers)"
-            echo "# - … excluding redirected: $(link first only-trackers)"
-            echo "# - First and third party : $(link multi trackers)"
-            echo "# - … excluding redirected: $(link multi only-trackers)"
+        echo "# Latest version:"
+        echo "# - First-party trackers  : https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
+        echo "# - … excluding redirected: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
+        echo "# - First and third party : https://hostfiles.frogeye.fr/multiparty-trackers-hosts.txt"
+        echo "# - … excluding redirected: https://hostfiles.frogeye.fr/multiparty-only-trackers-hosts.txt"
        echo '# (you can remove `-hosts` to get the raw list)'
        echo "#"
-            echo "# Generation date: $gen_date"
-            echo "# Generation software: eulaurarien $gen_software"
-            echo "# Number of source websites: $number_websites"
-            echo "# Number of source subdomains: $number_subdomains"
-            echo "# Number of source DNS records: ~2E9 + $number_dns"
+        echo "# Generation date: $(date -Isec)"
+        echo "# Generation software: eulaurarien $(git describe --tags)"
+        echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
+        echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
+        echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO
        echo "#"
-            echo "# Input rules: $rules_input"
-            echo "# Subsequent rules: $rules_found"
-            echo "# Output rules: $rules_output"
+        echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
+        echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)"
+        echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
+        echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
        echo "#"
+        echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
+        echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)"
+        echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
+        echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
        echo
-            sed 's|^|0.0.0.0 |' "$file_list"
-        ) > "$file_host"
+        sed 's|^|0.0.0.0 |' "dist/$basename.txt"
+    ) > "dist/$basename-hosts.txt"
+}

-    done
-done
+generate_hosts "firstparty-trackers" "Generated from a curated list of first-party trackers" ""
+generate_hosts "firstparty-only-trackers" "Generated from a curated list of first-party trackers" "Only contain the first chain of redirection."
+generate_hosts "multiparty-trackers" "Generated from known third-party trackers." "Also contains trackers used as third-party."
+generate_hosts "multiparty-only-trackers" "Generated from known third-party trackers." "Do not contain trackers used in third-party. Use in combination with third-party lists."
--- a/feed_asn.py
+++ b/feed_asn.py
@ -36,7 +36,7 @@ if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
-        description="Add the IP ranges associated to the AS in the database")
+        description="TODO")
    args = parser.parse_args()

    DB = database.Database()
--- a/feed_dns.py
+++ b/feed_dns.py
@ -6,7 +6,7 @@ import logging
 import sys
 import typing
 import multiprocessing
-import time
+import enum

 Record = typing.Tuple[typing.Callable, typing.Callable, int, str, str]

@ -30,23 +30,14 @@ FUNCTION_MAP: typing.Any = {
 class Writer(multiprocessing.Process):
    def __init__(self,
                 recs_queue: multiprocessing.Queue,
-                 autosave_interval: int = 0,
-                 ip4_cache: int = 0,
-                 ):
+                 index: int = 0):
        super(Writer, self).__init__()
        self.log = logging.getLogger(f'wr')
        self.recs_queue = recs_queue
-        self.autosave_interval = autosave_interval
-        self.ip4_cache = ip4_cache

    def run(self) -> None:
        self.db = database.Database()
        self.db.log = logging.getLogger(f'wr')
-        self.db.fill_ip4cache(max_size=self.ip4_cache)
-        if self.autosave_interval > 0:
-            next_save = time.time() + self.autosave_interval
-        else:
-            next_save = 0

        self.db.enter_step('block_wait')
        block: typing.List[Record]
@ -64,12 +55,6 @@ class Writer(multiprocessing.Process):
                except ValueError:
                    self.log.exception("Cannot execute: %s", record)

-            if next_save > 0 and time.time() > next_save:
-                self.log.info("Saving database...")
-                self.db.save()
-                self.log.info("Done!")
-                next_save = time.time() + self.autosave_interval
-
            self.db.enter_step('block_wait')

        self.db.enter_step('end')
@ -134,8 +119,8 @@ class Rapid7Parser(Parser):
            self.register(record)


-class MassDnsParser(Parser):
-    # massdns --output Snrql
+class DnsMassParser(Parser):
+    # dnsmass --output Snrql
    # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
    TYPES = {
        'A': (FUNCTION_MAP['a'][0], FUNCTION_MAP['a'][1], -1, None),
@ -144,7 +129,7 @@ class MassDnsParser(Parser):
    }

    def consume(self) -> None:
-        self.prof.enter_step('parse_massdns')
+        self.prof.enter_step('parse_dnsmass')
        timestamp = 0
        header = True
        for line in self.buf:
@ -160,7 +145,7 @@ class MassDnsParser(Parser):
                    header = False
                else:
                    select, write, name_offset, value_offset = \
-                        MassDnsParser.TYPES[split[1]]
+                        DnsMassParser.TYPES[split[1]]
                    record = (
                        select,
                        write,
@ -169,14 +154,14 @@ class MassDnsParser(Parser):
                        split[2][:value_offset],
                    )
                    self.register(record)
-                    self.prof.enter_step('parse_massdns')
+                    self.prof.enter_step('parse_dnsmass')
            except KeyError:
                continue


 PARSERS = {
    'rapid7': Rapid7Parser,
-    'massdns': MassDnsParser,
+    'dnsmass': DnsMassParser,
 }

 if __name__ == '__main__':
@ -184,40 +169,29 @@ if __name__ == '__main__':
    # Parsing arguments
    log = logging.getLogger('feed_dns')
    args_parser = argparse.ArgumentParser(
-        description="Read DNS records and import "
-        "tracking-relevant data into the database")
+        description="TODO")
    args_parser.add_argument(
        'parser',
        choices=PARSERS.keys(),
-        help="Input format")
+        help="TODO")
    args_parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
-        help="Input file")
+        help="TODO")
+    args_parser.add_argument(
+        '-j', '--workers', type=int, default=4,
+        help="TODO")
    args_parser.add_argument(
        '-b', '--block-size', type=int, default=1024,
-        help="Performance tuning value")
+        help="TODO")
    args_parser.add_argument(
        '-q', '--queue-size', type=int, default=128,
-        help="Performance tuning value")
-    args_parser.add_argument(
-        '-a', '--autosave-interval', type=int, default=900,
-        help="Interval to which the database will save in seconds. "
-        "0 to disable.")
-    args_parser.add_argument(
-        '-4', '--ip4-cache', type=int, default=0,
-        help="RAM cache for faster IPv4 lookup. "
-        "Maximum useful value: 512 MiB (536870912). "
-        "Warning: Depending on the rules, this might already "
-        "be a memory-heavy process, even without the cache.")
+        help="TODO")
    args = args_parser.parse_args()

    recs_queue: multiprocessing.Queue = multiprocessing.Queue(
            maxsize=args.queue_size)

-    writer = Writer(recs_queue,
-                    autosave_interval=args.autosave_interval,
-                    ip4_cache=args.ip4_cache
-                    )
+    writer = Writer(recs_queue)
    writer.start()

    parser = PARSERS[args.parser](args.input, recs_queue, args.block_size)
--- a/feed_rules.py
+++ b/feed_rules.py
@ -7,24 +7,22 @@ import time

 FUNCTION_MAP = {
    'zone': database.Database.set_zone,
-    'hostname': database.Database.set_hostname,
-    'asn': database.Database.set_asn,
    'ip4network': database.Database.set_ip4network,
-    'ip4address': database.Database.set_ip4address,
+    'asn': database.Database.set_asn,
 }

 if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
-        description="Import base rules to the database")
+        description="TODO")
    parser.add_argument(
        'type',
        choices=FUNCTION_MAP.keys(),
        help="Type of rule inputed")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
-        help="File with one rule per line")
+        help="List of domains domains to block (with their subdomains)")
    parser.add_argument(
        '-f', '--first-party', action='store_true',
        help="The input only comes from verified first-party sources")
@ -41,14 +39,10 @@ if __name__ == '__main__':
        source = database.RuleMultiPath()

    for rule in args.input:
-        rule = rule.strip()
-        try:
        fun(DB,
-                rule,
+            rule.strip(),
            source=source,
            updated=int(time.time()),
            )
-        except ValueError:
-            DB.log.error(f"Could not add rule: {rule}")

    DB.save()
--- a/fetch_resources.sh
+++ b/fetch_resources.sh
@ -30,12 +30,13 @@ dl https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hos
 # dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV.txt rules_hosts/smart-tv.cache.txt
 # dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/AmazonFireTV.txt rules_hosts/amazon-fire-tv.cache.txt

-log "Retrieving TLD list…"
-dl http://data.iana.org/TLD/tlds-alpha-by-domain.txt temp/all_tld.temp.list
-grep -v '^#' temp/all_tld.temp.list | awk '{print tolower($0)}' > temp/all_tld.list
-
 log "Retrieving nameservers…"
-dl https://public-dns.info/nameservers.txt nameservers/public-dns.cache.list
+rm -f nameservers
+touch nameservers
+[ -f nameservers.head ] && cat nameservers.head >> nameservers
+dl https://public-dns.info/nameservers.txt nameservers.temp
+sort -R nameservers.temp >> nameservers
+rm nameservers.temp

 log "Retrieving top subdomains…"
 dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip
@ -50,3 +51,4 @@ then
 else
    mv temp/cisco-umbrella_popularity.fresh.list subdomains/cisco-umbrella_popularity.cache.list
 fi
+dl https://www.orwell1984.today/cname/eulerian.net.txt subdomains/orwell-eulerian-cname-list.cache.list
--- a/filter_subdomains.py
+++ b/filter_subdomains.py
@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+# pylint: disable=C0103
+
+"""
+From a list of subdomains, output only
+the ones resolving to a first-party tracker.
+"""
+
+import argparse
+import sys
+import progressbar
+import csv
+import typing
+import ipaddress
+
+# DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']]
+DomainRule = typing.Union[bool, typing.Dict]
+# IpRule = typing.Union[bool, typing.Dict[int, 'DomainRule']]
+IpRule = typing.Union[bool, typing.Dict]
+
+RULES_DICT: DomainRule = dict()
+RULES_IP_DICT: IpRule = dict()
+
+
+def get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]:
+    for char in address.packed:
+        for i in range(7, -1, -1):
+            yield (char >> i) & 0b1
+
+
+def subdomain_matching(subdomain: str) -> bool:
+    parts = subdomain.split('.')
+    parts.reverse()
+    dic = RULES_DICT
+    for part in parts:
+        if isinstance(dic, bool) or part not in dic:
+            break
+        dic = dic[part]
+    if isinstance(dic, bool):
+        return dic
+    return False
+
+
+def ip_matching(ip_str: str) -> bool:
+    ip = ipaddress.ip_address(ip_str)
+    dic = RULES_IP_DICT
+    i = 0
+    for bit in get_bits(ip):
+        i += 1
+        if isinstance(dic, bool) or bit not in dic:
+            break
+        dic = dic[bit]
+    if isinstance(dic, bool):
+        return dic
+    return False
+
+
+def get_matching(chain: typing.List[str], no_explicit: bool = False
+                 ) -> typing.Iterable[str]:
+    if len(chain) <= 1:
+        return
+    initial = chain[0]
+    cname_destinations = chain[1:-1]
+    a_destination = chain[-1]
+    initial_matching = subdomain_matching(initial)
+    if no_explicit and initial_matching:
+        return
+    cname_matching = any(map(subdomain_matching, cname_destinations))
+    if cname_matching or initial_matching or ip_matching(a_destination):
+        yield initial
+
+
+def register_rule(subdomain: str) -> None:
+    # Make a tree with domain parts
+    parts = subdomain.split('.')
+    parts.reverse()
+    dic = RULES_DICT
+    last_part = len(parts) - 1
+    for p, part in enumerate(parts):
+        if isinstance(dic, bool):
+            return
+        if p == last_part:
+            dic[part] = True
+        else:
+            dic.setdefault(part, dict())
+            dic = dic[part]
+
+
+def register_rule_ip(network: str) -> None:
+    net = ipaddress.ip_network(network)
+    ip = net.network_address
+    dic = RULES_IP_DICT
+    last_bit = net.prefixlen - 1
+    for b, bit in enumerate(get_bits(ip)):
+        if isinstance(dic, bool):
+            return
+        if b == last_bit:
+            dic[bit] = True
+        else:
+            dic.setdefault(bit, dict())
+            dic = dic[bit]
+
+
+if __name__ == '__main__':
+
+    # Parsing arguments
+    parser = argparse.ArgumentParser(
+        description="Filter first-party trackers from a list of subdomains")
+    parser.add_argument(
+        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
+        help="Input file with DNS chains")
+    parser.add_argument(
+        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
+        help="Outptut file with one tracking subdomain per line")
+    parser.add_argument(
+        '-n', '--no-explicit', action='store_true',
+        help="Don't output domains already blocked with rules without CNAME")
+    parser.add_argument(
+        '-r', '--rules', type=argparse.FileType('r'),
+        help="List of domains domains to block (with their subdomains)")
+    parser.add_argument(
+        '-p', '--rules-ip', type=argparse.FileType('r'),
+        help="List of IPs ranges to block")
+    args = parser.parse_args()
+
+    # Progress bar
+    widgets = [
+        progressbar.Percentage(),
+        ' ', progressbar.SimpleProgress(),
+        ' ', progressbar.Bar(),
+        ' ', progressbar.Timer(),
+        ' ', progressbar.AdaptiveTransferSpeed(unit='req'),
+        ' ', progressbar.AdaptiveETA(),
+    ]
+    progress = progressbar.ProgressBar(widgets=widgets)
+
+    # Reading rules
+    if args.rules:
+        for rule in args.rules:
+            register_rule(rule.strip())
+    if args.rules_ip:
+        for rule in args.rules_ip:
+            register_rule_ip(rule.strip())
+
+    # Approximating line count
+    if args.input.seekable():
+        lines = 0
+        for line in args.input:
+            lines += 1
+        progress.max_value = lines
+        args.input.seek(0)
+
+    # Reading domains to filter
+    reader = csv.reader(args.input)
+    progress.start()
+    for chain in reader:
+        for match in get_matching(chain, no_explicit=args.no_explicit):
+            print(match, file=args.output)
+        progress.update(progress.value + 1)
+    progress.finish()
--- a/import_rapid7.sh
+++ b/import_rapid7.sh
@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-
-function log() {
-    echo -e "\033[33m$@\033[0m"
-}
-
-function feed_rapid7_fdns { # dataset
-    dataset=$1
-    line=$(curl -s https://opendata.rapid7.com/sonar.fdns_v2/ | grep "href=\".\+-fdns_$dataset.json.gz\"")
-    link="https://opendata.rapid7.com$(echo "$line" | cut -d'"' -f2)"
-    log "Reading $(echo "$dataset" | awk '{print toupper($0)}') records from $link"
-    curl -L "$link" | gunzip
-}
-
-function feed_rapid7_rdns {
-    dataset=$1
-    line=$(curl -s https://opendata.rapid7.com/sonar.rdns_v2/ | grep "href=\".\+-rdns.json.gz\"")
-    link="https://opendata.rapid7.com$(echo "$line" | cut -d'"' -f2)"
-    log "Reading PTR records from $link"
-    curl -L "$link" | gunzip
-}
-
-feed_rapid7_rdns | ./feed_dns.py rapid7
-feed_rapid7_fdns a | ./feed_dns.py rapid7 --ip4-cache 536870912
-# feed_rapid7_fdns aaaa | ./feed_dns.py rapid7 --ip6-cache 536870912
-feed_rapid7_fdns cname | ./feed_dns.py rapid7
--- a/import_rules.sh
+++ b/import_rules.sh
@ -18,5 +18,5 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as

 ./feed_asn.py

-# log "Pruning old rules…"
-# ./db.py --prune --prune-before "$BEFORE" --prune-base
+log "Pruning old rules…"
+./db.py --prune --prune-before "$BEFORE" --prune-base
--- a/nameservers/.gitignore
+++ b/nameservers/.gitignore
@ -1,2 +0,0 @@
-*.custom.list
-*.cache.list
--- a/nameservers/popular.list
+++ b/nameservers/popular.list
@ -1,24 +0,0 @@
-8.8.8.8
-8.8.4.4
-2001:4860:4860:0:0:0:0:8888
-2001:4860:4860:0:0:0:0:8844
-208.67.222.222
-208.67.220.220
-2620:119:35::35
-2620:119:53::53
-4.2.2.1
-4.2.2.2
-8.26.56.26
-8.20.247.20
-84.200.69.80
-84.200.70.40
-2001:1608:10:25:0:0:1c04:b12f
-2001:1608:10:25:0:0:9249:d69b
-9.9.9.10
-149.112.112.10
-2620:fe::10
-2620:fe::fe:10
-1.1.1.1
-1.0.0.1
-2606:4700:4700::1111
-2606:4700:4700::1001
--- a/new_workflow.sh
+++ b/new_workflow.sh
@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+function log() {
+    echo -e "\033[33m$@\033[0m"
+}
+
+./fetch_resources.sh
+./import_rules.sh
+
+# TODO: fetch ptr.json.gz, a.json.gz and cname.json.gz instead of expecting them to exist
+log "Reading PTR records…"
+pv ptr.json.gz | gunzip | ./feed_dns.py
+log "Reading A records…"
+pv a.json.gz | gunzip | ./feed_dns.py
+log "Reading CNAME records…"
+pv cname.json.gz | gunzip | ./feed_dns.py
+
+log "Pruning old data…"
+./database.py --prune
+
+./filter_subdomains.sh
+
--- a/resolve_subdomains.sh
+++ b/resolve_subdomains.sh
@ -4,16 +4,9 @@ function log() {
    echo -e "\033[33m$@\033[0m"
 }

-log "Compiling nameservers…"
-pv nameservers/*.list | ./validate_list.py --ip4 | sort -u > temp/all_nameservers_ip4.list
-
-log "Compiling subdomain…"
+log "Compiling locally known subdomain…"
 # Sort by last character to utilize the DNS server caching mechanism
-# (not as efficient with massdns but it's almost free so why not)
-pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list
+pv subdomains/*.list | sed 's/\r$//' | rev | sort -u | rev > temp/all_subdomains.list
+log "Resolving locally known subdomain…"
+pv temp/all_subdomains.list | ./resolve_subdomains.py --output temp/all_resolved.csv

-log "Resolving subdomain…"
-massdns --output Snrql --retry REFUSED,SERVFAIL --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list
-
-log "Importing into database…"
-pv temp/all_resolved.txt | ./feed_dns.py massdns
--- a/rules/first-party.list
+++ b/rules/first-party.list
@ -18,14 +18,7 @@ omtrdc.net
 online-metrix.net
 # Webtrekk
 wt-eu02.net
-webtrekk.net
 # Otto Group
 oghub.io
-# Intent.com
+# ???
 partner.intentmedia.net
-# Wizaly
-wizaly.com
-# Commanders Act
-tagcommander.com
-# Ingenious Technologies
-affex.org
--- a/run_tests.py
+++ b/run_tests.py
@ -1,34 +0,0 @@
-#!/usr/bin/env python3
-
-import database
-import os
-import logging
-import csv
-
-TESTS_DIR = 'tests'
-
-if __name__ == '__main__':
-
-    DB = database.Database()
-    log = logging.getLogger('tests')
-
-    for filename in os.listdir(TESTS_DIR):
-        log.info("")
-        log.info("Running tests from %s", filename)
-        path = os.path.join(TESTS_DIR, filename)
-        with open(path, 'rt') as fdesc:
-            reader = csv.DictReader(fdesc)
-            for test in reader:
-                log.info("Testing %s (%s)", test['url'], test['comment'])
-
-                for white in test['white'].split(':'):
-                    if not white:
-                        continue
-                    if any(DB.get_domain(white)):
-                        log.error("False positive: %s", white)
-
-                for black in test['black'].split(':'):
-                    if not black:
-                        continue
-                    if not any(DB.get_domain(black)):
-                        log.error("False negative: %s", black)
--- a/tests/false-positives.csv
+++ b/tests/false-positives.csv
@ -1,5 +1,6 @@
 url,white,black,comment
 https://support.apple.com,support.apple.com,,EdgeKey / AkamaiEdge
 https://www.pinterest.fr/,i.pinimg.com,,Cedexis
+https://www.pinterest.fr/,i.pinimg.com,,Cedexis
 https://www.tumblr.com/,66.media.tumblr.com,,ChiCDN
 https://www.skype.com/fr/,www.skype.com,,TrafficManager
--- a/tests/first-party.csv
+++ b/tests/first-party.csv
@ -5,6 +5,3 @@ https://www.discover.com/,,content.discover.com,ThreatMetrix
 https://www.mytoys.de/,,web.mytoys.de,Webtrekk
 https://www.baur.de/,,tp.baur.de,Otto Group
 https://www.liligo.com/,,compare.liligo.com,???
-https://www.boulanger.com/,,tag.boulanger.fr,TagCommander
-https://www.airfrance.fr/FR/,,tk.airfrance.fr,Wizaly
-https://www.vsgamers.es/,,marketing.net.vsgamers.es,Affex
--- a/validate_list.py
+++ b/validate_list.py
@ -1,35 +0,0 @@
-#!/usr/bin/env python3
-# pylint: disable=C0103
-
-"""
-Filter out invalid domain names
-"""
-
-import database
-import argparse
-import sys
-
-if __name__ == '__main__':
-
-    # Parsing arguments
-    parser = argparse.ArgumentParser(
-        description="Filter out invalid domain name/ip addresses from a list.")
-    parser.add_argument(
-        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
-        help="Input file, one element per line")
-    parser.add_argument(
-        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
-        help="Output file, one element per line")
-    parser.add_argument(
-        '-d', '--domain', action='store_true',
-        help="Can be domain name")
-    parser.add_argument(
-        '-4', '--ip4', action='store_true',
-        help="Can be IP4")
-    args = parser.parse_args()
-
-    for line in args.input:
-        line = line.strip()
-        if (args.domain and database.Database.validate_domain(line)) or \
-                (args.ip4 and database.Database.validate_ip4address(line)):
-            print(line, file=args.output)