From dce35cb29974994b77a304c6a90a8eda9be91ad6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?=
Date: Tue, 17 Dec 2019 19:53:05 +0100
Subject: [PATCH] Harder verification before adding entries to DB

---
 database.py        | 65 ++++++++++++++++++++++++++++++++++++++++++++++
 export_lists.sh    |  2 +-
 fetch_resources.sh |  5 +++-
 3 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/database.py b/database.py
index 6e4ca3a..cddc326 100644
--- a/database.py
+++ b/database.py
@@ -10,6 +10,8 @@ import logging
 import coloredlogs
 import pickle
 
+TLD_LIST: typing.Set[str] = set()
+
 coloredlogs.install(
     level='DEBUG',
     fmt='%(asctime)s %(name)s %(levelname)s %(message)s'
@@ -200,6 +202,30 @@ class Database(Profiler):
         self.log = logging.getLogger('db')
         self.load()
 
+    @staticmethod
+    def populate_tld_list() -> None:
+        """Load the known TLDs from temp/all_tld.list into TLD_LIST."""
+        with open('temp/all_tld.list', 'r') as tld_fdesc:
+            for tld in tld_fdesc:
+                tld = tld.strip()
+                TLD_LIST.add(tld)
+
+    @staticmethod
+    def validate_domain(path: str) -> bool:
+        """Check the total length, the TLD and the size of each label."""
+        if len(path) > 255:
+            return False
+        splits = path.split('.')
+        if not TLD_LIST:
+            Database.populate_tld_list()
+        # The TLD is the last label of the name, not the first one
+        if splits[-1] not in TLD_LIST:
+            return False
+        for split in splits:
+            if not 1 <= len(split) <= 63:
+                return False
+        return True
+
     @staticmethod
     def pack_domain(domain: str) -> DomainPath:
         return DomainPath(domain.split('.')[::-1])
@@ -219,6 +245,20 @@ class Database(Profiler):
     def unpack_asn(asn: AsnPath) -> str:
         return f'AS{asn.asn}'
 
+    @staticmethod
+    def validate_ip4address(path: str) -> bool:
+        """Check that path is four dot-separated integers in 0..255."""
+        splits = path.split('.')
+        if len(splits) != 4:
+            return False
+        for split in splits:
+            try:
+                if not 0 <= int(split) <= 255:
+                    return False
+            except ValueError:
+                return False
+        return True
+
     @staticmethod
     def pack_ip4address(address: str) -> Ip4Path:
         addr = 0
@@ -237,6 +277,22 @@ class Database(Profiler):
             addr >>= 8
         return '.'.join(map(str, octets))
 
+    @staticmethod
+    def validate_ip4network(path: str) -> bool:
+        """Check that path is 'address/prefixlen' with prefixlen in 0..32."""
+        # A bit generous but ok for our usage
+        splits = path.split('/')
+        if len(splits) != 2:
+            return False
+        if not Database.validate_ip4address(splits[0]):
+            return False
+        try:
+            if not 0 <= int(splits[1]) <= 32:
+                return False
+        except ValueError:
+            return False
+        return True
+
     @staticmethod
     def pack_ip4network(network: str) -> Ip4Path:
         address, prefixlen_str = network.split('/')
@@ -549,6 +605,9 @@ class Database(Profiler):
                     domain_str: str,
                     updated: int,
                     source: Path) -> None:
+        self.enter_step('set_domain_val')
+        if not Database.validate_domain(domain_str):
+            raise ValueError(f"Invalid domain: {domain_str}")
         self.enter_step('set_domain_pack')
         domain = self.pack_domain(domain_str)
         self.enter_step('set_domain_fp')
@@ -636,6 +695,9 @@ class Database(Profiler):
                        ip4address_str: str,
                        *args: typing.Any, **kwargs: typing.Any
                        ) -> None:
+        self.enter_step('set_ip4add_val')
+        if not Database.validate_ip4address(ip4address_str):
+            raise ValueError(f"Invalid ip4address: {ip4address_str}")
         self.enter_step('set_ip4add_pack')
         ip4 = self.pack_ip4address(ip4address_str)
         self._set_ip4(ip4, *args, **kwargs)
@@ -644,6 +706,9 @@ class Database(Profiler):
                        ip4network_str: str,
                        *args: typing.Any, **kwargs: typing.Any
                        ) -> None:
+        self.enter_step('set_ip4net_val')
+        if not Database.validate_ip4network(ip4network_str):
+            raise ValueError(f"Invalid ip4network: {ip4network_str}")
         self.enter_step('set_ip4net_pack')
         ip4 = self.pack_ip4network(ip4network_str)
         self._set_ip4(ip4, *args, **kwargs)
diff --git a/export_lists.sh b/export_lists.sh
index 7ef8156..1070865 100755
--- a/export_lists.sh
+++ b/export_lists.sh
@@ -8,7 +8,7 @@ log "Exporting lists…"
 ./export.py --first-party --output dist/firstparty-trackers.txt
 ./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt
 ./export.py --output dist/multiparty-trackers.txt
-./export.py --end-chain --output --no-dupplicates dist/multiparty-only-trackers.txt
+./export.py --end-chain --no-dupplicates --output dist/multiparty-only-trackers.txt
 
 log "Generating statistics…"
 ./export.py --count --first-party > temp/count_recs_firstparty.txt
diff --git a/fetch_resources.sh b/fetch_resources.sh
index 00d131f..f4c95b0 100755
--- a/fetch_resources.sh
+++ b/fetch_resources.sh
@@ -30,6 +30,10 @@ dl https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hos
 # dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV.txt rules_hosts/smart-tv.cache.txt
 # dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/AmazonFireTV.txt rules_hosts/amazon-fire-tv.cache.txt
 
+log "Retrieving TLD list…"
+dl http://data.iana.org/TLD/tlds-alpha-by-domain.txt temp/all_tld.temp.list
+grep -v '^#' temp/all_tld.temp.list | awk '{print tolower($0)}' > temp/all_tld.list
+
 log "Retrieving nameservers…"
 rm -f nameservers
 touch nameservers
@@ -51,4 +55,3 @@ then
 else
     mv temp/cisco-umbrella_popularity.fresh.list subdomains/cisco-umbrella_popularity.cache.list
 fi
-dl https://www.orwell1984.today/cname/eulerian.net.txt subdomains/orwell-eulerian-cname-list.cache.list