Harder verficiation before adding entries to DB

newworkflow
Geoffrey Frogeye 2019-12-17 19:53:05 +01:00
parent 747fe46ad0
commit dce35cb299
Signed by: geoffrey
GPG Key ID: D8A7ECA00A8CD3DD
3 changed files with 65 additions and 2 deletions

View File

@ -10,6 +10,8 @@ import logging
import coloredlogs
import pickle
TLD_LIST: typing.Set[str] = set()
coloredlogs.install(
level='DEBUG',
fmt='%(asctime)s %(name)s %(levelname)s %(message)s'
@ -200,6 +202,27 @@ class Database(Profiler):
self.log = logging.getLogger('db')
self.load()
@staticmethod
def populate_tld_list() -> None:
with open('temp/all_tld.list', 'r') as tld_fdesc:
for tld in tld_fdesc:
tld = tld.strip()
TLD_LIST.add(tld)
@staticmethod
def validate_domain(path: str) -> bool:
if len(path) > 255:
return False
splits = path.split('.')
if not TLD_LIST:
Database.populate_tld_list()
if splits[0] not in TLD_LIST:
return False
for split in splits:
if not 1 <= len(split) <= 63:
return False
return True
@staticmethod
def pack_domain(domain: str) -> DomainPath:
return DomainPath(domain.split('.')[::-1])
@ -219,6 +242,19 @@ class Database(Profiler):
def unpack_asn(asn: AsnPath) -> str:
return f'AS{asn.asn}'
@staticmethod
def validate_ip4address(path: str) -> bool:
splits = path.split('.')
if len(splits) != 4:
return False
for split in splits:
try:
if not 0 <= int(split) <= 255:
return False
except ValueError:
return False
return True
@staticmethod
def pack_ip4address(address: str) -> Ip4Path:
addr = 0
@ -237,6 +273,21 @@ class Database(Profiler):
addr >>= 8
return '.'.join(map(str, octets))
@staticmethod
def validate_ip4network(path: str) -> bool:
# A bit generous but ok for our usage
splits = path.split('/')
if len(splits) != 2:
return False
if not Database.validate_ip4address(splits[0]):
return False
try:
if not 0 <= int(splits[1]) <= 32:
return False
except ValueError:
return False
return True
@staticmethod
def pack_ip4network(network: str) -> Ip4Path:
address, prefixlen_str = network.split('/')
@ -549,6 +600,9 @@ class Database(Profiler):
domain_str: str,
updated: int,
source: Path) -> None:
self.enter_step('set_domain_val')
if not Database.validate_domain(domain_str):
raise ValueError(f"Invalid domain: {domain_str}")
self.enter_step('set_domain_pack')
domain = self.pack_domain(domain_str)
self.enter_step('set_domain_fp')
@ -636,6 +690,9 @@ class Database(Profiler):
ip4address_str: str,
*args: typing.Any, **kwargs: typing.Any
) -> None:
self.enter_step('set_ip4add_val')
if not Database.validate_ip4address(ip4address_str):
raise ValueError(f"Invalid ip4address: {ip4address_str}")
self.enter_step('set_ip4add_pack')
ip4 = self.pack_ip4address(ip4address_str)
self._set_ip4(ip4, *args, **kwargs)
@ -644,6 +701,9 @@ class Database(Profiler):
ip4network_str: str,
*args: typing.Any, **kwargs: typing.Any
) -> None:
self.enter_step('set_ip4net_val')
if not Database.validate_ip4network(ip4network_str):
raise ValueError(f"Invalid ip4network: {ip4network_str}")
self.enter_step('set_ip4net_pack')
ip4 = self.pack_ip4network(ip4network_str)
self._set_ip4(ip4, *args, **kwargs)

View File

@ -8,7 +8,7 @@ log "Exporting lists…"
./export.py --first-party --output dist/firstparty-trackers.txt
./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt
./export.py --output dist/multiparty-trackers.txt
./export.py --end-chain --output --no-dupplicates dist/multiparty-only-trackers.txt
./export.py --end-chain --no-dupplicates --output dist/multiparty-only-trackers.txt
log "Generating statistics…"
./export.py --count --first-party > temp/count_recs_firstparty.txt

View File

@ -30,6 +30,10 @@ dl https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hos
# dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV.txt rules_hosts/smart-tv.cache.txt
# dl https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/AmazonFireTV.txt rules_hosts/amazon-fire-tv.cache.txt
log "Retrieving TLD list…"
dl http://data.iana.org/TLD/tlds-alpha-by-domain.txt temp/all_tld.temp.list
grep -v '^#' temp/all_tld.temp.list | awk '{print tolower($0)}' > temp/all_tld.list
log "Retrieving nameservers…"
rm -f nameservers
touch nameservers
@ -51,4 +55,3 @@ then
else
mv temp/cisco-umbrella_popularity.fresh.list subdomains/cisco-umbrella_popularity.cache.list
fi
dl https://www.orwell1984.today/cname/eulerian.net.txt subdomains/orwell-eulerian-cname-list.cache.list