Automatically get top 1M subdomains

newworkflow_parseropti
Geoffrey Frogeye 2019-11-14 11:23:59 +01:00
parent 7df00fc859
commit 04fe454d99
4 changed files with 17 additions and 9 deletions

View File

@ -2,6 +2,7 @@
# Main script for eulaurarien
# Runs the pipeline stages one after another; each stage is its own script.
# Stage 1: download the external inputs (nameserver list, top 1M subdomains).
./fetch_resources.sh
# Stage 2: NOTE(review): presumably gathers the candidate subdomain lists;
# the script is not visible here, confirm against collect_subdomains.sh.
./collect_subdomains.sh
# Stage 3: NOTE(review): presumably keeps only the subdomains pointing to a
# first-party tracker; confirm against filter_subdomains.sh.
./filter_subdomains.sh

16
fetch_resources.sh Executable file
View File

@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Fetch the external resources used by the rest of the pipeline:
#   - nameservers: optional local nameservers.head followed by 64 randomly
#     picked public resolvers
#   - subdomains/cisco-umbrella_popularity.cache.list: the Cisco Umbrella
#     top 1M list with the leading rank column stripped
#
# Abort on the first failing command or unset variable so a broken download
# never leaves half-built resources that later stages would silently consume.
# (pipefail is deliberately NOT enabled: `head` closing the pipe early makes
# `sort` exit on SIGPIPE, which would be reported as a failure.)
set -eu

# Get a list of nameservers
rm -f nameservers
touch nameservers
# Hand-picked servers, when provided, always come first in the list.
if [ -f nameservers.head ]
then
    cat nameservers.head >> nameservers
fi
# Download to a temporary file first: with --fail, an HTTP error aborts the
# script instead of feeding an error page into the nameserver list.
public_nameservers="$(mktemp)"
trap 'rm -f "$public_nameservers"' EXIT
curl --fail --location --output "$public_nameservers" https://public-dns.info/nameservers.txt
# Keep a random sample of 64 public resolvers.
sort -R "$public_nameservers" | head -64 >> nameservers

# Get top 1M subdomains
# The output directory may not exist on a fresh checkout.
mkdir -p subdomains
# Force the output name: without it, wget saves to top-1m.csv.zip.1 when a
# stale archive is lying around, and the stale one would get unzipped.
# NOTE(review): this is fetched over plain HTTP; consider switching to HTTPS
# once confirmed that the endpoint serves it.
wget --output-document=top-1m.csv.zip http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip
# -o: overwrite a leftover top-1m.csv instead of prompting interactively.
unzip -o top-1m.csv.zip
# Each CSV line is "<rank>,<domain>": drop the rank, keep the domain.
sed 's|^[0-9]\+,||' top-1m.csv > subdomains/cisco-umbrella_popularity.cache.list
rm top-1m.csv top-1m.csv.zip

View File

@ -21,7 +21,6 @@ import progressbar
import regexes
DNS_TIMEOUT = 5.0
MAX_NAMESERVERS = 512
# TODO Retry failed requests
@ -96,7 +95,6 @@ def get_matching_subdomains(subdomains: typing.Iterable[str],
# Use internal resolver by default
servers = nameservers or dns.resolver.Resolver().nameservers
servers = servers[:MAX_NAMESERVERS]
# Create workers
for server in servers:

View File

@ -1,12 +1,5 @@
#!/usr/bin/env bash
# Get a list of nameservers
rm -f nameservers
touch nameservers
[ -f nameservers.head ] && cat nameservers.head >> nameservers
curl https://public-dns.info/nameservers.txt | sort -R >> nameservers
# Filter out the subdomains not pointing to a first-party tracker
cat subdomains/*.list | sort -u > temp/all_subdomains.list