Browse Source

Added progressbar and ETA

newworkflow_parseropti
Geoffrey Frogeye 2 years ago
parent
commit
2f1af3c850
  1. 1
      README.md
  2. 30
      collect_subdomains.py
  3. 4
      eulaurarien.sh
  4. 38
      filter_subdomains.py

1
README.md

@ -42,6 +42,7 @@ Just to build the list, you can find an already-built list in the releases.
- Selenium
- seleniumwire
- dnspython
- [progressbar2](https://pypi.org/project/progressbar2/)
And then just run `eulaurarien.sh`.

30
collect_subdomains.py

@ -1,4 +1,5 @@
#!/usr/bin/env python3
# pylint: disable=C0103
"""
From a list of URLs, output the subdomains
@ -9,6 +10,7 @@ import sys
import typing
import urllib.parse
import progressbar
import selenium.webdriver.firefox.options
import seleniumwire.webdriver
@ -38,10 +40,26 @@ def collect_subdomains(url: str) -> typing.Iterable[str]:
driver.close()
def collect_subdomains_standalone(url: str) -> None:
    """
    Print each subdomain yielded by collect_subdomains for one URL.

    Surrounding whitespace is stripped first; blank input is ignored.
    """
    target = url.strip()
    if target:
        for subdomain in collect_subdomains(target):
            print(subdomain)
if __name__ == '__main__':
for line in sys.stdin:
line = line.strip()
if not line:
continue
for subdomain in collect_subdomains(line):
print(subdomain)
# Accept at most one argument: a file of URLs, or '-' / nothing for stdin.
assert len(sys.argv) <= 2
filename = None
if len(sys.argv) == 2 and sys.argv[1] != '-':
    filename = sys.argv[1]
if filename:
    # First pass: count the lines so the progress bar knows its maximum
    # (needed to display a percentage and an ETA).
    with open(filename) as source:
        num_lines = sum(1 for _ in source)
    # Second pass: process the lines. The `with` block closes the file
    # itself; closing only the progressbar wrapper does not explicitly
    # close the file object handed to it.
    with open(filename) as source:
        for line in progressbar.progressbar(source, max_value=num_lines):
            collect_subdomains_standalone(line)
else:
    # Input of unknown length: read stdin without a progress bar.
    for line in sys.stdin:
        collect_subdomains_standalone(line)

4
eulaurarien.sh

@ -3,11 +3,11 @@
# Main script for eulaurarien
# Get all subdomains accessed by each website in the website list
cat websites.list | ./collect_subdomains.py > subdomains.list
./collect_subdomains.py websites.list > subdomains.list
sort -u subdomains.list > subdomains.sorted.list
# Filter out the subdomains not pointing to a first-party tracker
cat subdomains.sorted.list | ./filter_subdomains.py > toblock.list
./filter_subdomains.py subdomains.sorted.list > toblock.list
sort -u toblock.list > toblock.sorted.list
# Format the blocklist so it can be used as a hostlist

38
filter_subdomains.py

@ -1,4 +1,5 @@
#!/usr/bin/env python3
# pylint: disable=C0103
"""
From a list of subdomains, output only
@ -9,6 +10,7 @@ import re
import sys
import dns.resolver
import progressbar
import regexes
@ -18,7 +20,10 @@ def is_subdomain_matching(subdomain: str) -> bool:
Indicates if the subdomain redirects to a first-party tracker.
"""
# TODO Look at the whole chain rather than the last one
query = dns.resolver.query(subdomain, 'A')
try:
query = dns.resolver.query(subdomain, 'A')
except dns.resolver.NXDOMAIN:
return False
canonical = query.canonical_name.to_text()
for regex in regexes.REGEXES:
if re.match(regex, canonical):
@ -26,10 +31,29 @@ def is_subdomain_matching(subdomain: str) -> bool:
return False
def is_subdomain_matching_standalone(subdomain: str) -> None:
    """
    Echo the subdomain to standard output when it redirects
    to a first-party tracker.

    Surrounding whitespace is stripped first; blank input is ignored.
    """
    candidate = subdomain.strip()
    if candidate and is_subdomain_matching(candidate):
        print(candidate)
if __name__ == '__main__':
for line in sys.stdin:
line = line.strip()
if not line:
continue
if is_subdomain_matching(line):
print(line)
# Accept at most one argument: a file of subdomains, or '-' / nothing
# for stdin.
assert len(sys.argv) <= 2
filename = None
if len(sys.argv) == 2 and sys.argv[1] != '-':
    filename = sys.argv[1]
if filename:
    # First pass: count the lines so the progress bar knows its maximum
    # (needed to display a percentage and an ETA).
    with open(filename) as source:
        num_lines = sum(1 for _ in source)
    # Second pass: process the lines. The `with` block closes the file
    # itself; closing only the progressbar wrapper does not explicitly
    # close the file object handed to it.
    with open(filename) as source:
        for line in progressbar.progressbar(source, max_value=num_lines):
            is_subdomain_matching_standalone(line)
else:
    # Input of unknown length: read stdin without a progress bar.
    for line in sys.stdin:
        is_subdomain_matching_standalone(line)
Loading…
Cancel
Save