diff --git a/.gitignore b/.gitignore index 2a2eceb..397b4a7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1 @@ -*.list -!websites.list *.log diff --git a/README.md b/README.md index d6e0a36..d5f4826 100644 --- a/README.md +++ b/README.md @@ -27,8 +27,10 @@ That's where this scripts comes in, to generate a list of such subdomains. It takes an input a list of websites with trackers included. So far, this list is manually-generated from the list of clients of such first-party trackers (latter we should use a general list of websites to be more exhaustive). - It open each ones of those websites (just the homepage) in a web browser, and record the domains of the network requests the page makes. + +Additionally, or alternatively, you can feed the script some browsing history and get domains from there. + It then find the DNS redirections of those domains, and compare with regexes of known tracking domains. It finally outputs the matching ones. @@ -38,19 +40,43 @@ Just to build the list, you can find an already-built list in the releases. - Bash - Python 3.4+ +- [progressbar2](https://pypi.org/project/progressbar2/) +- dnspython + +(if you don't want to collect the subdomains, you can skip the following) + - Firefox - Selenium - seleniumwire -- dnspython -- [progressbar2](https://pypi.org/project/progressbar2/) -And then just run `eulaurarien.sh`. +## Usage + +### Add personal sources + +The list of websites provided in this script is by no means exhaustive, +so adding your own browsing history will help create a better list. 
+Here are reference commands for possible sources: + +- **Pi-hole**: `sqlite3 /etc/pihole-FTL.db "select distinct domain from queries" > /path/to/eulaurarien/subdomains/my-pihole.custom.list` +- **Firefox**: `cp ~/.mozilla/firefox/<your_profile>.default/places.sqlite temp; sqlite3 temp "select distinct rev_host from moz_places" | rev | sed 's|^\.||' > /path/to/eulaurarien/subdomains/my-firefox.custom.list` + +### Collect subdomains from websites + +This step is optional if you already added personal sources. +Just run `collect_subdomains.sh`. +This is a long step, and might be memory-intensive from time to time. + +### Extract tracking domains + +Make sure your system is configured with a DNS server without limitation. +Then, run `filter_subdomains.sh`. +The files you need will be in the folder `dist`. ## Contributing ### Adding websites -Just add them to `websites.list`. +Just add the URL to the relevant list: `websites/<source>.list`. ### Adding first-party trackers regex diff --git a/collect_subdomains.sh b/collect_subdomains.sh new file mode 100755 index 0000000..bd4741a --- /dev/null +++ b/collect_subdomains.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# Get all subdomains accessed by each website in the website list + +cat websites/*.list | sort -u > temp/all_websites.list +./collect_subdomains.py temp/all_websites.list > temp/subdomains_from_websites.list +sort -u temp/subdomains_from_websites.list > subdomains/from_websites.cache.list diff --git a/dist/.gitignore b/dist/.gitignore new file mode 100644 index 0000000..2211df6 --- /dev/null +++ b/dist/.gitignore @@ -0,0 +1 @@ +*.txt diff --git a/eulaurarien.sh b/eulaurarien.sh index d6e3a0e..88622f5 100755 --- a/eulaurarien.sh +++ b/eulaurarien.sh @@ -2,21 +2,6 @@ # Main script for eulaurarien -# Get all subdomains accessed by each website in the website list -./collect_subdomains.py websites.list > subdomains.list -sort -u subdomains.list > subdomains.sorted.list +./collect_subdomains.sh +./filter_subdomains.sh -# Filter out the 
subdomains not pointing to a first-party tracker -./filter_subdomains.py subdomains.sorted.list > toblock.list -sort -u toblock.list > toblock.sorted.list - -# Format the blocklist so it can be used as a hostlist - -( - echo "# First party trackers" - echo "# List generated on $(date -Isec) by eulaurarien $(git describe --tags --dirty)" - cat toblock.sorted.list | while read host; - do - echo "0.0.0.0 $host" - done -) > toblock.hosts.list diff --git a/filter_subdomains.sh b/filter_subdomains.sh new file mode 100755 index 0000000..139af0b --- /dev/null +++ b/filter_subdomains.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# Filter out the subdomains not pointing to a first-party tracker + +cat subdomains/*.list | sort -u > temp/all_subdomains.list +./filter_subdomains.py temp/all_subdomains.list > temp/all_toblock.list +sort -u temp/all_toblock.list > dist/firstparty-trackers.txt + +# Format the blocklist so it can be used as a hostlist + +( + echo "# First-party trackers" + echo "# List generated on $(date -Isec) by eulaurarien $(git describe --tags --dirty)" + cat dist/firstparty-trackers.txt | while read host; + do + echo "0.0.0.0 $host" + done +) > dist/firstparty-trackers-hosts.txt diff --git a/regexes.py b/regexes.py index cb46a8f..11fff4b 100644 --- a/regexes.py +++ b/regexes.py @@ -4,6 +4,8 @@ List of regex matching first-party trackers. 
""" +# Syntax: https://docs.python.org/3/library/re.html#regular-expression-syntax + REGEXES = [ r'^.+\.eulerian\.net\.$', r'^.+\.criteo\.com\.$', diff --git a/subdomains/.gitignore b/subdomains/.gitignore new file mode 100644 index 0000000..dbd03bc --- /dev/null +++ b/subdomains/.gitignore @@ -0,0 +1,2 @@ +*.custom.list +*.cache.list diff --git a/temp/.gitignore b/temp/.gitignore new file mode 100644 index 0000000..b31be08 --- /dev/null +++ b/temp/.gitignore @@ -0,0 +1 @@ +*.list diff --git a/websites/.gitignore b/websites/.gitignore new file mode 100644 index 0000000..9f206f1 --- /dev/null +++ b/websites/.gitignore @@ -0,0 +1 @@ +*.custom.list diff --git a/websites.list b/websites/eulerian_clients.list similarity index 100% rename from websites.list rename to websites/eulerian_clients.list