Initial commit

2019-11-10 18:14:25 +01:00 · 2019-11-10 18:14:25 +01:00 · 80b23e2d5c
commit 80b23e2d5c
7 changed files with 222 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
+*.list
+!websites.list
+*.log
--- a/README.md
+++ b/README.md
@ -0,0 +1,54 @@
+# eulaurarien
+
+Generates a host list of first-party trackers for ad-blocking.
+
+**DISCLAIMER:** I'm by no means an expert on this subject, so my vocabulary or other details might be wrong. Use at your own risk.
+
+## What's a first-party tracker?
+
+Traditionally, websites load tracker scripts directly.
+For example, `website1.com` and `website2.com` both load `https://trackercompany.com/trackerscript.js` to track their users.
+In order to block those, one can simply block the host `trackercompany.com`.
+
+However, to circumvent this easy block, tracker companies made the website using them load trackers from `somethingirelevant.website1.com`.
+The latter is a DNS redirection to `website1.trackercompany.com`, which directly points to a server serving the tracking script.
+Those are the first-party trackers.
+
+Blocking `trackercompany.com` doesn't work any more, and blocking `*.trackercompany.com` isn't really possible since:
+
+1. Most ad-blockers don't support wildcards
+2. It's a DNS redirection, meaning that most ad-blockers will only see `somethingirelevant.website1.com`
+
+So the only solution is to block every known `somethingirelevant.website1.com`-like subdomain, which is a lot.
+That's where this script comes in: it generates a list of such subdomains.
+
+## How does this script work?
+
+It takes as input a list of websites that include trackers.
+So far, this list is manually generated from the list of clients of such first-party trackers
+(later we should use a general list of websites to be more exhaustive).
+
+It opens each of those websites (just the homepage) in a web browser, and records the domains of the network requests the page makes.
+It then finds the DNS redirections of those domains, and compares them with regexes of known tracking domains.
+It finally outputs the matching ones.
+
+## Requirements
+
+These are only needed to build the list yourself; you can find an already-built list in the releases.
+
+- Bash
+- Python 3.4+
+- Firefox
+- Selenium
+- seleniumwire
+- dnspython
+
+## Contributing
+
+### Adding websites
+
+Just add them to `websites.list`.
+
+### Adding first-party trackers regex
+
+Just add them to `regexes.py`.
--- a/collect_subdomains.py
+++ b/collect_subdomains.py
@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+
+"""
+From a list of URLs, output the subdomains
+accessed by the websites.
+"""
+
+import sys
+import typing
+import urllib.parse
+
+import selenium.webdriver.firefox.options
+import seleniumwire.webdriver
+
+
+def subdomain_from_url(url: str) -> str:
+    """
+    Extract the domain part from an url.
+    """
+    parsed = urllib.parse.urlparse(url)
+    return parsed.netloc
+
+
+def collect_subdomains(url: str) -> typing.Iterable[str]:
+    """
+    Load an URL into an headless browser and return all the domains
+    it tried to access.
+    """
+    options = selenium.webdriver.firefox.options.Options()
+    options.add_argument('-headless')
+    driver = seleniumwire.webdriver.Firefox(
+        executable_path='geckodriver', options=options)
+
+    driver.get(url)
+    for request in driver.requests:
+        if request.response:
+            yield subdomain_from_url(request.path)
+    driver.close()
+
+
+if __name__ == '__main__':
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+        for subdomain in collect_subdomains(line):
+            print(subdomain)
--- a/eulaurarien.sh
+++ b/eulaurarien.sh
@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Main script for eulaurarien
+
+# Get all subdomains accessed by each website in the website list
+./collect_subdomains.py < websites.list > subdomains.list
+sort -u subdomains.list > subdomains.sorted.list
+
+# Filter out the subdomains not pointing to a first-party tracker
+./filter_subdomains.py < subdomains.sorted.list > toblock.list
+sort -u toblock.list > toblock.sorted.list
+
+# Format the blocklist so it can be used as a hostlist
+
+(
+    echo "# First party trackers"
+    # --always falls back to the abbreviated commit hash when no tag exists
+    echo "# List generated on $(date -Isec) by eulaurarien $(git describe --tags --dirty --always)"
+    # -r keeps backslashes literal; the list is read directly, without cat
+    while read -r host
+    do
+        echo "0.0.0.0 $host"
+    done < toblock.sorted.list
+) > toblock.hosts.list
--- a/filter_subdomains.py
+++ b/filter_subdomains.py
@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+"""
+From a list of subdomains, output only
+the ones resolving to a first-party tracker.
+"""
+
+import re
+import sys
+
+import dns.resolver
+
+import regexes
+
+
+def is_subdomain_matching(subdomain: str) -> bool:
+    """
+    Indicates if the subdomain redirects to a first-party tracker.
+    """
+    # TODO Look at the whole chain rather than the last one
+    query = dns.resolver.query(subdomain, 'A')
+    canonical = query.canonical_name.to_text()
+    for regex in regexes.REGEXES:
+        if re.match(regex, canonical):
+            return True
+    return False
+
+
+if __name__ == '__main__':
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+        if is_subdomain_matching(line):
+            print(line)
--- a/regexes.py
+++ b/regexes.py
@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+
+"""
+List of regex matching first-party trackers.
+"""
+
+# Patterns are applied with re.match to the textual canonical name of a
+# DNS answer (see filter_subdomains.py). Each pattern requires a trailing
+# dot, presumably because that name is given in its absolute form.
+REGEXES = [
+    r'^.+\.eulerian\.net\.$'
+]
--- a/websites.list
+++ b/websites.list
@ -0,0 +1,52 @@
+https://oui.sncf/
+https://www.voyage-prive.com/
+https://www.odalys-vacances.com/
+https://www.homair.com/
+https://www.melia.com/
+https://www.locasun.fr/
+https://www.belambra.fr/
+http://www.xl.com/
+https://www.bordeaux.aeroport.fr/
+https://www.easyvoyage.com/
+https://www.leon-de-bruxelles.fr/
+https://www.sarenza.com/
+https://www.laredoute.fr/
+https://www.galerieslafayette.com/
+https://www.celio.com/
+https://vente-unique.com/
+https://www.francoisesaget.com/
+https://www.histoiredor.com/
+https://www.brandalley.fr/
+https://www.fleurancenature.fr/
+https://www.chausport.com/
+https://www.i-run.fr/
+https://fr.smallable.com/
+https://www.habitat.fr/
+https://www.bhv.fr/
+https://www.sfr.fr/
+https://www.red-by-sfr.fr/
+https://www.masmovil.es/
+https://www.yoigo.com/
+http://www.fnacdarty.com/
+https://www.fnac.com/
+https://www.darty.com/
+http://www.e-leclerc.com/
+https://www.monoprix.fr/
+https://www.officedepot.fr/
+https://www.carrefour-banque.fr/
+https://www.banque-casino.fr/
+https://mondial-assistance.fr/
+https://allianz-voyage.fr/
+https://www.bankia.com/
+https://www.april-moto.com/
+https://www.younited-credit.com/
+https://www.fortuneo.fr/
+https://www.orpi.com/
+https://www.warnerbros.fr/
+https://www.canalplus.com/
+https://www.skiset.com/
+https://www.promofarma.com/
+https://www.toner.fr/
+https://www.rentacar.fr/
+https://vivatechnology.com/
+https://www.liberation.fr/