Initial commit

2019-11-10 18:14:25 +01:00 · 2019-11-10 18:14:25 +01:00 · 80b23e2d5c
commit 80b23e2d5c
7 changed files with 222 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
 *.list
 !websites.list
 *.log
--- a/README.md
+++ b/README.md
@ -0,0 +1,54 @@
 # eulaurarien
 Generates a host list of first-party trackers for ad-blocking.
 **DISCLAIMER:** I'm by no means an expert on this subject, so my vocabulary or other details might be wrong. Use at your own risk.
 ## What's a first-party tracker?
 Traditionally, websites load tracker scripts directly.
 For example, `website1.com` and `website2.com` both load `https://trackercompany.com/trackerscript.js` to track their users.
 In order to block those, one can simply block the host `trackercompany.com`.
 However, to circumvent this easy block, tracker companies made the websites using them load trackers from `somethingirelevant.website1.com`.
 The latter is a DNS redirection to `website1.trackercompany.com`, which points directly to a server serving the tracking script.
 Those are the first-party trackers.
 Blocking `trackercompany.com` doesn't work any more, and blocking `*.trackercompany.com` isn't really possible since:
 1. Most ad-blockers don't support wildcards
 2. It's a DNS redirection, meaning that most ad-blockers will only see `somethingirelevant.website1.com`
 So the only solution is to block every known `somethingirelevant.website1.com`-like subdomain, which is a lot.
 That's where this script comes in: it generates a list of such subdomains.
 ## How does this script work
 It takes as input a list of websites known to include trackers.
 So far, this list is manually generated from the client lists of such first-party trackers
 (later, we should use a general list of websites to be more exhaustive).
 It opens each of those websites (just the homepage) in a web browser and records the domains of the network requests the page makes.
 It then finds the DNS redirections of those domains and compares them with regexes of known tracking domains.
 It finally outputs the matching ones.
 ## Requirements
 These are only needed to build the list yourself; an already-built list is available in the releases.
 - Bash
 - Python 3.4+
 - Firefox
 - Selenium
 - seleniumwire
 - dnspython
 ## Contributing
 ### Adding websites
 Just add them to `websites.list`.
 ### Adding first-party tracker regexes
 Just add them to `regexes.py`.
--- a/collect_subdomains.py
+++ b/collect_subdomains.py
@ -0,0 +1,47 @@
 #!/usr/bin/env python3
 """
 From a list of URLs, output the subdomains
 accessed by the websites.
 """
 import sys
 import typing
 import urllib.parse
 import selenium.webdriver.firefox.options
 import seleniumwire.webdriver
 def subdomain_from_url(url: str) -> str:
    """
    Extract the host name from an URL.

    Only the host itself is returned: any ``user:password@`` prefix and
    ``:port`` suffix are dropped and the name is lower-cased, so the
    result can be handed directly to a DNS resolver or written to a
    hosts file. An empty string is returned when the URL has no host.
    """
    parsed = urllib.parse.urlparse(url)
    # `netloc` would keep credentials and port; `hostname` strips them
    # but is None when there is no host, hence the fallback.
    return parsed.hostname or ''
 def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load an URL into an headless browser and yield all the domains
    it tried to access.

    Only requests for which a response was recorded are reported.
    The browser is shut down in every case, including when loading
    the page raises or the caller stops iterating early.
    """
    options = selenium.webdriver.firefox.options.Options()
    options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(
        executable_path='geckodriver', options=options)
    try:
        driver.get(url)
        for request in driver.requests:
            if request.response:
                yield subdomain_from_url(request.path)
    finally:
        # quit() ends the whole WebDriver session, whereas close() only
        # closes the current window; a new browser is started per URL,
        # so each one must be fully released.
        driver.quit()
 if __name__ == '__main__':
    # One URL per line on standard input; blank lines are ignored.
    for url in (raw.strip() for raw in sys.stdin):
        if url:
            for subdomain in collect_subdomains(url):
                print(subdomain)
--- a/eulaurarien.sh
+++ b/eulaurarien.sh
@ -0,0 +1,22 @@
 #!/usr/bin/env bash
 # Main script for eulaurarien: builds a hosts-format blocklist of
 # first-party trackers from the websites listed in websites.list.
 # Get all subdomains accessed by each website in the website list
 ./collect_subdomains.py < websites.list > subdomains.list
 sort -u subdomains.list > subdomains.sorted.list
 # Filter out the subdomains not pointing to a first-party tracker
 ./filter_subdomains.py < subdomains.sorted.list > toblock.list
 sort -u toblock.list > toblock.sorted.list
 # Format the blocklist so it can be used as a hostlist
 (
    echo "# First party trackers"
    echo "# List generated on $(date -Isec) by eulaurarien $(git describe --tags --dirty)"
    # -r and an empty IFS make read take each line verbatim
    while IFS= read -r host
    do
        echo "0.0.0.0 $host"
    done < toblock.sorted.list
 ) > toblock.hosts.list
--- a/filter_subdomains.py
+++ b/filter_subdomains.py
@ -0,0 +1,35 @@
 #!/usr/bin/env python3
 """
 From a list of subdomains, output only
 the ones resolving to a first-party tracker.
 """
 import re
 import sys
 import dns.resolver
 import regexes
 def is_subdomain_matching(subdomain: str) -> bool:
    """
    Indicates if the subdomain redirects to a first-party tracker.

    The subdomain's A record is queried and the canonical name of the
    answer is compared against every pattern in ``regexes.REGEXES``.
    Returns False when no pattern matches or when the name cannot be
    resolved (non-existent domain, no A answer, no usable nameserver,
    or timeout).
    """
    # TODO Look at the whole chain rather than the last one
    try:
        query = dns.resolver.query(subdomain, 'A')
    except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer,
            dns.resolver.NoNameservers, dns.resolver.Timeout):
        # An unresolvable name cannot be matched against the tracker
        # patterns; skip it rather than abort the whole filtering run.
        return False
    canonical = query.canonical_name.to_text()
    return any(re.match(regex, canonical) for regex in regexes.REGEXES)
 if __name__ == '__main__':
    # One subdomain per line on standard input; blank lines are ignored.
    # Matching subdomains are echoed to standard output.
    for candidate in (raw.strip() for raw in sys.stdin):
        if candidate and is_subdomain_matching(candidate):
            print(candidate)
--- a/regexes.py
+++ b/regexes.py
@ -0,0 +1,9 @@
 #!/usr/bin/env python3
 """
 List of regex matching first-party trackers.
 """
 REGEXES = [
    # Eulerian: any name under eulerian.net, written with the trailing
    # root dot (the pattern requires the final '.').
    r'^.+\.eulerian\.net\.$',
 ]
--- a/websites.list
+++ b/websites.list
@ -0,0 +1,52 @@
 https://oui.sncf/
 https://www.voyage-prive.com/
 https://www.odalys-vacances.com/
 https://www.homair.com/
 https://www.melia.com/
 https://www.locasun.fr/
 https://www.belambra.fr/
 http://www.xl.com/
 https://www.bordeaux.aeroport.fr/
 https://www.easyvoyage.com/
 https://www.leon-de-bruxelles.fr/
 https://www.sarenza.com/
 https://www.laredoute.fr/
 https://www.galerieslafayette.com/
 https://www.celio.com/
 https://vente-unique.com/
 https://www.francoisesaget.com/
 https://www.histoiredor.com/
 https://www.brandalley.fr/
 https://www.fleurancenature.fr/
 https://www.chausport.com/
 https://www.i-run.fr/
 https://fr.smallable.com/
 https://www.habitat.fr/
 https://www.bhv.fr/
 https://www.sfr.fr/
 https://www.red-by-sfr.fr/
 https://www.masmovil.es/
 https://www.yoigo.com/
 http://www.fnacdarty.com/
 https://www.fnac.com/
 https://www.darty.com/
 http://www.e-leclerc.com/
 https://www.monoprix.fr/
 https://www.officedepot.fr/
 https://www.carrefour-banque.fr/
 https://www.banque-casino.fr/
 https://mondial-assistance.fr/
 https://allianz-voyage.fr/
 https://www.bankia.com/
 https://www.april-moto.com/
 https://www.younited-credit.com/
 https://www.fortuneo.fr/
 https://www.orpi.com/
 https://www.warnerbros.fr/
 https://www.canalplus.com/
 https://www.skiset.com/
 https://www.promofarma.com/
 https://www.toner.fr/
 https://www.rentacar.fr/
 https://vivatechnology.com/
 https://www.liberation.fr/