commit 80b23e2d5c4c3a7b2efb1cb0dfc1e7efe11c862a Author: Geoffrey “Frogeye” Preud'homme Date: Sun Nov 10 18:14:25 2019 +0100 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2a2eceb --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.list +!websites.list +*.log diff --git a/README.md b/README.md new file mode 100644 index 0000000..4bbc83d --- /dev/null +++ b/README.md @@ -0,0 +1,54 @@ +# eulaurarien + +Generates a host list of first-party trackers for ad-blocking. + +**DISCLAIMER:** I'm by no means an expert on this subject so my vocabulary or other stuff might be wrong. Use at your own risk. + +## What's a first-party tracker? + +Traditionally, websites load tracker scripts directly. +For example, `website1.com` and `website2.com` both load `https://trackercompany.com/trackerscript.js` to track their users. +In order to block those, one can simply block the host `trackercompany.com`. + +However, to circumvent this easy block, tracker companies made the websites using them load trackers from `somethingirrelevant.website1.com`. +The latter is a DNS redirection to `website1.trackercompany.com`, directly pointing to a server serving the tracking script. +Those are the first-party trackers. + +Blocking `trackercompany.com` doesn't work any more, and blocking `*.trackercompany.com` isn't really possible since: + +1. Most ad-blockers don't support wildcards +2. It's a DNS redirection, meaning that most ad-blockers will only see `somethingirrelevant.website1.com` + +So the only solution is to block every known `somethingirrelevant.website1.com`-like subdomain, which is a lot. +That's where this script comes in, to generate a list of such subdomains. + +## How does this script work + +It takes as input a list of websites with trackers included. +So far, this list is manually-generated from the list of clients of such first-party trackers +(later we should use a general list of websites to be more exhaustive). 
+ +It opens each one of those websites (just the homepage) in a web browser, and records the domains of the network requests the page makes. +It then finds the DNS redirections of those domains, and compares them with regexes of known tracking domains. +It finally outputs the matching ones. + +## Requirements + +These are only needed to build the list yourself; you can find an already-built list in the releases. + +- Bash +- Python 3.4+ +- Firefox +- Selenium +- seleniumwire +- dnspython + +## Contributing + +### Adding websites + +Just add them to `websites.list`. + +### Adding first-party tracker regexes + +Just add them to `regexes.py`. diff --git a/collect_subdomains.py b/collect_subdomains.py new file mode 100755 index 0000000..4177812 --- /dev/null +++ b/collect_subdomains.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 + +""" +From a list of URLs, output the subdomains +accessed by the websites. +""" + +import sys +import typing +import urllib.parse + +import selenium.webdriver.firefox.options +import seleniumwire.webdriver + + +def subdomain_from_url(url: str) -> str: + """ + Extract the domain part from an url. + """ + parsed = urllib.parse.urlparse(url) + return parsed.netloc + + +def collect_subdomains(url: str) -> typing.Iterable[str]: + """ + Load an URL into an headless browser and return all the domains + it tried to access. 
+ """ + options = selenium.webdriver.firefox.options.Options() + options.add_argument('-headless') + driver = seleniumwire.webdriver.Firefox( + executable_path='geckodriver', options=options) + + driver.get(url) + for request in driver.requests: + if request.response: + yield subdomain_from_url(request.path) + driver.close() + + +if __name__ == '__main__': + for line in sys.stdin: + line = line.strip() + if not line: + continue + for subdomain in collect_subdomains(line): + print(subdomain) diff --git a/eulaurarien.sh b/eulaurarien.sh new file mode 100755 index 0000000..d2e8f7a --- /dev/null +++ b/eulaurarien.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# Main script for eulaurarien + +# Get all subdomains accessed by each website in the website list +cat websites.list | ./collect_subdomains.py > subdomains.list +sort -u subdomains.list > subdomains.sorted.list + +# Filter out the subdomains not pointing to a first-party tracker +cat subdomains.sorted.list | ./filter_subdomains.py > toblock.list +sort -u toblock.list > toblock.sorted.list + +# Format the blocklist so it can be used as a hostlist + +( + echo "# First party trackers" + echo "# List generated on $(date -Isec) by eulaurarian $(git describe --tags --dirty)" + cat toblock.sorted.list | while read host; + do + echo "0.0.0.0 $host" + done +) > toblock.hosts.list diff --git a/filter_subdomains.py b/filter_subdomains.py new file mode 100755 index 0000000..fd3a590 --- /dev/null +++ b/filter_subdomains.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +""" +From a list of subdomains, output only +the ones resolving to a first-party tracker. +""" + +import re +import sys + +import dns.resolver + +import regexes + + +def is_subdomain_matching(subdomain: str) -> bool: + """ + Indicates if the subdomain redirects to a first-party tracker. 
+ """ + # TODO Look at the whole chain rather than the last one + query = dns.resolver.query(subdomain, 'A') + canonical = query.canonical_name.to_text() + for regex in regexes.REGEXES: + if re.match(regex, canonical): + return True + return False + + +if __name__ == '__main__': + for line in sys.stdin: + line = line.strip() + if not line: + continue + if is_subdomain_matching(line): + print(line) diff --git a/regexes.py b/regexes.py new file mode 100644 index 0000000..bd99f7a --- /dev/null +++ b/regexes.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 + +""" +List of regex matching first-party trackers. +""" + +REGEXES = [ + r'^.+\.eulerian\.net\.$' +] diff --git a/websites.list b/websites.list new file mode 100644 index 0000000..fca2ebd --- /dev/null +++ b/websites.list @@ -0,0 +1,52 @@ +https://oui.sncf/ +https://www.voyage-prive.com/ +https://www.odalys-vacances.com/ +https://www.homair.com/ +https://www.melia.com/ +https://www.locasun.fr/ +https://www.belambra.fr/ +http://www.xl.com/ +https://www.bordeaux.aeroport.fr/ +https://www.easyvoyage.com/ +https://www.leon-de-bruxelles.fr/ +https://www.sarenza.com/ +https://www.laredoute.fr/ +https://www.galerieslafayette.com/ +https://www.celio.com/ +https://vente-unique.com/ +https://www.francoisesaget.com/ +https://www.histoiredor.com/ +https://www.brandalley.fr/ +https://www.fleurancenature.fr/ +https://www.chausport.com/ +https://www.i-run.fr/ +https://fr.smallable.com/ +https://www.habitat.fr/ +https://www.bhv.fr/ +https://www.sfr.fr/ +https://www.red-by-sfr.fr/ +https://www.masmovil.es/ +https://www.yoigo.com/ +http://www.fnacdarty.com/ +https://www.fnac.com/ +https://www.darty.com/ +http://www.e-leclerc.com/ +https://www.monoprix.fr/ +https://www.officedepot.fr/ +https://www.carrefour-banque.fr/ +https://www.banque-casino.fr/ +https://mondial-assistance.fr/ +https://allianz-voyage.fr/ +https://www.bankia.com/ +https://www.april-moto.com/ +https://www.younited-credit.com/ +https://www.fortuneo.fr/ 
+https://www.orpi.com/ +https://www.warnerbros.fr/ +https://www.canalplus.com/ +https://www.skiset.com/ +https://www.promofarma.com/ +https://www.toner.fr/ +https://www.rentacar.fr/ +https://vivatechnology.com/ +https://www.liberation.fr/