Initial commit
This commit is contained in:
commit
80b23e2d5c
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
*.list
|
||||||
|
!websites.list
|
||||||
|
*.log
|
54
README.md
Normal file
54
README.md
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
# eulaurarien
|
||||||
|
|
||||||
|
Generates a host list of first-party trackers for ad-blocking.
|
||||||
|
|
||||||
|
**DISCLAIMER:** I'm by no means an expert on this subject, so my vocabulary or other details might be wrong. Use at your own risk.
|
||||||
|
|
||||||
|
## What's a first-party tracker?
|
||||||
|
|
||||||
|
Traditionally, websites load tracker scripts directly.
|
||||||
|
For example, `website1.com` and `website2.com` both load `https://trackercompany.com/trackerscript.js` to track their users.
|
||||||
|
In order to block those, one can simply block the host `trackercompany.com`.
|
||||||
|
|
||||||
|
However, to circumvent this easy block, tracker companies made the websites using them load trackers from a subdomain of the website itself, such as `somethingirelevant.website1.com`.
|
||||||
|
The latter is a DNS redirection to `website1.trackercompany.com`, which points directly to a server serving the tracking script.
|
||||||
|
Those are the first-party trackers.
|
||||||
|
|
||||||
|
Blocking `trackercompany.com` doesn't work any more, and blocking `*.trackercompany.com` isn't really possible since:
|
||||||
|
|
||||||
|
1. Most ad-blockers don't support wildcards
|
||||||
|
2. It's a DNS redirection, meaning that most ad-blockers will only see `somethingirelevant.website1.com`
|
||||||
|
|
||||||
|
So the only solution is to block every known `somethingirelevant.website1.com`-like subdomain, and there are a lot of them.
|
||||||
|
That's where this script comes in: it generates a list of such subdomains.
|
||||||
|
|
||||||
|
## How does this script work
|
||||||
|
|
||||||
|
It takes as input a list of websites that include trackers.
|
||||||
|
So far, this list is manually-generated from the list of clients of such first-party trackers
|
||||||
|
(later we should use a general list of websites to be more exhaustive).
|
||||||
|
|
||||||
|
It opens each of those websites (just the homepage) in a web browser, and records the domains of the network requests the page makes.
|
||||||
|
It then finds the DNS redirections of those domains, and compares them with regexes of known tracking domains.
|
||||||
|
It finally outputs the matching ones.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
These are only needed to build the list yourself; an already-built list is available in the releases.
|
||||||
|
|
||||||
|
- Bash
|
||||||
|
- Python 3.4+
|
||||||
|
- Firefox
|
||||||
|
- Selenium
|
||||||
|
- seleniumwire
|
||||||
|
- dnspython
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
### Adding websites
|
||||||
|
|
||||||
|
Just add them to `websites.list`.
|
||||||
|
|
||||||
|
### Adding first-party tracker regexes
|
||||||
|
|
||||||
|
Just add them to `regexes.py`.
|
47
collect_subdomains.py
Executable file
47
collect_subdomains.py
Executable file
|
@ -0,0 +1,47 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
From a list of URLs, output the subdomains
|
||||||
|
accessed by the websites.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import typing
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
|
import selenium.webdriver.firefox.options
|
||||||
|
import seleniumwire.webdriver
|
||||||
|
|
||||||
|
|
||||||
|
def subdomain_from_url(url: str) -> str:
    """
    Extract the domain part from an url.

    Only the bare host name is returned: any credentials (``user:pass@``)
    or port (``:8080``) present in the network location are dropped, as
    they are not part of a domain name. The host name is lower-cased by
    ``urllib.parse``. An empty string is returned when the URL has no
    network location.
    """
    parsed = urllib.parse.urlparse(url)
    # `netloc` would keep "user:pass@" and ":port"; `hostname` is the bare
    # host, or None when absent, which is normalised to '' to keep a str.
    return parsed.hostname or ''
|
||||||
|
|
||||||
|
|
||||||
|
def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load an URL into an headless browser and return all the domains
    it tried to access.

    This is a generator: a headless Firefox instance is started, the page
    is loaded, and the domain of every captured request that received a
    response is yielded. The browser is always shut down afterwards, even
    if loading the page fails or the caller stops iterating early.
    """
    options = selenium.webdriver.firefox.options.Options()
    options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(
        executable_path='geckodriver', options=options)

    try:
        driver.get(url)
        for request in driver.requests:
            # Requests that never got a response are ignored.
            if request.response:
                # NOTE(review): `request.path` is expected to hold the full
                # request URL here; confirm against the installed
                # selenium-wire version.
                yield subdomain_from_url(request.path)
    finally:
        # quit() (rather than close()) ends the whole browser session, so
        # one browser is not left behind for every URL processed.
        driver.quit()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Read one URL per line from standard input, ignoring blank lines, and
    # print every domain returned by collect_subdomains() for that URL.
    for raw_line in sys.stdin:
        url = raw_line.strip()
        if url:
            for subdomain in collect_subdomains(url):
                print(subdomain)
|
22
eulaurarien.sh
Executable file
22
eulaurarien.sh
Executable file
|
@ -0,0 +1,22 @@
|
||||||
|
#!/usr/bin/env bash

# Main script for eulaurarien
#
# Pipeline: websites.list -> subdomains.list -> toblock.list -> toblock.hosts.list
# Intermediate files are kept on disk so each stage can be inspected.

# Get all subdomains accessed by each website in the website list
./collect_subdomains.py < websites.list > subdomains.list
sort -u subdomains.list > subdomains.sorted.list

# Filter out the subdomains not pointing to a first-party tracker
./filter_subdomains.py < subdomains.sorted.list > toblock.list
sort -u toblock.list > toblock.sorted.list

# Format the blocklist so it can be used as a hostlist

(
    echo "# First party trackers"
    echo "# List generated on $(date -Isec) by eulaurarien $(git describe --tags --dirty)"
    # IFS= and -r make `read` take each line verbatim (no backslash
    # interpretation, no whitespace trimming).
    while IFS= read -r host
    do
        echo "0.0.0.0 $host"
    done < toblock.sorted.list
) > toblock.hosts.list
|
35
filter_subdomains.py
Executable file
35
filter_subdomains.py
Executable file
|
@ -0,0 +1,35 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
From a list of subdomains, output only
|
||||||
|
the ones resolving to a first-party tracker.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import dns.resolver
|
||||||
|
|
||||||
|
import regexes
|
||||||
|
|
||||||
|
|
||||||
|
def is_subdomain_matching(subdomain: str) -> bool:
    """
    Indicates if the subdomain redirects to a first-party tracker.

    The subdomain is resolved (A record) and the canonical name of the
    answer is tested against every pattern in ``regexes.REGEXES``.
    A subdomain that cannot be resolved (non-existent, no answer, no
    reachable name server, timeout) is reported on stderr and considered
    as not matching, so that one dead domain does not abort the whole run.
    """
    # TODO Look at the whole chain rather than the last one
    try:
        query = dns.resolver.query(subdomain, 'A')
    except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer,
            dns.resolver.NoNameservers, dns.resolver.Timeout) as err:
        print("Could not resolve {}: {}".format(subdomain, err),
              file=sys.stderr)
        return False
    # to_text() gives the absolute name, i.e. with a trailing dot.
    canonical = query.canonical_name.to_text()
    return any(re.match(regex, canonical) for regex in regexes.REGEXES)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Read one subdomain per line from standard input, ignoring blank
    # lines, and echo back only those is_subdomain_matching() accepts.
    for raw_line in sys.stdin:
        candidate = raw_line.strip()
        if candidate and is_subdomain_matching(candidate):
            print(candidate)
|
9
regexes.py
Normal file
9
regexes.py
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
List of regex matching first-party trackers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Patterns are anchored on both ends and expect a trailing dot after the
# top-level domain.
REGEXES = [r'^.+\.eulerian\.net\.$']
|
52
websites.list
Normal file
52
websites.list
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
https://oui.sncf/
|
||||||
|
https://www.voyage-prive.com/
|
||||||
|
https://www.odalys-vacances.com/
|
||||||
|
https://www.homair.com/
|
||||||
|
https://www.melia.com/
|
||||||
|
https://www.locasun.fr/
|
||||||
|
https://www.belambra.fr/
|
||||||
|
http://www.xl.com/
|
||||||
|
https://www.bordeaux.aeroport.fr/
|
||||||
|
https://www.easyvoyage.com/
|
||||||
|
https://www.leon-de-bruxelles.fr/
|
||||||
|
https://www.sarenza.com/
|
||||||
|
https://www.laredoute.fr/
|
||||||
|
https://www.galerieslafayette.com/
|
||||||
|
https://www.celio.com/
|
||||||
|
https://vente-unique.com/
|
||||||
|
https://www.francoisesaget.com/
|
||||||
|
https://www.histoiredor.com/
|
||||||
|
https://www.brandalley.fr/
|
||||||
|
https://www.fleurancenature.fr/
|
||||||
|
https://www.chausport.com/
|
||||||
|
https://www.i-run.fr/
|
||||||
|
https://fr.smallable.com/
|
||||||
|
https://www.habitat.fr/
|
||||||
|
https://www.bhv.fr/
|
||||||
|
https://www.sfr.fr/
|
||||||
|
https://www.red-by-sfr.fr/
|
||||||
|
https://www.masmovil.es/
|
||||||
|
https://www.yoigo.com/
|
||||||
|
http://www.fnacdarty.com/
|
||||||
|
https://www.fnac.com/
|
||||||
|
https://www.darty.com/
|
||||||
|
http://www.e-leclerc.com/
|
||||||
|
https://www.monoprix.fr/
|
||||||
|
https://www.officedepot.fr/
|
||||||
|
https://www.carrefour-banque.fr/
|
||||||
|
https://www.banque-casino.fr/
|
||||||
|
https://mondial-assistance.fr/
|
||||||
|
https://allianz-voyage.fr/
|
||||||
|
https://www.bankia.com/
|
||||||
|
https://www.april-moto.com/
|
||||||
|
https://www.younited-credit.com/
|
||||||
|
https://www.fortuneo.fr/
|
||||||
|
https://www.orpi.com/
|
||||||
|
https://www.warnerbros.fr/
|
||||||
|
https://www.canalplus.com/
|
||||||
|
https://www.skiset.com/
|
||||||
|
https://www.promofarma.com/
|
||||||
|
https://www.toner.fr/
|
||||||
|
https://www.rentacar.fr/
|
||||||
|
https://vivatechnology.com/
|
||||||
|
https://www.liberation.fr/
|
Loading…
Reference in a new issue