Initial commit
commit 80b23e2d5c
3  .gitignore  vendored  Normal file
@@ -0,0 +1,3 @@
*.list
!websites.list
*.log
54  README.md  Normal file
@@ -0,0 +1,54 @@
# eulaurarien

Generates a host list of first-party trackers for ad-blocking.

**DISCLAIMER:** I'm by no means an expert on this subject, so my vocabulary or other details might be wrong. Use at your own risk.

## What's a first-party tracker?

Traditionally, websites load tracker scripts directly.
For example, `website1.com` and `website2.com` both load `https://trackercompany.com/trackerscript.js` to track their users.
To block those, one can simply block the host `trackercompany.com`.

However, to circumvent this easy block, tracker companies now have the websites that use them load the tracker from `somethingirelevant.website1.com`.
The latter is a DNS redirection to `website1.trackercompany.com`, pointing directly to a server that serves the tracking script.
Those are first-party trackers.

Blocking `trackercompany.com` doesn't work any more, and blocking `*.trackercompany.com` isn't really possible, since:

1. Most ad-blockers don't support wildcards
2. It's a DNS redirection, meaning that most ad-blockers will only ever see `somethingirelevant.website1.com`

So the only solution is to block every known `somethingirelevant.website1.com`-like subdomain, and there are a lot of them.
That's where this script comes in: it generates a list of such subdomains.
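
To make the redirection concrete, here is a minimal dnspython sketch (the same resolver library this project uses); the domain names are made up for illustration, so running it verbatim would not resolve anything real:

import dns.resolver

# Hypothetical first-party subdomain found on a website's homepage.
subdomain = 'somethingirelevant.website1.com'
answer = dns.resolver.query(subdomain, 'A')
# canonical_name follows the CNAME chain, e.g. 'website1.trackercompany.com.'
print(answer.canonical_name.to_text())
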
## How does this script work

It takes as input a list of websites that include trackers.
So far, this list is manually generated from the list of clients of such first-party trackers
(later we should use a general list of websites to be more exhaustive).

It opens each of those websites (just the homepage) in a web browser and records the domains of the network requests the page makes.
It then finds the DNS redirections of those domains and compares them with regexes of known tracking domains.
It finally outputs the matching ones.
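
For a single website, the pipeline described above boils down to chaining the two helper scripts of this repository (collect_subdomains.py and filter_subdomains.py, shown further down). This is only a sketch, assuming both modules are importable from the repository root and that Firefox and geckodriver are installed:

import collect_subdomains
import filter_subdomains

# Crawl one homepage from websites.list, then keep only the subdomains
# whose DNS canonical name matches a known first-party tracker.
for subdomain in collect_subdomains.collect_subdomains('https://oui.sncf/'):
    if filter_subdomains.is_subdomain_matching(subdomain):
        print(subdomain)
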
## Requirements

These are only needed to build the list yourself; an already-built list can be found in the releases.

- Bash
- Python 3.4+
- Firefox
- geckodriver
- Selenium
- seleniumwire
- dnspython

## Contributing

### Adding websites

Just add them to `websites.list`.

### Adding first-party tracker regexes

Just add them to `regexes.py`.
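
A new entry is just another raw-string pattern appended to REGEXES in regexes.py. For example (the second pattern below is hypothetical, shown only to illustrate the expected format, including the trailing dot that dnspython puts on canonical names):

REGEXES = [
    r'^.+\.eulerian\.net\.$',
    r'^.+\.tracker-example\.com\.$',  # hypothetical entry, not a real tracker
]
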
47  collect_subdomains.py  Executable file
@@ -0,0 +1,47 @@
#!/usr/bin/env python3

"""
From a list of URLs, output the subdomains
accessed by the websites.
"""

import sys
import typing
import urllib.parse

import selenium.webdriver.firefox.options
import seleniumwire.webdriver


def subdomain_from_url(url: str) -> str:
    """
    Extract the domain part from a URL.
    """
    parsed = urllib.parse.urlparse(url)
    return parsed.netloc


def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load a URL into a headless browser and return all the domains
    it tried to access.
    """
    options = selenium.webdriver.firefox.options.Options()
    options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(
        executable_path='geckodriver', options=options)

    driver.get(url)
    for request in driver.requests:
        if request.response:
            yield subdomain_from_url(request.path)
    driver.close()


if __name__ == '__main__':
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        for subdomain in collect_subdomains(line):
            print(subdomain)
22  eulaurarien.sh  Executable file
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

# Main script for eulaurarien

# Get all subdomains accessed by each website in the website list
cat websites.list | ./collect_subdomains.py > subdomains.list
sort -u subdomains.list > subdomains.sorted.list

# Filter out the subdomains not pointing to a first-party tracker
cat subdomains.sorted.list | ./filter_subdomains.py > toblock.list
sort -u toblock.list > toblock.sorted.list

# Format the blocklist so it can be used as a hostlist

(
    echo "# First party trackers"
    echo "# List generated on $(date -Isec) by eulaurarien $(git describe --tags --dirty)"
    cat toblock.sorted.list | while read host;
    do
        echo "0.0.0.0 $host"
    done
) > toblock.hosts.list
35  filter_subdomains.py  Executable file
@@ -0,0 +1,35 @@
#!/usr/bin/env python3

"""
From a list of subdomains, output only
the ones resolving to a first-party tracker.
"""

import re
import sys

import dns.resolver

import regexes


def is_subdomain_matching(subdomain: str) -> bool:
    """
    Indicates if the subdomain redirects to a first-party tracker.
    """
    # TODO Look at the whole chain rather than the last one
    query = dns.resolver.query(subdomain, 'A')
    canonical = query.canonical_name.to_text()
    for regex in regexes.REGEXES:
        if re.match(regex, canonical):
            return True
    return False


if __name__ == '__main__':
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        if is_subdomain_matching(line):
            print(line)
9  regexes.py  Normal file
@@ -0,0 +1,9 @@
#!/usr/bin/env python3

"""
List of regexes matching first-party trackers.
"""

REGEXES = [
    r'^.+\.eulerian\.net\.$'
]
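
Tying filter_subdomains.py and regexes.py together: dns.resolver returns canonical names with a trailing dot, which is why the pattern above ends in `\.$`. A minimal illustration with a made-up canonical name (run from the repository root so regexes is importable):

import re

import regexes

# Hypothetical canonical name as dnspython would return it (note the trailing dot).
canonical = 'website1.eulerian.net.'
print(any(re.match(regex, canonical) for regex in regexes.REGEXES))  # prints: True
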
52  websites.list  Normal file
@@ -0,0 +1,52 @@
https://oui.sncf/
https://www.voyage-prive.com/
https://www.odalys-vacances.com/
https://www.homair.com/
https://www.melia.com/
https://www.locasun.fr/
https://www.belambra.fr/
http://www.xl.com/
https://www.bordeaux.aeroport.fr/
https://www.easyvoyage.com/
https://www.leon-de-bruxelles.fr/
https://www.sarenza.com/
https://www.laredoute.fr/
https://www.galerieslafayette.com/
https://www.celio.com/
https://vente-unique.com/
https://www.francoisesaget.com/
https://www.histoiredor.com/
https://www.brandalley.fr/
https://www.fleurancenature.fr/
https://www.chausport.com/
https://www.i-run.fr/
https://fr.smallable.com/
https://www.habitat.fr/
https://www.bhv.fr/
https://www.sfr.fr/
https://www.red-by-sfr.fr/
https://www.masmovil.es/
https://www.yoigo.com/
http://www.fnacdarty.com/
https://www.fnac.com/
https://www.darty.com/
http://www.e-leclerc.com/
https://www.monoprix.fr/
https://www.officedepot.fr/
https://www.carrefour-banque.fr/
https://www.banque-casino.fr/
https://mondial-assistance.fr/
https://allianz-voyage.fr/
https://www.bankia.com/
https://www.april-moto.com/
https://www.younited-credit.com/
https://www.fortuneo.fr/
https://www.orpi.com/
https://www.warnerbros.fr/
https://www.canalplus.com/
https://www.skiset.com/
https://www.promofarma.com/
https://www.toner.fr/
https://www.rentacar.fr/
https://vivatechnology.com/
https://www.liberation.fr/