commit
80b23e2d5c
7 changed files with 222 additions and 0 deletions
-
3.gitignore
-
54README.md
-
47collect_subdomains.py
-
22eulaurarien.sh
-
35filter_subdomains.py
-
9regexes.py
-
52websites.list
@ -0,0 +1,3 @@ |
|||
*.list |
|||
!websites.list |
|||
*.log |
@ -0,0 +1,54 @@ |
|||
# eulaurarien |
|||
|
|||
Generates a host list of first-party trackers for ad-blocking. |
|||
|
|||
**DISCLAIMER:** I'm by no way an expert on this subject so my vocabulary or other stuff might be wrong. Use at your own risk. |
|||
|
|||
## What's a first-party tracker? |
|||
|
|||
Traditionally, websites load tracker scripts directly. |
|||
For example, `website1.com` and `website2.com` both load `https://trackercompany.com/trackerscript.js` to track their users. |
|||
In order to block those, one can simply block the host `trackercompany.com`. |
|||
|
|||
However, to circumvent this easy block, tracker companies made the websites using them load trackers from `somethingirelevant.website1.com`. |
|||
The latter being a DNS redirection to `website1.trackercompany.com`, directly pointing to a server serving the tracking script. |
|||
Those are the first-party trackers. |
|||
|
|||
Blocking `trackercompany.com` doesn't work any more, and blocking `*.trackercompany.com` isn't really possible since: |
|||
|
|||
1. Most ad-blockers don't support wildcards |
|||
2. It's a DNS redirection, meaning that most ad-blockers will only see `somethingirelevant.website1.com` |
|||
|
|||
So the only solution is to block every `somethingirelevant.website1.com`-like subdomains known, which is a lot. |
|||
That's where this script comes in, to generate a list of such subdomains. |
|||
|
|||
## How does this script work |
|||
|
|||
It takes as input a list of websites with trackers included. |
|||
So far, this list is manually-generated from the list of clients of such first-party trackers |
|||
(later we should use a general list of websites to be more exhaustive). |
|||
|
|||
It opens each one of those websites (just the homepage) in a web browser, and records the domains of the network requests the page makes. |
|||
It then finds the DNS redirections of those domains, and compares them with regexes of known tracking domains. |
|||
It finally outputs the matching ones. |
|||
|
|||
## Requirements |
|||
|
|||
These are only needed to build the list yourself; an already-built list can be found in the releases. |
|||
|
|||
- Bash |
|||
- Python 3.4+ |
|||
- Firefox |
|||
- Selenium |
|||
- seleniumwire |
|||
- dnspython |
|||
|
|||
## Contributing |
|||
|
|||
### Adding websites |
|||
|
|||
Just add them to `websites.list`. |
|||
|
|||
### Adding first-party trackers regex |
|||
|
|||
Just add them to `regexes.py`. |
@ -0,0 +1,47 @@ |
|||
#!/usr/bin/env python3 |
|||
|
|||
""" |
|||
From a list of URLs, output the subdomains |
|||
accessed by the websites. |
|||
""" |
|||
|
|||
import sys |
|||
import typing |
|||
import urllib.parse |
|||
|
|||
import selenium.webdriver.firefox.options |
|||
import seleniumwire.webdriver |
|||
|
|||
|
|||
def subdomain_from_url(url: str) -> str:
    """
    Return the network-location (domain) component of an URL.
    """
    return urllib.parse.urlparse(url).netloc
|||
|
|||
|
|||
def collect_subdomains(url: str) -> typing.Iterable[str]:
    """
    Load an URL into a headless browser and yield all the domains
    it tried to access.

    Fix: the browser is now closed in a ``finally`` clause, so it no
    longer leaks when ``driver.get`` raises or when the caller abandons
    the generator before exhausting it.
    """
    options = selenium.webdriver.firefox.options.Options()
    options.add_argument('-headless')
    driver = seleniumwire.webdriver.Firefox(
        executable_path='geckodriver', options=options)
    try:
        driver.get(url)
        for request in driver.requests:
            # Only keep requests that actually received a response.
            if request.response:
                # NOTE(review): older seleniumwire versions expose the full
                # URL as `request.path` — confirm against the pinned version.
                yield subdomain_from_url(request.path)
    finally:
        driver.close()
|||
|
|||
|
|||
if __name__ == '__main__':
    # Read one URL per line from stdin, skipping blank lines, and print
    # every subdomain contacted while loading that URL.
    for raw_line in sys.stdin:
        url = raw_line.strip()
        if url:
            for subdomain in collect_subdomains(url):
                print(subdomain)
@ -0,0 +1,22 @@ |
|||
#!/usr/bin/env bash

# Main script for eulaurarien
#
# Pipeline:
#   1. collect every subdomain contacted by each website in websites.list
#   2. keep only the ones resolving to a known first-party tracker
#   3. format the result as a 0.0.0.0 hosts blocklist
#
# Fixes: abort on error (set -eu), drop useless `cat`s, `read -r` so
# backslashes in hostnames are taken literally, and correct the
# "eulaurarian" typo in the generated header.

set -eu

# Get all subdomains accessed by each website in the website list
./collect_subdomains.py < websites.list > subdomains.list
sort -u subdomains.list > subdomains.sorted.list

# Filter out the subdomains not pointing to a first-party tracker
./filter_subdomains.py < subdomains.sorted.list > toblock.list
sort -u toblock.list > toblock.sorted.list

# Format the blocklist so it can be used as a hosts list
(
    echo "# First party trackers"
    echo "# List generated on $(date -Isec) by eulaurarien $(git describe --tags --dirty)"
    while read -r host
    do
        echo "0.0.0.0 $host"
    done < toblock.sorted.list
) > toblock.hosts.list
@ -0,0 +1,35 @@ |
|||
#!/usr/bin/env python3 |
|||
|
|||
""" |
|||
From a list of subdomains, output only |
|||
the ones resolving to a first-party tracker. |
|||
""" |
|||
|
|||
import re |
|||
import sys |
|||
|
|||
import dns.resolver |
|||
|
|||
import regexes |
|||
|
|||
|
|||
def is_subdomain_matching(subdomain: str) -> bool:
    """
    Indicate if the subdomain redirects to a first-party tracker.

    Fix: a subdomain that does not resolve (NXDOMAIN, no answer,
    timeout...) previously raised out of ``dns.resolver.query`` and
    aborted the whole run; it is now treated as a non-match.
    """
    # TODO Look at the whole chain rather than the last one
    try:
        query = dns.resolver.query(subdomain, 'A')
    except dns.exception.DNSException:
        # A dead or unresolvable subdomain cannot point to a tracker.
        return False
    canonical = query.canonical_name.to_text()
    return any(re.match(regex, canonical) for regex in regexes.REGEXES)
|||
|
|||
|
|||
if __name__ == '__main__':
    # Read one subdomain per line from stdin and echo back only the ones
    # resolving to a known first-party tracker.
    for raw_line in sys.stdin:
        candidate = raw_line.strip()
        if candidate and is_subdomain_matching(candidate):
            print(candidate)
@ -0,0 +1,9 @@ |
|||
#!/usr/bin/env python3 |
|||
|
|||
""" |
|||
List of regex matching first-party trackers. |
|||
""" |
|||
|
|||
# Patterns are matched (with re.match) against the canonical DNS name of a
# candidate subdomain, which is a fully-qualified name ending with a dot
# (e.g. `foo.eulerian.net.`) — hence the trailing `\.$` in each regex.
REGEXES = [
    r'^.+\.eulerian\.net\.$'
]
@ -0,0 +1,52 @@ |
|||
https://oui.sncf/ |
|||
https://www.voyage-prive.com/ |
|||
https://www.odalys-vacances.com/ |
|||
https://www.homair.com/ |
|||
https://www.melia.com/ |
|||
https://www.locasun.fr/ |
|||
https://www.belambra.fr/ |
|||
http://www.xl.com/ |
|||
https://www.bordeaux.aeroport.fr/ |
|||
https://www.easyvoyage.com/ |
|||
https://www.leon-de-bruxelles.fr/ |
|||
https://www.sarenza.com/ |
|||
https://www.laredoute.fr/ |
|||
https://www.galerieslafayette.com/ |
|||
https://www.celio.com/ |
|||
https://vente-unique.com/ |
|||
https://www.francoisesaget.com/ |
|||
https://www.histoiredor.com/ |
|||
https://www.brandalley.fr/ |
|||
https://www.fleurancenature.fr/ |
|||
https://www.chausport.com/ |
|||
https://www.i-run.fr/ |
|||
https://fr.smallable.com/ |
|||
https://www.habitat.fr/ |
|||
https://www.bhv.fr/ |
|||
https://www.sfr.fr/ |
|||
https://www.red-by-sfr.fr/ |
|||
https://www.masmovil.es/ |
|||
https://www.yoigo.com/ |
|||
http://www.fnacdarty.com/ |
|||
https://www.fnac.com/ |
|||
https://www.darty.com/ |
|||
http://www.e-leclerc.com/ |
|||
https://www.monoprix.fr/ |
|||
https://www.officedepot.fr/ |
|||
https://www.carrefour-banque.fr/ |
|||
https://www.banque-casino.fr/ |
|||
https://mondial-assistance.fr/ |
|||
https://allianz-voyage.fr/ |
|||
https://www.bankia.com/ |
|||
https://www.april-moto.com/ |
|||
https://www.younited-credit.com/ |
|||
https://www.fortuneo.fr/ |
|||
https://www.orpi.com/ |
|||
https://www.warnerbros.fr/ |
|||
https://www.canalplus.com/ |
|||
https://www.skiset.com/ |
|||
https://www.promofarma.com/ |
|||
https://www.toner.fr/ |
|||
https://www.rentacar.fr/ |
|||
https://vivatechnology.com/ |
|||
https://www.liberation.fr/ |
Write
Preview
Loading…
Cancel
Save
Reference in new issue