diff --git a/.gitignore b/.gitignore index c72635d..e6abf3c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,2 @@ *.log *.p -nameservers -nameservers.head diff --git a/database.py b/database.py index cddc326..0a62ad1 100644 --- a/database.py +++ b/database.py @@ -216,7 +216,7 @@ class Database(Profiler): splits = path.split('.') if not TLD_LIST: Database.populate_tld_list() - if splits[0] not in TLD_LIST: + if splits[-1] not in TLD_LIST: return False for split in splits: if not 1 <= len(split) <= 63: @@ -460,62 +460,56 @@ class Database(Profiler): string += f' ← {self.explain(match.source)}' return string - def export(self, - first_party_only: bool = False, - end_chain_only: bool = False, - no_dupplicates: bool = False, - explain: bool = False, - ) -> typing.Iterable[str]: + def list_records(self, + first_party_only: bool = False, + end_chain_only: bool = False, + no_dupplicates: bool = False, + rules_only: bool = False, + hostnames_only: bool = False, + explain: bool = False, + ) -> typing.Iterable[str]: def export_cb(path: Path, match: Match ) -> typing.Iterable[str]: - assert isinstance(path, DomainPath) - if not isinstance(path, HostnamePath): - return if first_party_only and not match.first_party: return if end_chain_only and match.references > 0: return if no_dupplicates and match.dupplicate: return + if rules_only and match.level > 1: + return + if hostnames_only and not isinstance(path, HostnamePath): + return + if explain: yield self.explain(path) else: - yield self.unpack_domain(path) + yield str(path) - yield from self.exec_each_domain(export_cb) - - def list_rules(self, - first_party_only: bool = False, - ) -> typing.Iterable[str]: - - def list_rules_cb(path: Path, match: Match - ) -> typing.Iterable[str]: - if first_party_only and not match.first_party: - return - if isinstance(path, ZonePath) \ - or (isinstance(path, Ip4Path) and path.prefixlen < 32): - # if match.level == 1: - # It should be the latter condition but it is more - # useful when using the former - yield self.explain(path) - - yield from self.exec_each(list_rules_cb) + yield from self.exec_each(export_cb) def count_records(self, first_party_only: bool = False, - rules_only: bool = False, + end_chain_only: bool = False, no_dupplicates: bool = False, + rules_only: bool = False, + hostnames_only: bool = False, ) -> str: memo: typing.Dict[str, int] = dict() def count_records_cb(path: Path, match: Match) -> None: if first_party_only and not match.first_party: return - if rules_only and match.level > 1: + if end_chain_only and match.references > 0: return if no_dupplicates and match.dupplicate: return + if rules_only and match.level > 1: + return + if hostnames_only and not isinstance(path, HostnamePath): + return + try: memo[path.__class__.__name__] += 1 except KeyError: @@ -523,9 +517,10 @@ class Database(Profiler): for _ in self.exec_each(count_records_cb): pass + split: typing.List[str] = list() for key, value in sorted(memo.items(), key=lambda s: s[0]): - split.append(f'{key[:-4]}: {value}') + split.append(f'{key[:-4].lower()}s: {value}') return ', '.join(split) def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]: diff --git a/export.py b/export.py index 8befd77..8dcf2c5 100755 --- a/export.py +++ b/export.py @@ -19,15 +19,18 @@ if __name__ == '__main__': parser.add_argument( '-e', '--end-chain', action='store_true', help="TODO") - parser.add_argument( - '-x', '--explain', action='store_true', - help="TODO") parser.add_argument( '-r', '--rules', action='store_true', help="TODO") + parser.add_argument( + '-b', '--base-rules', action='store_true', + help="TODO implies rules") parser.add_argument( '-d', '--no-dupplicates', action='store_true', help="TODO") + parser.add_argument( + '-x', '--explain', action='store_true', + help="TODO") parser.add_argument( '-c', '--count', action='store_true', help="TODO") @@ -36,19 +39,21 @@ if __name__ == '__main__': DB = database.Database() if args.count: + assert not args.explain print(DB.count_records( - first_party_only=args.first_party, - rules_only=args.rules, - no_dupplicates=args.no_dupplicates, - )) - else: - if args.rules: - for line in DB.list_rules(): - print(line) - for domain in DB.export( first_party_only=args.first_party, end_chain_only=args.end_chain, no_dupplicates=args.no_dupplicates, + rules_only=args.base_rules, + hostnames_only=not (args.rules or args.base_rules), + )) + else: + for domain in DB.list_records( + first_party_only=args.first_party, + end_chain_only=args.end_chain, + no_dupplicates=args.no_dupplicates, + rules_only=args.base_rules, + hostnames_only=not (args.rules or args.base_rules), explain=args.explain, ): print(domain, file=args.output) diff --git a/export_lists.sh b/export_lists.sh index 1070865..b9853ed 100755 --- a/export_lists.sh +++ b/export_lists.sh @@ -4,69 +4,94 @@ function log() { echo -e "\033[33m$@\033[0m" } -log "Exporting lists…" -./export.py --first-party --output dist/firstparty-trackers.txt -./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt -./export.py --output dist/multiparty-trackers.txt -./export.py --end-chain --no-dupplicates --output dist/multiparty-only-trackers.txt +log "Calculating statistics…" +gen_date=$(date -Isec) +gen_software=$(git describe --tags) +number_websites=$(wc -l < temp/all_websites.list) +number_subdomains=$(wc -l < temp/all_subdomains.list) +number_dns=$(grep '^$' temp/all_resolved.txt | wc -l) -log "Generating statistics…" -./export.py --count --first-party > temp/count_recs_firstparty.txt -./export.py --count > temp/count_recs_multiparty.txt -./export.py --rules --count --first-party > temp/count_rules_firstparty.txt -./export.py --rules --count > temp/count_rules_multiparty.txt +for partyness in {first,multi} +do + if [ $partyness = "first" ] + then + partyness_flags="--first-party" + else + partyness_flags="" + fi -log "Sorting lists…" -sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt -sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt -sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt -sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt + echo "Statistics for ${partyness}-party trackers" + echo "Input rules: $(./export.py --count --base-rules $partyness_flags)" + echo "Subsequent rules: $(./export.py --count --rules $partyness_flags)" + echo "Subsequent rules (no dupplicate): $(./export.py --count --rules --no-dupplicates $partyness_flags)" + echo "Output hostnames: $(./export.py --count $partyness_flags)" + echo "Output hostnames (no dupplicate): $(./export.py --count --no-dupplicates $partyness_flags)" + echo "Output hostnames (end-chain only): $(./export.py --count --end-chain $partyness_flags)" + echo "Output hostnames (no dupplicate, end-chain only): $(./export.py --count --no-dupplicates --end-chain $partyness_flags)" + echo -log "Generating hosts lists…" -function generate_hosts { - basename="$1" - description="$2" - description2="$3" + for trackerness in {trackers,only-trackers} + do + if [ $trackerness = "trackers" ] + then + trackerness_flags="" + else + trackerness_flags="--end-chain --no-dupplicates" + fi + file_list="dist/${partyness}party-${trackerness}.txt" + file_host="dist/${partyness}party-${trackerness}-hosts.txt" - ( - echo "# First-party trackers host list" - echo "# $description" - echo "# $description2" - echo "#" - echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker" - echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien" - echo "#" - echo "# In case of false positives/negatives, or any other question," - echo "# contact me the way you like: https://geoffrey.frogeye.fr" - echo "#" - echo "# Latest version:" - echo "# - First-party trackers : https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt" - echo "# - … excluding redirected: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt" - echo "# - First and third party : https://hostfiles.frogeye.fr/multiparty-trackers-hosts.txt" - echo "# - … excluding redirected: https://hostfiles.frogeye.fr/multiparty-only-trackers-hosts.txt" - echo '# (you can remove `-hosts` to get the raw list)' - echo "#" - echo "# Generation date: $(date -Isec)" - echo "# Generation software: eulaurarien $(git describe --tags)" - echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)" - echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" - echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO - echo "#" - echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)" - echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)" - echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" - echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)" - echo "#" - echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)" - echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)" - echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" - echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" - echo - sed 's|^|0.0.0.0 |' "dist/$basename.txt" - ) > "dist/$basename-hosts.txt" -} + log "Generating lists for variant ${partyness}-party ${trackerness}…" -generate_hosts "firstparty-trackers" "Generated from a curated list of first-party trackers" "" -generate_hosts "firstparty-only-trackers" "Generated from a curated list of first-party trackers" "Only contain the first chain of redirection." -generate_hosts "multiparty-trackers" "Generated from known third-party trackers." "Also contains trackers used as third-party." -generate_hosts "multiparty-only-trackers" "Generated from known third-party trackers." "Do not contain trackers used in third-party. Use in combination with third-party lists." + # Real export heeere + ./export.py $partyness_flags $trackerness_flags > $file_list + # Sometimes a bit heavy to have the DB open and sort the output + # so this is done in two steps + sort -u $file_list -o $file_list + + rules_input=$(./export.py --count --base-rules $partyness_flags) + rules_found=$(./export.py --count --rules $partyness_flags) + rules_output=$(./export.py --count $partyness_flags $trackerness_flags) + + function link() { # link partyness, link trackerness + url="https://hostfiles.frogeye.fr/${partyness}party-${trackerness}-hosts.txt" + if [ "$1" = "$partyness" ] && [ "$2" = "$trackerness" ] + then + url="$url (this one)" + fi + echo $url + } + + ( + echo "# First-party trackers host list" + echo "# Variant: ${partyness}-party ${trackerness}" + echo "#" + echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker" + echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien" + echo "#" + echo "# In case of false positives/negatives, or any other question," + echo "# contact me the way you like: https://geoffrey.frogeye.fr" + echo "#" + echo "# Latest versions:" + echo "# - First-party trackers : $(link first trackers)" + echo "# - … excluding redirected: $(link first only-trackers)" + echo "# - First and third party : $(link multi trackers)" + echo "# - … excluding redirected: $(link multi only-trackers)" + echo '# (you can remove `-hosts` to get the raw list)' + echo "#" + echo "# Generation date: $gen_date" + echo "# Generation software: eulaurarien $gen_software" + echo "# Number of source websites: $number_websites" + echo "# Number of source subdomains: $number_subdomains" + echo "# Number of source DNS records: ~2E9 + $number_dns" + echo "#" + echo "# Input rules: $rules_input" + echo "# Subsequent rules: $rules_found" + echo "# Output rules: $rules_output" + echo "#" + echo + sed 's|^|0.0.0.0 |' "$file_list" + ) > "$file_host" + + done +done diff --git a/feed_dns.py b/feed_dns.py index 0d9dd96..f923831 100755 --- a/feed_dns.py +++ b/feed_dns.py @@ -130,8 +130,8 @@ class Rapid7Parser(Parser): self.register(record) -class DnsMassParser(Parser): - # dnsmass --output Snrql +class MassDnsParser(Parser): + # massdns --output Snrql # --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4 TYPES = { 'A': (FUNCTION_MAP['a'][0], FUNCTION_MAP['a'][1], -1, None), @@ -140,7 +140,7 @@ class DnsMassParser(Parser): } def consume(self) -> None: - self.prof.enter_step('parse_dnsmass') + self.prof.enter_step('parse_massdns') timestamp = 0 header = True for line in self.buf: @@ -156,7 +156,7 @@ class DnsMassParser(Parser): header = False else: select, write, name_offset, value_offset = \ - DnsMassParser.TYPES[split[1]] + MassDnsParser.TYPES[split[1]] record = ( select, write, @@ -165,14 +165,14 @@ class DnsMassParser(Parser): split[2][:value_offset], ) self.register(record) - self.prof.enter_step('parse_dnsmass') + self.prof.enter_step('parse_massdns') except KeyError: continue PARSERS = { 'rapid7': Rapid7Parser, - 'dnsmass': DnsMassParser, + 'massdns': MassDnsParser, } if __name__ == '__main__': diff --git a/fetch_resources.sh b/fetch_resources.sh index f4c95b0..d659fbc 100755 --- a/fetch_resources.sh +++ b/fetch_resources.sh @@ -35,12 +35,7 @@ dl http://data.iana.org/TLD/tlds-alpha-by-domain.txt temp/all_tld.temp.list grep -v '^#' temp/all_tld.temp.list | awk '{print tolower($0)}' > temp/all_tld.list log "Retrieving nameservers…" -rm -f nameservers -touch nameservers -[ -f nameservers.head ] && cat nameservers.head >> nameservers -dl https://public-dns.info/nameservers.txt nameservers.temp -sort -R nameservers.temp >> nameservers -rm nameservers.temp +dl https://public-dns.info/nameservers.txt nameservers/public-dns.list log "Retrieving top subdomains…" dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip diff --git a/filter_subdomains.py b/filter_subdomains.py deleted file mode 100755 index 601a031..0000000 --- a/filter_subdomains.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python3 -# pylint: disable=C0103 - -""" -From a list of subdomains, output only -the ones resolving to a first-party tracker. -""" - -import argparse -import sys -import progressbar -import csv -import typing -import ipaddress - -# DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']] -DomainRule = typing.Union[bool, typing.Dict] -# IpRule = typing.Union[bool, typing.Dict[int, 'DomainRule']] -IpRule = typing.Union[bool, typing.Dict] - -RULES_DICT: DomainRule = dict() -RULES_IP_DICT: IpRule = dict() - - -def get_bits(address: ipaddress.IPv4Address) -> typing.Iterator[int]: - for char in address.packed: - for i in range(7, -1, -1): - yield (char >> i) & 0b1 - - -def subdomain_matching(subdomain: str) -> bool: - parts = subdomain.split('.') - parts.reverse() - dic = RULES_DICT - for part in parts: - if isinstance(dic, bool) or part not in dic: - break - dic = dic[part] - if isinstance(dic, bool): - return dic - return False - - -def ip_matching(ip_str: str) -> bool: - ip = ipaddress.ip_address(ip_str) - dic = RULES_IP_DICT - i = 0 - for bit in get_bits(ip): - i += 1 - if isinstance(dic, bool) or bit not in dic: - break - dic = dic[bit] - if isinstance(dic, bool): - return dic - return False - - -def get_matching(chain: typing.List[str], no_explicit: bool = False - ) -> typing.Iterable[str]: - if len(chain) <= 1: - return - initial = chain[0] - cname_destinations = chain[1:-1] - a_destination = chain[-1] - initial_matching = subdomain_matching(initial) - if no_explicit and initial_matching: - return - cname_matching = any(map(subdomain_matching, cname_destinations)) - if cname_matching or initial_matching or ip_matching(a_destination): - yield initial - - -def register_rule(subdomain: str) -> None: - # Make a tree with domain parts - parts = subdomain.split('.') - parts.reverse() - dic = RULES_DICT - last_part = len(parts) - 1 - for p, part in enumerate(parts): - if isinstance(dic, bool): - return - if p == last_part: - dic[part] = True - else: - dic.setdefault(part, dict()) - dic = dic[part] - - -def register_rule_ip(network: str) -> None: - net = ipaddress.ip_network(network) - ip = net.network_address - dic = RULES_IP_DICT - last_bit = net.prefixlen - 1 - for b, bit in enumerate(get_bits(ip)): - if isinstance(dic, bool): - return - if b == last_bit: - dic[bit] = True - else: - dic.setdefault(bit, dict()) - dic = dic[bit] - - -if __name__ == '__main__': - - # Parsing arguments - parser = argparse.ArgumentParser( - description="Filter first-party trackers from a list of subdomains") - parser.add_argument( - '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, - help="Input file with DNS chains") - parser.add_argument( - '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, - help="Outptut file with one tracking subdomain per line") - parser.add_argument( - '-n', '--no-explicit', action='store_true', - help="Don't output domains already blocked with rules without CNAME") - parser.add_argument( - '-r', '--rules', type=argparse.FileType('r'), - help="List of domains domains to block (with their subdomains)") - parser.add_argument( - '-p', '--rules-ip', type=argparse.FileType('r'), - help="List of IPs ranges to block") - args = parser.parse_args() - - # Progress bar - widgets = [ - progressbar.Percentage(), - ' ', progressbar.SimpleProgress(), - ' ', progressbar.Bar(), - ' ', progressbar.Timer(), - ' ', progressbar.AdaptiveTransferSpeed(unit='req'), - ' ', progressbar.AdaptiveETA(), - ] - progress = progressbar.ProgressBar(widgets=widgets) - - # Reading rules - if args.rules: - for rule in args.rules: - register_rule(rule.strip()) - if args.rules_ip: - for rule in args.rules_ip: - register_rule_ip(rule.strip()) - - # Approximating line count - if args.input.seekable(): - lines = 0 - for line in args.input: - lines += 1 - progress.max_value = lines - args.input.seek(0) - - # Reading domains to filter - reader = csv.reader(args.input) - progress.start() - for chain in reader: - for match in get_matching(chain, no_explicit=args.no_explicit): - print(match, file=args.output) - progress.update(progress.value + 1) - progress.finish() diff --git a/import_rules.sh b/import_rules.sh index 14c8c78..cbcfbd8 100755 --- a/import_rules.sh +++ b/import_rules.sh @@ -18,5 +18,5 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as ./feed_asn.py -log "Pruning old rules…" -./db.py --prune --prune-before "$BEFORE" --prune-base +# log "Pruning old rules…" +# ./db.py --prune --prune-before "$BEFORE" --prune-base diff --git a/nameservers/.gitignore b/nameservers/.gitignore new file mode 100644 index 0000000..dbd03bc --- /dev/null +++ b/nameservers/.gitignore @@ -0,0 +1,2 @@ +*.custom.list +*.cache.list diff --git a/nameservers/popular.list b/nameservers/popular.list new file mode 100644 index 0000000..c35d391 --- /dev/null +++ b/nameservers/popular.list @@ -0,0 +1,24 @@ +8.8.8.8 +8.8.4.4 +2001:4860:4860:0:0:0:0:8888 +2001:4860:4860:0:0:0:0:8844 +208.67.222.222 +208.67.220.220 +2620:119:35::35 +2620:119:53::53 +4.2.2.1 +4.2.2.2 +8.26.56.26 +8.20.247.20 +84.200.69.80 +84.200.70.40 +2001:1608:10:25:0:0:1c04:b12f +2001:1608:10:25:0:0:9249:d69b +9.9.9.10 +149.112.112.10 +2620:fe::10 +2620:fe::fe:10 +1.1.1.1 +1.0.0.1 +2606:4700:4700::1111 +2606:4700:4700::1001 diff --git a/resolve_subdomains.sh b/resolve_subdomains.sh index e37ddeb..7a91337 100755 --- a/resolve_subdomains.sh +++ b/resolve_subdomains.sh @@ -4,9 +4,16 @@ function log() { echo -e "\033[33m$@\033[0m" } -log "Compiling locally known subdomain…" -# Sort by last character to utilize the DNS server caching mechanism -pv subdomains/*.list | sed 's/\r$//' | rev | sort -u | rev > temp/all_subdomains.list -log "Resolving locally known subdomain…" -pv temp/all_subdomains.list | ./resolve_subdomains.py --output temp/all_resolved.csv +log "Compiling nameservers…" +pv nameservers/*.list | ./validate_list.py --ip4 | sort -u > temp/all_nameservers_ip4.list +log "Compiling subdomain…" +# Sort by last character to utilize the DNS server caching mechanism +# (not as efficient with massdns but it's almost free so why not) +pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list + +log "Resolving subdomain…" +massdns --output Snrql --retry REFUSED,SERVFAIL --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list + +log "Importing into database…" +pv temp/all_resolved.txt | ./feed_dns.py massdns diff --git a/validate_list.py b/validate_list.py new file mode 100755 index 0000000..62301c2 --- /dev/null +++ b/validate_list.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# pylint: disable=C0103 + +""" +Filter out invalid domain names +""" + +import database +import argparse +import sys + +if __name__ == '__main__': + + # Parsing arguments + parser = argparse.ArgumentParser( + description="Filter out invalid domain names.") + parser.add_argument( + '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, + help="TODO") + parser.add_argument( + '-o', '--output', type=argparse.FileType('w'), default=sys.stdout, + help="TODO") + parser.add_argument( + '-d', '--domain', action='store_true', + help="Can be domain") + parser.add_argument( + '-4', '--ip4', action='store_true', + help="Can be IP4") + args = parser.parse_args() + + for line in args.input: + line = line.strip() + if (args.domain and database.Database.validate_domain(line)) or \ + (args.ip4 and database.Database.validate_ip4address(line)): + print(line, file=args.output)