Browse Source

Fixed scripting around

tags/v2.1
Geoffrey Frogeye 3 months ago
parent
commit
aca5023c3f
Signed by: geoffrey <geoffrey@frogeye.fr> GPG Key ID: D8A7ECA00A8CD3DD
12 changed files with 208 additions and 282 deletions
  1. +0
    -2
      .gitignore
  2. +27
    -32
      database.py
  3. +14
    -9
      export.py
  4. +86
    -61
      export_lists.sh
  5. +6
    -6
      feed_dns.py
  6. +1
    -6
      fetch_resources.sh
  7. +0
    -160
      filter_subdomains.py
  8. +2
    -2
      import_rules.sh
  9. +2
    -0
      nameservers/.gitignore
  10. +24
    -0
      nameservers/popular.list
  11. +11
    -4
      resolve_subdomains.sh
  12. +35
    -0
      validate_list.py

+ 0
- 2
.gitignore View File

@@ -1,4 +1,2 @@
*.log
*.p
nameservers
nameservers.head

+ 27
- 32
database.py View File

@@ -216,7 +216,7 @@ class Database(Profiler):
splits = path.split('.')
if not TLD_LIST:
Database.populate_tld_list()
if splits[0] not in TLD_LIST:
if splits[-1] not in TLD_LIST:
return False
for split in splits:
if not 1 <= len(split) <= 63:
@@ -460,62 +460,56 @@ class Database(Profiler):
string += f' ← {self.explain(match.source)}'
return string

def export(self,
first_party_only: bool = False,
end_chain_only: bool = False,
no_dupplicates: bool = False,
explain: bool = False,
) -> typing.Iterable[str]:
def list_records(self,
first_party_only: bool = False,
end_chain_only: bool = False,
no_dupplicates: bool = False,
rules_only: bool = False,
hostnames_only: bool = False,
explain: bool = False,
) -> typing.Iterable[str]:

def export_cb(path: Path, match: Match
) -> typing.Iterable[str]:
assert isinstance(path, DomainPath)
if not isinstance(path, HostnamePath):
return
if first_party_only and not match.first_party:
return
if end_chain_only and match.references > 0:
return
if no_dupplicates and match.dupplicate:
return
if rules_only and match.level > 1:
return
if hostnames_only and not isinstance(path, HostnamePath):
return

if explain:
yield self.explain(path)
else:
yield self.unpack_domain(path)

yield from self.exec_each_domain(export_cb)

def list_rules(self,
first_party_only: bool = False,
) -> typing.Iterable[str]:

def list_rules_cb(path: Path, match: Match
) -> typing.Iterable[str]:
if first_party_only and not match.first_party:
return
if isinstance(path, ZonePath) \
or (isinstance(path, Ip4Path) and path.prefixlen < 32):
# if match.level == 1:
# It should be the latter condition but it is more
# useful when using the former
yield self.explain(path)
yield str(path)

yield from self.exec_each(list_rules_cb)
yield from self.exec_each(export_cb)

def count_records(self,
first_party_only: bool = False,
rules_only: bool = False,
end_chain_only: bool = False,
no_dupplicates: bool = False,
rules_only: bool = False,
hostnames_only: bool = False,
) -> str:
memo: typing.Dict[str, int] = dict()

def count_records_cb(path: Path, match: Match) -> None:
if first_party_only and not match.first_party:
return
if rules_only and match.level > 1:
if end_chain_only and match.references > 0:
return
if no_dupplicates and match.dupplicate:
return
if rules_only and match.level > 1:
return
if hostnames_only and not isinstance(path, HostnamePath):
return

try:
memo[path.__class__.__name__] += 1
except KeyError:
@@ -523,9 +517,10 @@ class Database(Profiler):

for _ in self.exec_each(count_records_cb):
pass

split: typing.List[str] = list()
for key, value in sorted(memo.items(), key=lambda s: s[0]):
split.append(f'{key[:-4]}: {value}')
split.append(f'{key[:-4].lower()}s: {value}')
return ', '.join(split)

def get_domain(self, domain_str: str) -> typing.Iterable[DomainPath]:


+ 14
- 9
export.py View File

@@ -19,15 +19,18 @@ if __name__ == '__main__':
parser.add_argument(
'-e', '--end-chain', action='store_true',
help="TODO")
parser.add_argument(
'-x', '--explain', action='store_true',
help="TODO")
parser.add_argument(
'-r', '--rules', action='store_true',
help="TODO")
parser.add_argument(
'-b', '--base-rules', action='store_true',
help="TODO implies rules")
parser.add_argument(
'-d', '--no-dupplicates', action='store_true',
help="TODO")
parser.add_argument(
'-x', '--explain', action='store_true',
help="TODO")
parser.add_argument(
'-c', '--count', action='store_true',
help="TODO")
@@ -36,19 +39,21 @@ if __name__ == '__main__':
DB = database.Database()

if args.count:
assert not args.explain
print(DB.count_records(
first_party_only=args.first_party,
rules_only=args.rules,
end_chain_only=args.end_chain,
no_dupplicates=args.no_dupplicates,
))
rules_only=args.base_rules,
hostnames_only=not (args.rules or args.base_rules),
))
else:
if args.rules:
for line in DB.list_rules():
print(line)
for domain in DB.export(
for domain in DB.list_records(
first_party_only=args.first_party,
end_chain_only=args.end_chain,
no_dupplicates=args.no_dupplicates,
rules_only=args.base_rules,
hostnames_only=not (args.rules or args.base_rules),
explain=args.explain,
):
print(domain, file=args.output)

+ 86
- 61
export_lists.sh View File

@@ -4,69 +4,94 @@ function log() {
echo -e "\033[33m$@\033[0m"
}

log "Exporting lists…"
./export.py --first-party --output dist/firstparty-trackers.txt
./export.py --first-party --end-chain --no-dupplicates --output dist/firstparty-only-trackers.txt
./export.py --output dist/multiparty-trackers.txt
./export.py --end-chain --no-dupplicates --output dist/multiparty-only-trackers.txt
log "Calculating statistics…"
gen_date=$(date -Isec)
gen_software=$(git describe --tags)
number_websites=$(wc -l < temp/all_websites.list)
number_subdomains=$(wc -l < temp/all_subdomains.list)
number_dns=$(grep '^$' temp/all_resolved.txt | wc -l)

log "Generating statistics…"
./export.py --count --first-party > temp/count_recs_firstparty.txt
./export.py --count > temp/count_recs_multiparty.txt
./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
./export.py --rules --count > temp/count_rules_multiparty.txt
for partyness in {first,multi}
do
if [ $partyness = "first" ]
then
partyness_flags="--first-party"
else
partyness_flags=""
fi

log "Sorting lists…"
sort -u dist/firstparty-trackers.txt -o dist/firstparty-trackers.txt
sort -u dist/firstparty-only-trackers.txt -o dist/firstparty-only-trackers.txt
sort -u dist/multiparty-trackers.txt -o dist/multiparty-trackers.txt
sort -u dist/multiparty-only-trackers.txt -o dist/multiparty-only-trackers.txt
echo "Statistics for ${partyness}-party trackers"
echo "Input rules: $(./export.py --count --base-rules $partyness_flags)"
echo "Subsequent rules: $(./export.py --count --rules $partyness_flags)"
echo "Subsequent rules (no dupplicate): $(./export.py --count --rules --no-dupplicates $partyness_flags)"
echo "Output hostnames: $(./export.py --count $partyness_flags)"
echo "Output hostnames (no dupplicate): $(./export.py --count --no-dupplicates $partyness_flags)"
echo "Output hostnames (end-chain only): $(./export.py --count --end-chain $partyness_flags)"
echo "Output hostnames (no dupplicate, end-chain only): $(./export.py --count --no-dupplicates --end-chain $partyness_flags)"
echo

log "Generating hosts lists…"
function generate_hosts {
basename="$1"
description="$2"
description2="$3"
for trackerness in {trackers,only-trackers}
do
if [ $trackerness = "trackers" ]
then
trackerness_flags=""
else
trackerness_flags="--end-chain --no-dupplicates"
fi
file_list="dist/${partyness}party-${trackerness}.txt"
file_host="dist/${partyness}party-${trackerness}-hosts.txt"

(
echo "# First-party trackers host list"
echo "# $description"
echo "# $description2"
echo "#"
echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
echo "#"
echo "# In case of false positives/negatives, or any other question,"
echo "# contact me the way you like: https://geoffrey.frogeye.fr"
echo "#"
echo "# Latest version:"
echo "# - First-party trackers : https://hostfiles.frogeye.fr/firstparty-trackers-hosts.txt"
echo "# - … excluding redirected: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt"
echo "# - First and third party : https://hostfiles.frogeye.fr/multiparty-trackers-hosts.txt"
echo "# - … excluding redirected: https://hostfiles.frogeye.fr/multiparty-only-trackers-hosts.txt"
echo '# (you can remove `-hosts` to get the raw list)'
echo "#"
echo "# Generation date: $(date -Isec)"
echo "# Generation software: eulaurarien $(git describe --tags)"
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of source DNS records: ~2E9 + $(wc -l temp/all_resolved.json | cut -d' ' -f1)" # TODO
echo "#"
echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
echo "# Found first-party trackers: $(cat temp/count_recs_firstparty.txt)"
echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
echo "#"
echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
echo "# Found multi-party trackers: $(cat temp/count_recs_multiparty.txt)"
echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
echo
sed 's|^|0.0.0.0 |' "dist/$basename.txt"
) > "dist/$basename-hosts.txt"
}
log "Generating lists for variant ${partyness}-party ${trackerness}…"

# The actual export happens here
./export.py $partyness_flags $trackerness_flags > $file_list
# Sometimes a bit heavy to have the DB open and sort the output
# so this is done in two steps
sort -u $file_list -o $file_list

rules_input=$(./export.py --count --base-rules $partyness_flags)
rules_found=$(./export.py --count --rules $partyness_flags)
rules_output=$(./export.py --count $partyness_flags $trackerness_flags)

function link() { # link partyness, link trackerness
url="https://hostfiles.frogeye.fr/${partyness}party-${trackerness}-hosts.txt"
if [ "$1" = "$partyness" ] && [ "$2" = "$trackerness" ]
then
url="$url (this one)"
fi
echo $url
}

(
echo "# First-party trackers host list"
echo "# Variant: ${partyness}-party ${trackerness}"
echo "#"
echo "# About first-party trackers: https://git.frogeye.fr/geoffrey/eulaurarien#whats-a-first-party-tracker"
echo "# Source code: https://git.frogeye.fr/geoffrey/eulaurarien"
echo "#"
echo "# In case of false positives/negatives, or any other question,"
echo "# contact me the way you like: https://geoffrey.frogeye.fr"
echo "#"
echo "# Latest versions:"
echo "# - First-party trackers : $(link first trackers)"
echo "# - … excluding redirected: $(link first only-trackers)"
echo "# - First and third party : $(link multi trackers)"
echo "# - … excluding redirected: $(link multi only-trackers)"
echo '# (you can remove `-hosts` to get the raw list)'
echo "#"
echo "# Generation date: $gen_date"
echo "# Generation software: eulaurarien $gen_software"
echo "# Number of source websites: $number_websites"
echo "# Number of source subdomains: $number_subdomains"
echo "# Number of source DNS records: ~2E9 + $number_dns"
echo "#"
echo "# Input rules: $rules_input"
echo "# Subsequent rules: $rules_found"
echo "# Output rules: $rules_output"
echo "#"
echo
sed 's|^|0.0.0.0 |' "$file_list"
) > "$file_host"

generate_hosts "firstparty-trackers" "Generated from a curated list of first-party trackers" ""
generate_hosts "firstparty-only-trackers" "Generated from a curated list of first-party trackers" "Only contain the first chain of redirection."
generate_hosts "multiparty-trackers" "Generated from known third-party trackers." "Also contains trackers used as third-party."
generate_hosts "multiparty-only-trackers" "Generated from known third-party trackers." "Do not contain trackers used in third-party. Use in combination with third-party lists."
done
done

+ 6
- 6
feed_dns.py View File

@@ -130,8 +130,8 @@ class Rapid7Parser(Parser):
self.register(record)


class DnsMassParser(Parser):
# dnsmass --output Snrql
class MassDnsParser(Parser):
# massdns --output Snrql
# --retry REFUSED,SERVFAIL --resolvers nameservers-ipv4
TYPES = {
'A': (FUNCTION_MAP['a'][0], FUNCTION_MAP['a'][1], -1, None),
@@ -140,7 +140,7 @@ class DnsMassParser(Parser):
}

def consume(self) -> None:
self.prof.enter_step('parse_dnsmass')
self.prof.enter_step('parse_massdns')
timestamp = 0
header = True
for line in self.buf:
@@ -156,7 +156,7 @@ class DnsMassParser(Parser):
header = False
else:
select, write, name_offset, value_offset = \
DnsMassParser.TYPES[split[1]]
MassDnsParser.TYPES[split[1]]
record = (
select,
write,
@@ -165,14 +165,14 @@ class DnsMassParser(Parser):
split[2][:value_offset],
)
self.register(record)
self.prof.enter_step('parse_dnsmass')
self.prof.enter_step('parse_massdns')
except KeyError:
continue


PARSERS = {
'rapid7': Rapid7Parser,
'dnsmass': DnsMassParser,
'massdns': MassDnsParser,
}

if __name__ == '__main__':


+ 1
- 6
fetch_resources.sh View File

@@ -35,12 +35,7 @@ dl http://data.iana.org/TLD/tlds-alpha-by-domain.txt temp/all_tld.temp.list
grep -v '^#' temp/all_tld.temp.list | awk '{print tolower($0)}' > temp/all_tld.list

log "Retrieving nameservers…"
rm -f nameservers
touch nameservers
[ -f nameservers.head ] && cat nameservers.head >> nameservers
dl https://public-dns.info/nameservers.txt nameservers.temp
sort -R nameservers.temp >> nameservers
rm nameservers.temp
dl https://public-dns.info/nameservers.txt nameservers/public-dns.list

log "Retrieving top subdomains…"
dl http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip


+ 0
- 160
filter_subdomains.py View File

@@ -1,160 +0,0 @@
#!/usr/bin/env python3
# pylint: disable=C0103

"""
From a list of subdomains, output only
the ones resolving to a first-party tracker.
"""

import argparse
import sys
import progressbar
import csv
import typing
import ipaddress

# DomainRule = typing.Union[bool, typing.Dict[str, 'DomainRule']]
DomainRule = typing.Union[bool, typing.Dict]
# IpRule = typing.Union[bool, typing.Dict[int, 'DomainRule']]
IpRule = typing.Union[bool, typing.Dict]

RULES_DICT: DomainRule = dict()
RULES_IP_DICT: IpRule = dict()


def get_bits(address: typing.Union[ipaddress.IPv4Address,
                                   ipaddress.IPv6Address]
             ) -> typing.Iterator[int]:
    """
    Yield every bit of an IP address, most significant bit first.

    The address is read through its ``packed`` bytes, so an IPv4 address
    produces 32 bits and an IPv6 address produces 128.

    :param address: Address object exposing ``packed``. Both IPv4 and IPv6
        objects are accepted because ``ipaddress.ip_address`` and
        ``ipaddress.ip_network`` may hand back either kind.
    :return: Iterator over ints, each one being 0 or 1.
    """
    for char in address.packed:
        # Walk each byte from its highest bit (7) down to its lowest (0)
        for i in range(7, -1, -1):
            yield (char >> i) & 0b1


def subdomain_matching(subdomain: str) -> bool:
    """
    Tell whether a hostname is covered by a registered domain rule.

    The labels are looked up from the rightmost one to the leftmost one
    in the RULES_DICT tree; the walk ends as soon as a boolean leaf is
    reached or the next label has no branch.

    :param subdomain: Dot-separated hostname.
    :return: The boolean stored on the leaf reached, False when the walk
        ends on an inner node.
    """
    node = RULES_DICT
    for label in reversed(subdomain.split('.')):
        if isinstance(node, bool) or label not in node:
            break
        node = node[label]
    return node if isinstance(node, bool) else False


def ip_matching(ip_str: str) -> bool:
    """
    Tell whether an IP address falls inside a registered IP rule.

    The address is followed bit by bit (most significant first) through
    the RULES_IP_DICT tree until a boolean leaf is reached or the next
    bit has no branch.

    :param ip_str: Textual IP address, parsed with ``ipaddress.ip_address``.
    :return: The boolean stored on the leaf reached, False when the walk
        ends on an inner node.
    :raises ValueError: When ``ip_str`` is not a valid IP address
        (raised by ``ipaddress.ip_address``).
    """
    # NOTE(review): IPv4 and IPv6 addresses would share the same bit tree;
    # presumably only IPv4 is fed here - confirm against the callers.
    ip = ipaddress.ip_address(ip_str)
    dic = RULES_IP_DICT
    for bit in get_bits(ip):
        if isinstance(dic, bool) or bit not in dic:
            break
        dic = dic[bit]
    if isinstance(dic, bool):
        return dic
    return False


def get_matching(chain: typing.List[str], no_explicit: bool = False
                 ) -> typing.Iterable[str]:
    """
    Yield the first name of a resolution chain when the chain is matched.

    The chain is read as: first element, any number of intermediate
    names, and a last element that is checked as an IP address.
    The first element is yielded when it, any intermediate name, or the
    final address matches a registered rule.

    :param chain: Resolution chain; chains with fewer than two elements
        yield nothing.
    :param no_explicit: When set, chains whose first element matches a
        domain rule by itself yield nothing.
    :return: Iterable holding either nothing or the first element.
    """
    if len(chain) < 2:
        return
    head, *middle, tail = chain
    head_hit = subdomain_matching(head)
    if head_hit and no_explicit:
        return
    middle_hit = any(subdomain_matching(name) for name in middle)
    # The address is only parsed when no name already matched
    if middle_hit or head_hit or ip_matching(tail):
        yield head


def register_rule(subdomain: str) -> None:
    """
    Record a domain rule in the RULES_DICT tree.

    The tree is keyed by labels from the rightmost to the leftmost one;
    the leftmost label of the rule is stored as a ``True`` leaf.
    Nothing is changed when a boolean leaf is met on the way down.

    :param subdomain: Dot-separated domain to register.
    """
    # Rightmost label first; str.split always returns at least one item
    *parents, leaf = subdomain.split('.')[::-1]
    node = RULES_DICT
    for label in parents:
        if isinstance(node, bool):
            return
        node = node.setdefault(label, dict())
    if isinstance(node, bool):
        return
    node[leaf] = True


def register_rule_ip(network: str) -> None:
    """
    Record an IP network rule in the RULES_IP_DICT bit tree.

    The network address is followed bit by bit (most significant first);
    the bit at position ``prefixlen - 1`` is stored as a ``True`` leaf.
    Nothing is changed when a boolean leaf is met on the way down.

    :param network: Textual network (e.g. ``192.0.2.0/24``), parsed with
        ``ipaddress.ip_network``.
    :raises ValueError: When ``network`` is not a valid network
        (raised by ``ipaddress.ip_network``).
    """
    net = ipaddress.ip_network(network)
    dic = RULES_IP_DICT
    if net.prefixlen == 0:
        # A /0 prefix has no bit to descend on: mark both branches of the
        # root as matching instead of building a leafless chain of nodes
        if not isinstance(dic, bool):
            dic[0] = True
            dic[1] = True
        return
    last_bit = net.prefixlen - 1
    for b, bit in enumerate(get_bits(net.network_address)):
        if isinstance(dic, bool):
            # A boolean leaf already sits above this network
            return
        if b == last_bit:
            dic[bit] = True
            # Bits past the prefix length are not part of the rule
            return
        dic = dic.setdefault(bit, dict())


if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Filter first-party trackers from a list of subdomains")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Input file with DNS chains")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file with one tracking subdomain per line")
    parser.add_argument(
        '-n', '--no-explicit', action='store_true',
        help="Don't output domains already blocked with rules without CNAME")
    parser.add_argument(
        '-r', '--rules', type=argparse.FileType('r'),
        help="List of domains to block (with their subdomains)")
    parser.add_argument(
        '-p', '--rules-ip', type=argparse.FileType('r'),
        help="List of IPs ranges to block")
    args = parser.parse_args()

    # Progress bar
    widgets = [
        progressbar.Percentage(),
        ' ', progressbar.SimpleProgress(),
        ' ', progressbar.Bar(),
        ' ', progressbar.Timer(),
        ' ', progressbar.AdaptiveTransferSpeed(unit='req'),
        ' ', progressbar.AdaptiveETA(),
    ]
    progress = progressbar.ProgressBar(widgets=widgets)

    # Reading rules (one rule per line, surrounding whitespace ignored)
    if args.rules:
        for rule in args.rules:
            register_rule(rule.strip())
    if args.rules_ip:
        for rule in args.rules_ip:
            register_rule_ip(rule.strip())

    # Approximating line count: only possible when the input can be
    # rewound afterwards
    if args.input.seekable():
        progress.max_value = sum(1 for _ in args.input)
        args.input.seek(0)

    # Reading domains to filter: each CSV row is one chain
    reader = csv.reader(args.input)
    progress.start()
    for chain in reader:
        for match in get_matching(chain, no_explicit=args.no_explicit):
            print(match, file=args.output)
        progress.update(progress.value + 1)
    progress.finish()

+ 2
- 2
import_rules.sh View File

@@ -18,5 +18,5 @@ cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py as

./feed_asn.py

log "Pruning old rules…"
./db.py --prune --prune-before "$BEFORE" --prune-base
# log "Pruning old rules…"
# ./db.py --prune --prune-before "$BEFORE" --prune-base

+ 2
- 0
nameservers/.gitignore View File

@@ -0,0 +1,2 @@
*.custom.list
*.cache.list

+ 24
- 0
nameservers/popular.list View File

@@ -0,0 +1,24 @@
8.8.8.8
8.8.4.4
2001:4860:4860:0:0:0:0:8888
2001:4860:4860:0:0:0:0:8844
208.67.222.222
208.67.220.220
2620:119:35::35
2620:119:53::53
4.2.2.1
4.2.2.2
8.26.56.26
8.20.247.20
84.200.69.80
84.200.70.40
2001:1608:10:25:0:0:1c04:b12f
2001:1608:10:25:0:0:9249:d69b
9.9.9.10
149.112.112.10
2620:fe::10
2620:fe::fe:10
1.1.1.1
1.0.0.1
2606:4700:4700::1111
2606:4700:4700::1001

+ 11
- 4
resolve_subdomains.sh View File

@@ -4,9 +4,16 @@ function log() {
echo -e "\033[33m$@\033[0m"
}

log "Compiling locally known subdomain…"
log "Compiling nameservers…"
pv nameservers/*.list | ./validate_list.py --ip4 | sort -u > temp/all_nameservers_ip4.list

log "Compiling subdomain…"
# Sort by last character to utilize the DNS server caching mechanism
pv subdomains/*.list | sed 's/\r$//' | rev | sort -u | rev > temp/all_subdomains.list
log "Resolving locally known subdomain…"
pv temp/all_subdomains.list | ./resolve_subdomains.py --output temp/all_resolved.csv
# (not as efficient with massdns but it's almost free so why not)
pv subdomains/*.list | ./validate_list.py --domain | rev | sort -u | rev > temp/all_subdomains.list

log "Resolving subdomain…"
massdns --output Snrql --retry REFUSED,SERVFAIL --resolvers temp/all_nameservers_ip4.list --outfile temp/all_resolved.txt temp/all_subdomains.list

log "Importing into database…"
pv temp/all_resolved.txt | ./feed_dns.py massdns

+ 35
- 0
validate_list.py View File

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# pylint: disable=C0103

"""
Filter out invalid entries (domain names and/or IPv4 addresses)
"""

import database
import argparse
import sys

if __name__ == '__main__':

    # Parsing arguments
    parser = argparse.ArgumentParser(
        description="Filter out invalid domain names.")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Input file with one entry per line (default: standard input)")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file receiving the valid entries "
             "(default: standard output)")
    parser.add_argument(
        '-d', '--domain', action='store_true',
        help="Can be domain")
    parser.add_argument(
        '-4', '--ip4', action='store_true',
        help="Can be IP4")
    args = parser.parse_args()

    # Pick the validators once rather than re-testing the flags per line;
    # the order (domain, then IPv4) mirrors the flags above
    validators = []
    if args.domain:
        validators.append(database.Database.validate_domain)
    if args.ip4:
        validators.append(database.Database.validate_ip4address)
    if not validators:
        # Without any accepted kind every line would be dropped silently
        parser.error("at least one of --domain or --ip4 is required")

    # A line is kept as soon as one of the selected validators accepts it
    for line in args.input:
        line = line.strip()
        if any(validate(line) for validate in validators):
            print(line, file=args.output)

Loading…
Cancel
Save