From e19f6663312abd1337c722e5274d3409f617793c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geoffrey=20=E2=80=9CFrogeye=E2=80=9D=20Preud=27homme?= Date: Fri, 13 Dec 2019 08:23:38 +0100 Subject: [PATCH] Workflow: Automatically import IP ranges from ASN Closes #9 --- database.py | 43 +++++++++++++++++++++++++++++++-- feed_rules.py | 15 ++++++------ filter_subdomains.sh | 19 ++++++--------- import_rules.sh | 5 ++++ rules_asn/.gitignore | 2 ++ rules_asn/first-party.txt | 10 ++++++++ rules_ip/first-party.txt | 51 --------------------------------------- 7 files changed, 72 insertions(+), 73 deletions(-) create mode 100644 rules_asn/.gitignore create mode 100644 rules_asn/first-party.txt diff --git a/database.py b/database.py index 4daf0ec..aa38604 100755 --- a/database.py +++ b/database.py @@ -33,6 +33,9 @@ class Database(): # self.conn.create_function("prepare_ip4address", 1, # Database.prepare_ip4address, # deterministic=True) + self.conn.create_function("unpack_domain", 1, + lambda s: s[:-1][::-1], + deterministic=True) def execute(self, cmd: str, args: typing.Union[ typing.Tuple[DbValue, ...], @@ -123,6 +126,13 @@ class Database(): def prepare_zone(self, zone: str) -> str: return self.prepare_hostname(zone) + @staticmethod + def prepare_asn(asn: str) -> int: + asn = asn.upper() + if asn.startswith('AS'): + asn = asn[2:] + return int(asn) + @staticmethod def prepare_ip4address(address: str) -> int: total = 0 @@ -169,7 +179,7 @@ class Database(): def export(self, first_party_only: bool = False, end_chain_only: bool = False) -> typing.Iterable[str]: - command = 'SELECT val FROM rules ' \ + command = 'SELECT unpack_domain(val) FROM rules ' \ 'INNER JOIN hostname ON rules.id = hostname.entry' restrictions: typing.List[str] = list() if first_party_only: @@ -178,9 +188,10 @@ class Database(): restrictions.append('rules.refs = 0') if restrictions: command += ' WHERE ' + ' AND '.join(restrictions) + command += ' ORDER BY unpack_domain(val) ASC' self.execute(command) for val, in self.cursor: - yield val[:-1][::-1] + yield val def get_domain(self, domain: str) -> typing.Iterable[int]: self.enter_step('get_domain_prepare') @@ -235,6 +246,13 @@ class Database(): self.enter_step('get_ip4_yield') yield entry + def list_asn(self) -> typing.Iterable[typing.Tuple[str, int]]: + self.enter_step('list_asn_select') + self.enter_step('get_domain_select') + self.execute('SELECT val, entry FROM asn') + for val, entry in self.cursor: + yield f'AS{val}', entry + def _set_generic(self, table: str, select_query: str, @@ -325,8 +343,29 @@ class Database(): *args, **kwargs ) + def set_asn(self, asn: str, + *args: typing.Any, **kwargs: typing.Any) -> None: + self.enter_step('set_asn_prepare') + try: + asn_prep = self.prepare_asn(asn) + except ValueError: + self.log.error("Invalid asn: %s", asn) + return + prep: typing.Dict[str, DbValue] = { + 'val': asn_prep, + } + self._set_generic( + 'asn', + 'SELECT entry FROM asn WHERE val=:val', + 'INSERT INTO asn (val, entry) ' + 'VALUES (:val, :entry)', + prep, + *args, **kwargs + ) + def set_ip4address(self, ip4address: str, *args: typing.Any, **kwargs: typing.Any) -> None: + # TODO Do not add if already in ip4network self.enter_step('set_ip4add_prepare') try: ip4address_prep = self.prepare_ip4address(ip4address) diff --git a/feed_rules.py b/feed_rules.py index 7a19614..a1d236d 100755 --- a/feed_rules.py +++ b/feed_rules.py @@ -3,8 +3,12 @@ import database import argparse import sys -import ipaddress +FUNCTION_MAP = { + 'zone': database.Database.set_zone, + 'ip4network': database.Database.set_ip4network, + 'asn': database.Database.set_asn, +} if __name__ == '__main__': @@ -13,7 +17,7 @@ if __name__ == '__main__': description="TODO") parser.add_argument( 'type', - choices={'zone', 'ip4network'}, + choices=FUNCTION_MAP.keys(), help="Type of rule inputed") parser.add_argument( '-i', '--input', type=argparse.FileType('r'), default=sys.stdin, @@ -25,14 +29,9 @@ if __name__ == '__main__': DB = database.Database() - FUNCTION_MAP = { - 'zone': DB.set_zone, - 'ip4network': DB.set_ip4network, - } - fun = FUNCTION_MAP[args.type] for rule in args.input: - fun(rule.strip(), is_first_party=args.first_party) + fun(DB, rule.strip(), is_first_party=args.first_party) DB.close() diff --git a/filter_subdomains.sh b/filter_subdomains.sh index 98638a9..67783e8 100755 --- a/filter_subdomains.sh +++ b/filter_subdomains.sh @@ -4,16 +4,13 @@ function log() { echo -e "\033[33m$@\033[0m" } -log "Updating references…" -./database.py --references - log "Exporting lists…" -./export.py --first-party | sort -u > dist/firstparty-trackers.txt -./export.py --first-party --end-chain | sort -u > dist/firstparty-only-trackers.txt -./export.py | sort -u > dist/multiparty-trackers.txt -./export.py --end-chain | sort -u > dist/multiparty-only-trackers.txt +./export.py --first-party --output dist/firstparty-trackers.txt +./export.py --first-party --end-chain --output dist/firstparty-only-trackers.txt +./export.py --output dist/multiparty-trackers.txt +./export.py --end-chain --output dist/multiparty-only-trackers.txt -# Format the blocklist so it can be used as a hostlist +log "Generating hosts lists…" function generate_hosts { basename="$1" description="$2" @@ -35,6 +32,7 @@ function generate_hosts { echo "# - … excluding redirected: https://hostfiles.frogeye.fr/firstparty-only-trackers-hosts.txt" echo "# - First and third party : https://hostfiles.frogeye.fr/multiparty-trackers-hosts.txt" echo "# - … excluding redirected: https://hostfiles.frogeye.fr/multiparty-only-trackers-hosts.txt" + echo '# (you can remove `-hosts` to get the raw list)' echo "#" echo "# Generation date: $(date -Isec)" echo "# Generation software: eulaurarien $(git describe --tags)" @@ -49,10 +47,7 @@ function generate_hosts { echo "# Number of multi-party subdomains: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)" echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)" echo - cat "dist/$basename.txt" | while read host; - do - echo "0.0.0.0 $host" - done + sed 's|^|0.0.0.0 |' "dist/$basename.txt" ) > "dist/$basename-hosts.txt" } diff --git a/import_rules.sh b/import_rules.sh index d4d4719..358155c 100755 --- a/import_rules.sh +++ b/import_rules.sh @@ -9,6 +9,11 @@ cat rules_adblock/*.txt | grep -v '^!' | grep -v '^\[Adblock' | ./adblock_to_dom cat rules_hosts/*.txt | grep -v '^#' | grep -v '^$' | cut -d ' ' -f2 | ./feed_rules.py zone cat rules/*.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone cat rules_ip/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network +cat rules_asn/*.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn + cat rules/first-party.list | grep -v '^#' | grep -v '^$' | ./feed_rules.py zone --first-party cat rules_ip/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py ip4network --first-party +cat rules_asn/first-party.txt | grep -v '^#' | grep -v '^$' | ./feed_rules.py asn --first-party + +./feed_asn.py diff --git a/rules_asn/.gitignore b/rules_asn/.gitignore new file mode 100644 index 0000000..d2df6a8 --- /dev/null +++ b/rules_asn/.gitignore @@ -0,0 +1,2 @@ +*.custom.txt +*.cache.txt diff --git a/rules_asn/first-party.txt b/rules_asn/first-party.txt new file mode 100644 index 0000000..e7b93fa --- /dev/null +++ b/rules_asn/first-party.txt @@ -0,0 +1,10 @@ +# Eulerian +AS50234 +# Criteo +AS44788 +AS19750 +AS55569 +# ThreatMetrix +AS30286 +# Webtrekk +AS60164 diff --git a/rules_ip/first-party.txt b/rules_ip/first-party.txt index 3561894..e69de29 100644 --- a/rules_ip/first-party.txt +++ b/rules_ip/first-party.txt @@ -1,51 +0,0 @@ -# Eulerian (AS50234 EULERIAN TECHNOLOGIES S.A.S.) -109.232.192.0/21 -# Criteo (AS44788 Criteo SA) -91.199.242.0/24 -91.212.98.0/24 -178.250.0.0/21 -178.250.0.0/24 -178.250.1.0/24 -178.250.2.0/24 -178.250.3.0/24 -178.250.4.0/24 -178.250.6.0/24 -185.235.84.0/24 -# Criteo (AS19750 Criteo Corp.) -74.119.116.0/22 -74.119.117.0/24 -74.119.118.0/24 -74.119.119.0/24 -91.199.242.0/24 -185.235.85.0/24 -199.204.168.0/22 -199.204.168.0/24 -199.204.169.0/24 -199.204.170.0/24 -199.204.171.0/24 -178.250.0.0/21 -91.212.98.0/24 -91.199.242.0/24 -185.235.84.0/24 -# Criteo (AS55569 Criteo APAC) -91.199.242.0/24 -116.213.20.0/22 -116.213.20.0/24 -116.213.21.0/24 -182.161.72.0/22 -182.161.72.0/24 -182.161.73.0/24 -185.235.86.0/24 -185.235.87.0/24 -# ThreatMetrix (AS30286 ThreatMetrix Inc.) -69.84.176.0/24 -173.254.179.0/24 -185.32.240.0/23 -185.32.242.0/23 -192.225.156.0/22 -199.101.156.0/23 -199.101.158.0/23 -# Webtrekk (AS60164 Webtrekk GmbH) -185.54.148.0/22 -185.54.150.0/24 -185.54.151.0/24