Browse Source
Improved rules handling
Improved rules handling
Rules can now come in 3 different formats: AdBlock rules, host lists, and domain lists. All will be converted into domain lists and aggregated (only AdBlock rules matching a whole domain will be kept). A subdomain will now be matched if it is a subdomain of any domain in the rules. It is way faster (seconds rather than hours!) but less flexible (although it shouldn't be a problem). newworkflow_parseropti v1.4
11 changed files with 130 additions and 28 deletions
-
49adblock_to_domain_list.py
-
1eulaurarien.sh
-
18fetch_resources.sh
-
45filter_subdomains.py
-
28filter_subdomains.sh
-
7resolve_subdomains.sh
-
4rules/.gitignore
-
1rules/first-party.list
-
1rules/first-party.txt
-
2rules_adblock/.gitignore
-
2rules_hosts/.gitignore
@ -0,0 +1,49 @@ |
|||
#!/usr/bin/env python3 |
|||
# pylint: disable=C0103 |
|||
|
|||
""" |
|||
Extract the domains to block as a whole |
|||
from an AdBlock rules list. |
|||
""" |
|||
|
|||
import argparse |
|||
import sys |
|||
import typing |
|||
|
|||
import abp.filters |
|||
|
|||
|
|||
def get_domains(rule: 'abp.filters.parser.Filter') -> typing.Iterable[str]:
    """
    Yield the domain that an AdBlock rule blocks as a whole, if any.

    A rule qualifies only when all of the following hold:

    - it is a blocking rule (exception rules such as
      ``@@||example.com^`` have another action and must never end up
      in a block list);
    - it carries no option other than ``third-party``;
    - its URL pattern has the exact form ``||domain^``, where
      ``domain`` contains no path, port, wildcard or extra anchor.

    At most one domain is yielded; nothing is yielded otherwise.
    """
    # Exception ("allow") rules and element-hiding rules do not block
    # a domain, so they must not contribute to the list.
    if rule.action != 'block':
        return

    # Any option besides third-party narrows the rule's scope, so it
    # no longer covers the whole domain.
    if any(key != 'third-party' for key, _ in rule.options):
        return

    selector = rule.selector
    if selector['type'] != 'url-pattern':
        return

    pattern = selector['value']
    if not (pattern.startswith('||') and pattern.endswith('^')):
        return

    domain = pattern[2:-1]
    # These characters mean the pattern targets specific URLs (path,
    # port, query, wildcard, extra separator/anchor), not a bare domain.
    if not domain or any(char in '/*^|:?' for char in domain):
        return

    yield domain
|||
|
|||
|
|||
if __name__ == '__main__':

    # Parsing arguments
    # The description mirrors the module docstring; it previously held
    # a "TODO" placeholder that was shown verbatim in --help.
    parser = argparse.ArgumentParser(
        description="Extract the domains to block as a whole "
                    "from an AdBlock rules list")
    parser.add_argument(
        '-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
        help="Input file with AdBlock rules")
    parser.add_argument(
        '-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file with one rule tracking subdomain per line")
    args = parser.parse_args()

    # Reading rules
    rules = abp.filters.parse_filterlist(args.input)

    # Filtering
    for rule in rules:
        # The parser also returns non-filter lines; only Filter
        # entries are passed on to get_domains().
        if not isinstance(rule, abp.filters.parser.Filter):
            continue
        for domain in get_domains(rule):
            print(domain, file=args.output)
@ -0,0 +1,7 @@ |
|||
#!/usr/bin/env bash

# Resolve the CNAME chain of all the known subdomains for later analysis

all_subdomains=temp/all_subdomains.list
all_resolved=temp/all_resolved.csv

# Merge every subdomain list into a single sorted, de-duplicated file
sort -u subdomains/*.list > "$all_subdomains"

# Run the resolver over the merged list
./resolve_subdomains.py --input "$all_subdomains" --output "$all_resolved"

# Keep a sorted, de-duplicated copy of the resolver output
sort -u "$all_resolved" > temp/all_resolved_sorted.csv
|||
|
@ -1,2 +1,2 @@ |
|||
*.custom.txt |
|||
*.cache.txt |
|||
*.custom.list |
|||
*.cache.list |
@ -0,0 +1 @@ |
|||
at-o.net |
@ -1 +0,0 @@ |
|||
||at-o.net^ |
@ -0,0 +1,2 @@ |
|||
*.custom.txt |
|||
*.cache.txt |
@ -0,0 +1,2 @@ |
|||
*.custom.txt |
|||
*.cache.txt |
Write
Preview
Loading…
Cancel
Save
Reference in new issue