Browse Source

Worflow: Fixed rules counts

newworkflow_threaded^2
Geoffrey Frogeye 2 years ago
parent
commit
269b8278b5
Signed by: geoffrey GPG Key ID: D8A7ECA00A8CD3DD
  1. 20
      database.py
  2. 2
      database_schema.sql
  3. 25
      export.py
  4. 18
      filter_subdomains.sh
  5. 4
      new_workflow.sh

20
database.py

@ -23,7 +23,7 @@ DbValue = typing.Union[None, int, float, str, bytes]
class Database():
VERSION = 4
VERSION = 5
PATH = "blocking.db"
def open(self) -> None:
@ -250,6 +250,24 @@ class Database():
else:
yield val
def count_rules(self,
first_party_only: bool = False,
) -> str:
counts: typing.List[str] = list()
cursor = self.conn.cursor()
for table in ['asn', 'ip4network', 'ip4address', 'zone', 'hostname']:
command = f'SELECT count(*) FROM rules ' \
f'INNER JOIN {table} ON rules.id = {table}.entry ' \
'WHERE rules.level = 0'
if first_party_only:
command += ' AND first_party=1'
cursor.execute(command)
count, = cursor.fetchone()
if count > 0:
counts.append(f'{table}: {count}')
return ', '.join(counts)
def get_domain(self, domain: str) -> typing.Iterable[int]:
self.enter_step('get_domain_prepare')
domain_prep = self.pack_hostname(domain)

2
database_schema.sql

@ -11,6 +11,8 @@ CREATE TABLE rules (
FOREIGN KEY (source) REFERENCES rules(id) ON DELETE CASCADE
);
CREATE INDEX rules_source ON rules (source); -- for references recounting
CREATE INDEX rules_updated ON rules (updated); -- for pruning
CREATE INDEX rules_level_firstparty ON rules (level, first_party); -- for counting rules
CREATE TABLE asn (
val INTEGER PRIMARY KEY,

25
export.py

@ -22,15 +22,28 @@ if __name__ == '__main__':
parser.add_argument(
'-x', '--explain', action='store_true',
help="TODO")
parser.add_argument(
'-r', '--rules', action='store_true',
help="TODO")
parser.add_argument(
'-c', '--count', action='store_true',
help="TODO")
args = parser.parse_args()
DB = database.Database()
for domain in DB.export(
first_party_only=args.first_party,
end_chain_only=args.end_chain,
explain=args.explain,
):
print(domain, file=args.output)
if args.rules:
if not args.count:
raise NotImplementedError
print(DB.count_rules(first_party_only=args.first_party))
else:
if args.count:
raise NotImplementedError
for domain in DB.export(
first_party_only=args.first_party,
end_chain_only=args.end_chain,
explain=args.explain,
):
print(domain, file=args.output)
DB.close()

18
filter_subdomains.sh

@ -4,6 +4,9 @@ function log() {
echo -e "\033[33m$@\033[0m"
}
log "Pruning old data…"
./database.py --prune
log "Recounting references…"
./database.py --references
@ -14,6 +17,8 @@ log "Exporting lists…"
./export.py --end-chain --output dist/multiparty-only-trackers.txt
log "Generating hosts lists…"
./export.py --rules --count --first-party > temp/count_rules_firstparty.txt
./export.py --rules --count > temp/count_rules_multiparty.txt
function generate_hosts {
basename="$1"
description="$2"
@ -39,15 +44,16 @@ function generate_hosts {
echo "#"
echo "# Generation date: $(date -Isec)"
echo "# Generation software: eulaurarien $(git describe --tags)"
echo "# Number of source websites: TODO"
echo "# Number of source subdomains: TODO"
echo "# Number of source websites: $(wc -l temp/all_websites.list | cut -d' ' -f1)"
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of source DNS records: ~2M + $(wc -l temp/all_resolved.json | cut -d' ' -f1)"
echo "#"
echo "# Number of known first-party trackers: TODO"
echo "# Number of first-party subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# Known first-party trackers: $(cat temp/count_rules_firstparty.txt)"
echo "# Number of first-party hostnames: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/firstparty-only-trackers.txt | cut -d' ' -f1)"
echo "#"
echo "# Number of known multi-party trackers: TODO"
echo "# Number of multi-party subdomains: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
echo "# Known multi-party trackers: $(cat temp/count_rules_multiparty.txt)"
echo "# Number of multi-party hostnames: $(wc -l dist/multiparty-trackers.txt | cut -d' ' -f1)"
echo "# … excluding redirected: $(wc -l dist/multiparty-only-trackers.txt | cut -d' ' -f1)"
echo
sed 's|^|0.0.0.0 |' "dist/$basename.txt"

4
new_workflow.sh

@ -4,9 +4,7 @@ function log() {
echo -e "\033[33m$@\033[0m"
}
log "Preparing database…"
./database.py --expire
./fetch_resources.sh
./import_rules.sh
# TODO Fetch 'em

Loading…
Cancel
Save