Compare commits

..

No commits in common. "87bb24c51109259a6c5fb8c2a1d6166994c4be96" and "88f0bcc64857723f4fa3b820874304a6f0778e30" have entirely different histories.

2 changed files with 15 additions and 23 deletions

View file

@@ -5,7 +5,6 @@ From a list of subdomains, output only
the ones resolving to a first-party tracker. the ones resolving to a first-party tracker.
""" """
import argparse
import logging import logging
import os import os
import queue import queue
@@ -195,29 +194,11 @@ def main() -> None:
Also shows a nice progressbar. Also shows a nice progressbar.
""" """
# Initialization
coloredlogs.install( coloredlogs.install(
level='DEBUG', level='DEBUG',
fmt='%(asctime)s %(name)s %(levelname)s %(message)s' fmt='%(asctime)s %(name)s %(levelname)s %(message)s'
) )
# Parsing arguments
parser = argparse.ArgumentParser(
description="Filter first-party trackers from a list of subdomains")
parser.add_argument(
'-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
help="Input file with one subdomain per line")
parser.add_argument(
'-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
help="Outptut file with one tracking subdomain per line")
# parser.add_argument(
# '-n', '--nameserver', type=argparse.FileType('r'),
# default='nameservers', help="File with one nameserver per line")
# parser.add_argument(
# '-j', '--workers', type=int, default=512,
# help="Number of threads to use")
args = parser.parse_args()
# Progress bar # Progress bar
widgets = [ widgets = [
progressbar.Percentage(), progressbar.Percentage(),
@@ -229,8 +210,19 @@ def main() -> None:
] ]
progress = progressbar.ProgressBar(widgets=widgets) progress = progressbar.ProgressBar(widgets=widgets)
# Parsing arguments
assert len(sys.argv) <= 2
filename = None
if len(sys.argv) == 2 and sys.argv[1] != '-':
filename = sys.argv[1]
progress.max_value = sum(1 for line in open(filename))
textio = open(filename)
else:
textio = sys.stdin
# Cleaning input # Cleaning input
iterator = iter(args.input) iterator = iter(textio)
iterator = map(str.strip, iterator) iterator = map(str.strip, iterator)
iterator = filter(None, iterator) iterator = filter(None, iterator)
@@ -244,7 +236,7 @@ def main() -> None:
for subdomain, matching in Orchestrator(iterator, servers).run(): for subdomain, matching in Orchestrator(iterator, servers).run():
progress.update(progress.value + 1) progress.update(progress.value + 1)
if matching: if matching:
print(subdomain, file=args.output) print(subdomain)
progress.finish() progress.finish()

View file

@@ -3,7 +3,7 @@
# Filter out the subdomains not pointing to a first-party tracker # Filter out the subdomains not pointing to a first-party tracker
cat subdomains/*.list | sort -u > temp/all_subdomains.list cat subdomains/*.list | sort -u > temp/all_subdomains.list
./filter_subdomains.py --input temp/all_subdomains.list --output temp/all_toblock.list ./filter_subdomains.py temp/all_subdomains.list > temp/all_toblock.list
sort -u temp/all_toblock.list > dist/firstparty-trackers.txt sort -u temp/all_toblock.list > dist/firstparty-trackers.txt
# Format the blocklist so it can be used as a hostlist # Format the blocklist so it can be used as a hostlist
@@ -21,7 +21,7 @@ sort -u temp/all_toblock.list > dist/firstparty-trackers.txt
echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)" echo "# Number of source subdomains: $(wc -l temp/all_subdomains.list | cut -d' ' -f1)"
echo "# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')" echo "# Number of known trackers : $(python -c 'import regexes; print(len(regexes.REGEXES))')"
echo "# Number of blocked subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)" echo "# Number of blocked subdomains: $(wc -l dist/firstparty-trackers.txt | cut -d' ' -f1)"
echo "# Number of first-party subdomains: $(./filter_out_explicit.py dist/firstparty-trackers.txt | wc -l)" echo "# Number of first-party subdomains: $(./filter_out_explicit.py dist/firstparty-trackers.txt | wc)"
echo echo
cat dist/firstparty-trackers.txt | while read host; cat dist/firstparty-trackers.txt | while read host;
do do