From 6f960aefec79be3cc7375a85dd991e23b0999a53 Mon Sep 17 00:00:00 2001 From: Shawn Grigson Date: Sun, 20 Aug 2023 02:50:05 -0500 Subject: [PATCH 1/3] Create Domain Audit File Option Allows for the creation of an audit file that will explain for each domain the number of matches across available sources --- etc/sample.fediblockhole.conf.toml | 3 ++ src/fediblockhole/__init__.py | 60 +++++++++++++++++++++--- src/fediblockhole/blocklists.py | 29 +++++++++++- src/fediblockhole/const.py | 75 ++++++++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 7 deletions(-) diff --git a/etc/sample.fediblockhole.conf.toml b/etc/sample.fediblockhole.conf.toml index bd93663..44cd6ff 100644 --- a/etc/sample.fediblockhole.conf.toml +++ b/etc/sample.fediblockhole.conf.toml @@ -42,6 +42,9 @@ blocklist_instance_destinations = [ ## File to save the fully merged blocklist into # blocklist_savefile = '/tmp/merged_blocklist.csv' +## File to save the audit log of counts across sources +# blocklist_auditfile = '/tmp/domain_counts_list.csv' + ## Don't push blocklist to instances, even if they're defined above # no_push_instance = false diff --git a/src/fediblockhole/__init__.py b/src/fediblockhole/__init__.py index 67c80ea..01e6a96 100755 --- a/src/fediblockhole/__init__.py +++ b/src/fediblockhole/__init__.py @@ -11,8 +11,8 @@ import os.path import sys import urllib.request as urlr -from .blocklists import Blocklist, parse_blocklist -from .const import DomainBlock, BlockSeverity +from .blocklists import Blocklist, BlockAuditList, parse_blocklist +from .const import DomainBlock, BlockSeverity, BlockAudit from importlib.metadata import version __version__ = version('fediblockhole') @@ -71,7 +71,7 @@ def sync_blocklists(conf: argparse.Namespace): import_fields, conf.save_intermediate, conf.savedir, export_fields)) # Merge blocklists into an update dict - merged = merge_blocklists(blocklists, conf.mergeplan, conf.merge_threshold, conf.merge_threshold_type) + merged = 
merge_blocklists(blocklists, conf.mergeplan, conf.merge_threshold, conf.merge_threshold_type, conf.blocklist_auditfile) # Remove items listed in allowlists, if any allowlists = fetch_allowlists(conf) @@ -183,7 +183,8 @@ def fetch_from_instances(sources: dict, def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', threshold: int=0, - threshold_type: str='count') -> Blocklist: + threshold_type: str='count', + save_block_audit_file: str='') -> Blocklist: """Merge fetched remote blocklists into a bulk update @param blocklists: A dict of lists of DomainBlocks, keyed by source. Each value is a list of DomainBlocks @@ -201,6 +202,7 @@ def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', @param returns: A dict of DomainBlocks keyed by domain """ merged = Blocklist('fediblockhole.merge_blocklists') + audit = BlockAuditList('fediblockhole.merge_blocklists') num_blocklists = len(blocklists) @@ -219,10 +221,12 @@ def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', # Only merge items if `threshold` is met or exceeded for domain in domain_blocks: + domain_matches_count = len(domain_blocks[domain]) + domain_matches_percent = domain_matches_count / num_blocklists * 100 if threshold_type == 'count': - domain_threshold_level = len(domain_blocks[domain]) + domain_threshold_level = domain_matches_count elif threshold_type == 'pct': - domain_threshold_level = len(domain_blocks[domain]) / num_blocklists * 100 + domain_threshold_level = domain_matches_percent # log.debug(f"domain threshold level: {domain_threshold_level}") else: raise ValueError(f"Unsupported threshold type '{threshold_type}'. 
def save_domain_block_audit_to_file(
    blocklist: BlockAuditList,
    filepath: str):
    """Save an audit log of per-domain match counts as a CSV file.

    The file gets a header row followed by one row per domain, ordered by
    domain name, with the columns 'domain', 'count' and 'percent'.

    @param blocklist: A BlockAuditList; its `blocks` attribute maps each
        domain to the audit entry for that domain. Entries only need a
        dict-style `.get()`.
    @param filepath: The path to the file the list should be saved in.
    @raises AttributeError: if `blocklist` has no `blocks` mapping.
    @raises OSError: if the file cannot be opened for writing.
    """
    export_fields = ['domain', 'count', 'percent']

    # Sort on the domain key alone so that audit entries themselves are
    # never compared with each other. Errors are left to propagate: the
    # previous handlers dropped into an interactive debugger and then
    # continued with an unbound variable.
    sorted_list = sorted(blocklist.blocks.items(), key=lambda item: item[0])

    log.debug("exporting audit file")

    # newline='' hands line-ending control to the csv module, as its
    # documentation requires for files passed to a writer.
    with open(filepath, "w", newline='') as fp:
        writer = csv.DictWriter(fp, export_fields, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(entry for _domain, entry in sorted_list)
# --- src/fediblockhole/const.py ---

class BlockAudit(object):
    """Audit record for a single domain.

    Holds how many source blocklists matched the domain, as an absolute
    count and as a percentage. Supports a small dict-like interface
    (iteration over field names, `[]` and `.get()` access).
    """

    # Field names that are always present
    fields = [
        'domain',
        'count',
        'percent',
    ]

    # Every field name accepted by __getitem__ / get
    all_fields = [
        'domain',
        'count',
        'percent',
        'id'
    ]

    def __init__(self, domain: str,
                 count: int=0,
                 percent: float=0,
                 id: int=None):
        """Initialize the BlockAudit

        @param domain: The domain this record describes.
        @param count: Number of sources that matched the domain.
        @param percent: The same figure expressed as a percentage.
        @param id: Optional identifier; omitted from dict output when falsy.
        """
        self.domain = domain
        self.count = count
        self.percent = percent
        self.id = id

    def _asdict(self):
        """Return a dict version of this object

        'id' is only included when it is truthy.
        """
        dictval = {
            'domain': self.domain,
            'count': self.count,
            'percent': self.percent,
        }
        if self.id:
            dictval['id'] = self.id

        return dictval

    def __repr__(self):
        # Previously returned an empty string, which made log and debugger
        # output useless.
        return f"<BlockAudit {self.domain} count={self.count} percent={self.percent}>"

    def copy(self):
        """Make a copy of this object and return it
        """
        return BlockAudit(**self._asdict())

    def update(self, dict):
        """Set an attribute on this object for every key in `dict`
        """
        for key in dict:
            setattr(self, key, dict[key])

    def __iter__(self):
        """Iterate over this record's field names, dict-style"""
        # Work on a copy: appending to `self.fields` directly would mutate
        # the list shared by every BlockAudit instance.
        keys = list(self.fields)

        if getattr(self, 'id', False):
            keys.append('id')

        yield from keys

    def __getitem__(self, k, default=None):
        """Behave like a dict for getting values

        @raises KeyError: if `k` is not one of `all_fields`.
        """
        if k not in self.all_fields:
            raise KeyError(f"Invalid key '{k}'")

        return getattr(self, k, default)

    def get(self, k, default=None):
        """Dict-style accessor; unknown keys still raise KeyError"""
        return self.__getitem__(k, default)


# --- src/fediblockhole/blocklists.py ---

@dataclass
class BlockAuditList:
    """ A BlockAuditList object

    A BlockAuditList is a collection of BlockAudits from an origin,
    keyed by domain.
    """
    # Label for where this list came from
    origin: str = None
    # Audit entries keyed by domain
    blocks: dict[str, BlockAudit] = field(default_factory=dict)

    def __len__(self):
        """Number of domains in the list"""
        return len(self.blocks)

    def __class_getitem__(cls, item):
        # Subscripting the class itself yields the type of `blocks`
        return dict[str, BlockAudit]

    def __getitem__(self, item):
        """Look up the audit entry for a domain"""
        return self.blocks[item]

    def __iter__(self):
        """Iterate over the domains in the list"""
        return iter(self.blocks)

    def items(self):
        """(domain, entry) pairs, as for a dict"""
        return self.blocks.items()

    def values(self):
        """Audit entries, as for a dict"""
        return self.blocks.values()
Each value is a list of DomainBlocks @@ -242,7 +242,7 @@ def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', block = apply_mergeplan(block, newblock, mergeplan) merged.blocks[block.domain] = block - if len(save_block_audit_file) > 0: + if save_block_audit_file: blockdata:BlockAudit = { 'domain': domain, 'count': domain_matches_count, @@ -250,7 +250,7 @@ def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', } audit.blocks[domain] = blockdata - if len(save_block_audit_file) > 0: + if save_block_audit_file: log.info(f"Saving audit file to {save_block_audit_file}") save_domain_block_audit_to_file(audit, save_block_audit_file) @@ -746,7 +746,7 @@ def augment_args(args, tomldata: str=None): args.savedir = conf.get('savedir', '/tmp') if not args.blocklist_auditfile: - args.blocklist_auditfile = conf.get('blocklist_auditfile', '') + args.blocklist_auditfile = conf.get('blocklist_auditfile', None) if not args.export_fields: args.export_fields = conf.get('export_fields', []) From d413f27e8379c438d746eb0e0bd924a2582e02d8 Mon Sep 17 00:00:00 2001 From: Shawn Grigson Date: Wed, 6 Sep 2023 11:30:46 -0500 Subject: [PATCH 3/3] Add config option to README --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 89062bd..bdd295c 100644 --- a/README.md +++ b/README.md @@ -363,6 +363,11 @@ The filename is based on the URL or domain used so you can tell where each list Sets where to save intermediate blocklist files. Defaults to `/tmp`. +### blocklist_auditfile + +If provided, will save an audit file of counts and percentages by domain. Useful for debugging +thresholds. Defaults to None. + ### no_push_instance Defaults to False.