Add merge thresholds to merge_blocklists()

This commit is contained in:
Justin Warren 2023-01-22 09:02:03 +11:00
parent e0207c4371
commit c018ebdf35
No known key found for this signature in database
1 changed file with 19 additions and 6 deletions

View File

@ -180,16 +180,23 @@ def fetch_from_instances(blocklists: dict, sources: dict,
save_intermediate_blocklist(blocklists[itemsrc], domain, savedir, export_fields) save_intermediate_blocklist(blocklists[itemsrc], domain, savedir, export_fields)
return blocklists return blocklists
def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', threshold: int=0) -> dict: def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max',
threshold: int=0,
threshold_type: str='count') -> dict:
"""Merge fetched remote blocklists into a bulk update """Merge fetched remote blocklists into a bulk update
@param blocklists: A dict of lists of DomainBlocks, keyed by source. @param blocklists: A dict of lists of DomainBlocks, keyed by source.
Each value is a list of DomainBlocks Each value is a list of DomainBlocks
@param mergeplan: An optional method of merging overlapping block definitions @param mergeplan: An optional method of merging overlapping block definitions
'max' (the default) uses the highest severity block found 'max' (the default) uses the highest severity block found
'min' uses the lowest severity block found 'min' uses the lowest severity block found
@param threshold: An integer percentage [0-100]. @param threshold: An integer used in the threshold mechanism.
If a domain is not present in this pct or more of the blocklists, If a domain is not present in this number/pct or more of the blocklists,
it will not get merged into the final list. it will not get merged into the final list.
@param threshold_type: choice of ['count', 'pct']
If `count`, threshold is met if block is present in `threshold`
or more blocklists.
If `pct`, threshold is met if the ratio
count_of_mentions / number_of_blocklists is `threshold` or more.
@param returns: A dict of DomainBlocks keyed by domain @param returns: A dict of DomainBlocks keyed by domain
""" """
merged = {} merged = {}
@ -209,10 +216,16 @@ def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', threshol
else: else:
domain_blocks[block.domain] = [block,] domain_blocks[block.domain] = [block,]
# Only merge items if there are more than `threshold` pct of them # Only merge items if `threshold` is met or exceeded
for domain in domain_blocks: for domain in domain_blocks:
pct = len(domain_blocks[domain]) / num_blocklists if threshold_type == 'count':
if pct >= threshold: domain_threshold_level = len(domain_blocks[domain])
elif threshold_type == 'pct':
domain_threshold_level = len(domain_blocks[domain]) / num_blocklists
else:
raise ValueError(f"Unsupported threshold type '{threshold_type}'. Supported values are: 'count', 'pct'")
if domain_threshold_level >= threshold:
# Add first block in the list to merged # Add first block in the list to merged
merged[domain] = domain_blocks[domain][0] merged[domain] = domain_blocks[domain][0]
# Merge the others with this record # Merge the others with this record