Merge pull request #44 from eigenmagic/allowlist-thresholds
Config options for blocklist thresholds
This commit is contained in:
commit
61748acb1a
|
@ -71,7 +71,7 @@ def sync_blocklists(conf: argparse.Namespace):
|
||||||
import_fields, conf.save_intermediate, conf.savedir, export_fields))
|
import_fields, conf.save_intermediate, conf.savedir, export_fields))
|
||||||
|
|
||||||
# Merge blocklists into an update dict
|
# Merge blocklists into an update dict
|
||||||
merged = merge_blocklists(blocklists, conf.mergeplan)
|
merged = merge_blocklists(blocklists, conf.mergeplan, conf.merge_threshold, conf.merge_threshold_type)
|
||||||
|
|
||||||
# Remove items listed in allowlists, if any
|
# Remove items listed in allowlists, if any
|
||||||
allowlists = fetch_allowlists(conf)
|
allowlists = fetch_allowlists(conf)
|
||||||
|
@ -222,17 +222,21 @@ def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max',
|
||||||
if threshold_type == 'count':
|
if threshold_type == 'count':
|
||||||
domain_threshold_level = len(domain_blocks[domain])
|
domain_threshold_level = len(domain_blocks[domain])
|
||||||
elif threshold_type == 'pct':
|
elif threshold_type == 'pct':
|
||||||
domain_threshold_level = len(domain_blocks[domain]) / num_blocklists
|
domain_threshold_level = len(domain_blocks[domain]) / num_blocklists * 100
|
||||||
|
# log.debug(f"domain threshold level: {domain_threshold_level}")
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported threshold type '{threshold_type}'. Supported values are: 'count', 'pct'")
|
raise ValueError(f"Unsupported threshold type '{threshold_type}'. Supported values are: 'count', 'pct'")
|
||||||
|
|
||||||
|
log.debug(f"Checking if {domain_threshold_level} >= {threshold} for {domain}")
|
||||||
if domain_threshold_level >= threshold:
|
if domain_threshold_level >= threshold:
|
||||||
# Add first block in the list to merged
|
# Add first block in the list to merged
|
||||||
block = domain_blocks[domain][0]
|
block = domain_blocks[domain][0]
|
||||||
|
log.debug(f"Yes. Merging block: {block}")
|
||||||
|
|
||||||
# Merge the others with this record
|
# Merge the others with this record
|
||||||
for newblock in domain_blocks[domain][1:]:
|
for newblock in domain_blocks[domain][1:]:
|
||||||
block = apply_mergeplan(block, newblock, mergeplan)
|
block = apply_mergeplan(block, newblock, mergeplan)
|
||||||
merged.blocks[block.domain] = block
|
merged.blocks[block.domain] = block
|
||||||
|
|
||||||
return merged
|
return merged
|
||||||
|
|
||||||
|
@ -706,6 +710,12 @@ def augment_args(args, tomldata: str=None):
|
||||||
if not args.mergeplan:
|
if not args.mergeplan:
|
||||||
args.mergeplan = conf.get('mergeplan', 'max')
|
args.mergeplan = conf.get('mergeplan', 'max')
|
||||||
|
|
||||||
|
if not args.merge_threshold:
|
||||||
|
args.merge_threshold = conf.get('merge_threshold', 0)
|
||||||
|
|
||||||
|
if not args.merge_threshold_type:
|
||||||
|
args.merge_threshold_type = conf.get('merge_threshold_type', 'count')
|
||||||
|
|
||||||
args.blocklist_url_sources = conf.get('blocklist_url_sources', [])
|
args.blocklist_url_sources = conf.get('blocklist_url_sources', [])
|
||||||
args.blocklist_instance_sources = conf.get('blocklist_instance_sources', [])
|
args.blocklist_instance_sources = conf.get('blocklist_instance_sources', [])
|
||||||
args.allowlist_url_sources = conf.get('allowlist_url_sources', [])
|
args.allowlist_url_sources = conf.get('allowlist_url_sources', [])
|
||||||
|
@ -727,6 +737,8 @@ def setup_argparse():
|
||||||
ap.add_argument('-S', '--save-intermediate', dest="save_intermediate", action='store_true', help="Save intermediate blocklists we fetch to local files.")
|
ap.add_argument('-S', '--save-intermediate', dest="save_intermediate", action='store_true', help="Save intermediate blocklists we fetch to local files.")
|
||||||
ap.add_argument('-D', '--savedir', dest="savedir", help="Directory path to save intermediate lists.")
|
ap.add_argument('-D', '--savedir', dest="savedir", help="Directory path to save intermediate lists.")
|
||||||
ap.add_argument('-m', '--mergeplan', choices=['min', 'max'], help="Set mergeplan.")
|
ap.add_argument('-m', '--mergeplan', choices=['min', 'max'], help="Set mergeplan.")
|
||||||
|
ap.add_argument('--merge-threshold', type=int, help="Merge threshold value")
|
||||||
|
ap.add_argument('--merge-threshold-type', choices=['count', 'pct'], help="Type of merge threshold to use.")
|
||||||
|
|
||||||
ap.add_argument('-I', '--import-field', dest='import_fields', action='append', help="Extra blocklist fields to import.")
|
ap.add_argument('-I', '--import-field', dest='import_fields', action='append', help="Extra blocklist fields to import.")
|
||||||
ap.add_argument('-E', '--export-field', dest='export_fields', action='append', help="Extra blocklist fields to export.")
|
ap.add_argument('-E', '--export-field', dest='export_fields', action='append', help="Extra blocklist fields to export.")
|
||||||
|
|
|
@ -49,3 +49,33 @@ allowlist_url_sources = [ { url='file:///path/to/allowlist', format='csv'} ]
|
||||||
'url': 'file:///path/to/allowlist',
|
'url': 'file:///path/to/allowlist',
|
||||||
'format': 'csv',
|
'format': 'csv',
|
||||||
}]
|
}]
|
||||||
|
|
||||||
|
def test_set_merge_thresold_default():
    """Check the merge settings produced by a config that sets none of them.

    An effectively empty TOML document is supplied, so every merge option
    must come from its fallback value: mergeplan 'max', threshold type
    'count' and a numeric threshold of 0.
    """
    tomldata = """
"""
    args = shim_argparse([], tomldata)

    assert args.mergeplan == 'max'
    assert args.merge_threshold_type == 'count'
    # The numeric threshold has a fallback too (0); a test of the defaults
    # should pin it alongside the threshold type.
    assert args.merge_threshold == 0
|
||||||
|
|
||||||
|
def test_set_merge_thresold_count():
    """A 'count' merge threshold given in the TOML config reaches the args.

    The config sets only the threshold type and value; mergeplan is left
    unset and is expected to come back as 'max'.
    """
    config_text = """# Add a merge threshold
merge_threshold_type = 'count'
merge_threshold = 2
"""
    parsed = shim_argparse([], config_text)

    # Compare all three merge-related settings in one go.
    observed = (
        parsed.mergeplan,
        parsed.merge_threshold_type,
        parsed.merge_threshold,
    )
    assert observed == ('max', 'count', 2)
|
||||||
|
|
||||||
|
def test_set_merge_thresold_pct():
    """A 'pct' merge threshold given in the TOML config reaches the args.

    The config sets only the threshold type and value; mergeplan is left
    unset and is expected to come back as 'max'.
    """
    config_text = """# Add a merge threshold
merge_threshold_type = 'pct'
merge_threshold = 35
"""
    parsed = shim_argparse([], config_text)

    # Compare all three merge-related settings in one go.
    observed = (
        parsed.mergeplan,
        parsed.merge_threshold_type,
        parsed.merge_threshold,
    )
    assert observed == ('max', 'pct', 35)
|
||||||
|
|
|
@ -0,0 +1,153 @@
|
||||||
|
"""Test merge with thresholds
|
||||||
|
"""
|
||||||
|
|
||||||
|
from fediblockhole.blocklists import Blocklist, parse_blocklist
|
||||||
|
from fediblockhole import merge_blocklists, apply_mergeplan
|
||||||
|
|
||||||
|
from fediblockhole.const import SeverityLevel, DomainBlock
|
||||||
|
|
||||||
|
# Names of CSV fixture files. Nothing in the tests visible here passes them
# to load_test_blocklist_data(); they are kept for callers that do.
datafile01, datafile02, datafile03 = (
    "data-suspends-01.csv",
    "data-silences-01.csv",
    "data-noop-01.csv",
)

# Field names handed to parse_blocklist() when loading fixture files.
import_fields = [
    'domain', 'severity',
    'public_comment', 'private_comment',
    'reject_media', 'reject_reports', 'obfuscate',
]
|
||||||
|
|
||||||
|
def load_test_blocklist_data(datafiles):
    """Read each file in *datafiles* and parse it as a CSV blocklist.

    Every file's text is passed to ``parse_blocklist`` together with the
    file name, the format 'csv' and the module-level ``import_fields``.

    Returns a list holding one parsed result per input file, in input order.
    """
    parsed = []
    for path in datafiles:
        with open(path) as handle:
            contents = handle.read()
        parsed.append(parse_blocklist(contents, path, 'csv', import_fields))
    return parsed
|
||||||
|
|
||||||
|
def test_mergeplan_count_2():
    """Only merge a block if present in 2 or more lists.

    'onemention', 'twomention' and 'threemention' appear in one, two and
    three of the blocklists respectively. Merging with ``threshold=2`` must
    drop the first and keep the other two.
    """
    def suspend(domain):
        # Every fixture entry is the same block apart from its domain.
        return DomainBlock(domain, 'suspend', '', '', True, True, True)

    bl_1 = Blocklist('test01', {
        name: suspend(name) for name in (
            'onemention.example.org',
            'twomention.example.org',
            'threemention.example.org',
        )
    })

    bl_2 = Blocklist('test2', {
        name: suspend(name) for name in (
            'twomention.example.org',
            'threemention.example.org',
        )
    })

    # Listed once only: a repeated key in a dict literal is silently
    # collapsed into a single entry, so it adds nothing to the count.
    bl_3 = Blocklist('test3', {
        'threemention.example.org': suspend('threemention.example.org'),
    })

    ml = merge_blocklists([bl_1, bl_2, bl_3], 'max', threshold=2)

    assert 'onemention.example.org' not in ml
    assert 'twomention.example.org' in ml
    assert 'threemention.example.org' in ml
|
||||||
|
|
||||||
|
def test_mergeplan_count_3():
    """Only merge a block if present in 3 or more lists.

    'onemention', 'twomention' and 'threemention' appear in one, two and
    three of the blocklists respectively. Merging with ``threshold=3`` must
    keep only the last.
    """
    def suspend(domain):
        # Every fixture entry is the same block apart from its domain.
        return DomainBlock(domain, 'suspend', '', '', True, True, True)

    bl_1 = Blocklist('test01', {
        name: suspend(name) for name in (
            'onemention.example.org',
            'twomention.example.org',
            'threemention.example.org',
        )
    })

    bl_2 = Blocklist('test2', {
        name: suspend(name) for name in (
            'twomention.example.org',
            'threemention.example.org',
        )
    })

    # Listed once only: a repeated key in a dict literal is silently
    # collapsed into a single entry, so it adds nothing to the count.
    bl_3 = Blocklist('test3', {
        'threemention.example.org': suspend('threemention.example.org'),
    })

    ml = merge_blocklists([bl_1, bl_2, bl_3], 'max', threshold=3)

    assert 'onemention.example.org' not in ml
    assert 'twomention.example.org' not in ml
    assert 'threemention.example.org' in ml
|
||||||
|
|
||||||
|
def test_mergeplan_pct_30():
    """Only merge a block if present in at least 30% of the lists.

    With four blocklists that means two or more: a single mention is 25%,
    two mentions are 50%.
    """
    def suspend(domain):
        # Every fixture entry is the same block apart from its domain.
        return DomainBlock(domain, 'suspend', '', '', True, True, True)

    bl_1 = Blocklist('test01', {
        name: suspend(name) for name in (
            'onemention.example.org',
            'twomention.example.org',
            'fourmention.example.org',
        )
    })

    bl_2 = Blocklist('test2', {
        name: suspend(name) for name in (
            'twomention.example.org',
            'threemention.example.org',
            'fourmention.example.org',
        )
    })

    bl_3 = Blocklist('test3', {
        name: suspend(name) for name in (
            'threemention.example.org',
            'fourmention.example.org',
        )
    })

    bl_4 = Blocklist('test4', {
        name: suspend(name) for name in (
            'threemention.example.org',
            'fourmention.example.org',
        )
    })

    ml = merge_blocklists([bl_1, bl_2, bl_3, bl_4], 'max', threshold=30, threshold_type='pct')

    assert 'onemention.example.org' not in ml
    assert 'twomention.example.org' in ml
    assert 'threemention.example.org' in ml
    assert 'fourmention.example.org' in ml
|
||||||
|
|
||||||
|
def test_mergeplan_pct_55():
    """Only merge a block if present in at least 55% of the lists.

    With four blocklists that means three or more: two mentions are only
    50%, three mentions are 75%.
    """
    def suspend(domain):
        # Every fixture entry is the same block apart from its domain.
        return DomainBlock(domain, 'suspend', '', '', True, True, True)

    bl_1 = Blocklist('test01', {
        name: suspend(name) for name in (
            'onemention.example.org',
            'twomention.example.org',
            'fourmention.example.org',
        )
    })

    bl_2 = Blocklist('test2', {
        name: suspend(name) for name in (
            'twomention.example.org',
            'threemention.example.org',
            'fourmention.example.org',
        )
    })

    bl_3 = Blocklist('test3', {
        name: suspend(name) for name in (
            'threemention.example.org',
            'fourmention.example.org',
        )
    })

    bl_4 = Blocklist('test4', {
        name: suspend(name) for name in (
            'threemention.example.org',
            'fourmention.example.org',
        )
    })

    ml = merge_blocklists([bl_1, bl_2, bl_3, bl_4], 'max', threshold=55, threshold_type='pct')

    assert 'onemention.example.org' not in ml
    assert 'twomention.example.org' not in ml
    assert 'threemention.example.org' in ml
    assert 'fourmention.example.org' in ml
|
Loading…
Reference in New Issue