From b67ff0c47174b66996e19ab4ba335d67fa49fac7 Mon Sep 17 00:00:00 2001 From: Justin Warren Date: Sun, 12 Feb 2023 17:53:26 +1100 Subject: [PATCH 1/2] Merging domain above threshold needs to be in the threshold check block. Added debugging statements for threshold merging. --- src/fediblockhole/__init__.py | 8 +- tests/test_merge_thresholds.py | 153 +++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+), 2 deletions(-) create mode 100644 tests/test_merge_thresholds.py diff --git a/src/fediblockhole/__init__.py b/src/fediblockhole/__init__.py index f1bc354..465b08d 100755 --- a/src/fediblockhole/__init__.py +++ b/src/fediblockhole/__init__.py @@ -222,17 +222,21 @@ def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', if threshold_type == 'count': domain_threshold_level = len(domain_blocks[domain]) elif threshold_type == 'pct': - domain_threshold_level = len(domain_blocks[domain]) / num_blocklists + domain_threshold_level = len(domain_blocks[domain]) / num_blocklists * 100 + # log.debug(f"domain threshold level: {domain_threshold_level}") else: raise ValueError(f"Unsupported threshold type '{threshold_type}'. Supported values are: 'count', 'pct'") + log.debug(f"Checking if {domain_threshold_level} >= {threshold} for {domain}") if domain_threshold_level >= threshold: # Add first block in the list to merged block = domain_blocks[domain][0] + log.debug(f"Yes. 
Merging block: {block}") + # Merge the others with this record for newblock in domain_blocks[domain][1:]: block = apply_mergeplan(block, newblock, mergeplan) - merged.blocks[block.domain] = block + merged.blocks[block.domain] = block return merged diff --git a/tests/test_merge_thresholds.py b/tests/test_merge_thresholds.py new file mode 100644 index 0000000..4cde03e --- /dev/null +++ b/tests/test_merge_thresholds.py @@ -0,0 +1,153 @@ +"""Test merge with thresholds +""" + +from fediblockhole.blocklists import Blocklist, parse_blocklist +from fediblockhole import merge_blocklists, apply_mergeplan + +from fediblockhole.const import SeverityLevel, DomainBlock + +datafile01 = "data-suspends-01.csv" +datafile02 = "data-silences-01.csv" +datafile03 = "data-noop-01.csv" + +import_fields = [ + 'domain', + 'severity', + 'public_comment', + 'private_comment', + 'reject_media', + 'reject_reports', + 'obfuscate' +] + +def load_test_blocklist_data(datafiles): + + blocklists = [] + + for df in datafiles: + with open(df) as fp: + data = fp.read() + bl = parse_blocklist(data, df, 'csv', import_fields) + blocklists.append(bl) + + return blocklists + +def test_mergeplan_count_2(): + """Only merge a block if present in 2 or more lists + """ + + bl_1 = Blocklist('test01', { + 'onemention.example.org': DomainBlock('onemention.example.org', 'suspend', '', '', True, True, True), + 'twomention.example.org': DomainBlock('twomention.example.org', 'suspend', '', '', True, True, True), + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + }) + + bl_2 = Blocklist('test2', { + 'twomention.example.org': DomainBlock('twomention.example.org', 'suspend', '', '', True, True, True), + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + }) + + bl_3 = Blocklist('test3', { + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + 
'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + }) + + ml = merge_blocklists([bl_1, bl_2, bl_3], 'max', threshold=2) + + assert 'onemention.example.org' not in ml + assert 'twomention.example.org' in ml + assert 'threemention.example.org' in ml + +def test_mergeplan_count_3(): + """Only merge a block if present in 3 or more lists + """ + + bl_1 = Blocklist('test01', { + 'onemention.example.org': DomainBlock('onemention.example.org', 'suspend', '', '', True, True, True), + 'twomention.example.org': DomainBlock('twomention.example.org', 'suspend', '', '', True, True, True), + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + }) + + bl_2 = Blocklist('test2', { + 'twomention.example.org': DomainBlock('twomention.example.org', 'suspend', '', '', True, True, True), + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + }) + + bl_3 = Blocklist('test3', { + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + }) + + ml = merge_blocklists([bl_1, bl_2, bl_3], 'max', threshold=3) + + assert 'onemention.example.org' not in ml + assert 'twomention.example.org' not in ml + assert 'threemention.example.org' in ml + +def test_mergeplan_pct_30(): + """Only merge a block if present in at least 30% of the lists + """ + + bl_1 = Blocklist('test01', { + 'onemention.example.org': DomainBlock('onemention.example.org', 'suspend', '', '', True, True, True), + 'twomention.example.org': DomainBlock('twomention.example.org', 'suspend', '', '', True, True, True), + 'fourmention.example.org': DomainBlock('fourmention.example.org', 'suspend', '', '', True, True, True), + + }) + + bl_2 = Blocklist('test2', { + 'twomention.example.org': DomainBlock('twomention.example.org', 
'suspend', '', '', True, True, True), + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + 'fourmention.example.org': DomainBlock('fourmention.example.org', 'suspend', '', '', True, True, True), + }) + + bl_3 = Blocklist('test3', { + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + 'fourmention.example.org': DomainBlock('fourmention.example.org', 'suspend', '', '', True, True, True), + }) + + bl_4 = Blocklist('test4', { + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + 'fourmention.example.org': DomainBlock('fourmention.example.org', 'suspend', '', '', True, True, True), + }) + + ml = merge_blocklists([bl_1, bl_2, bl_3, bl_4], 'max', threshold=30, threshold_type='pct') + + assert 'onemention.example.org' not in ml + assert 'twomention.example.org' in ml + assert 'threemention.example.org' in ml + assert 'fourmention.example.org' in ml + +def test_mergeplan_pct_55(): + """Only merge a block if present in at least 55% of the lists + """ + + bl_1 = Blocklist('test01', { + 'onemention.example.org': DomainBlock('onemention.example.org', 'suspend', '', '', True, True, True), + 'twomention.example.org': DomainBlock('twomention.example.org', 'suspend', '', '', True, True, True), + 'fourmention.example.org': DomainBlock('fourmention.example.org', 'suspend', '', '', True, True, True), + + }) + + bl_2 = Blocklist('test2', { + 'twomention.example.org': DomainBlock('twomention.example.org', 'suspend', '', '', True, True, True), + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + 'fourmention.example.org': DomainBlock('fourmention.example.org', 'suspend', '', '', True, True, True), + }) + + bl_3 = Blocklist('test3', { + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + 'fourmention.example.org': 
DomainBlock('fourmention.example.org', 'suspend', '', '', True, True, True), + }) + + bl_4 = Blocklist('test4', { + 'threemention.example.org': DomainBlock('threemention.example.org', 'suspend', '', '', True, True, True), + 'fourmention.example.org': DomainBlock('fourmention.example.org', 'suspend', '', '', True, True, True), + }) + + ml = merge_blocklists([bl_1, bl_2, bl_3, bl_4], 'max', threshold=55, threshold_type='pct') + + assert 'onemention.example.org' not in ml + assert 'twomention.example.org' not in ml + assert 'threemention.example.org' in ml + assert 'fourmention.example.org' in ml \ No newline at end of file From bb1d89e8bea39ca278b23ae2876cbfa3935e4f19 Mon Sep 17 00:00:00 2001 From: Justin Warren Date: Sun, 12 Feb 2023 18:06:07 +1100 Subject: [PATCH 2/2] Added blocklist threshold config params. Added tests for threshold config params. Actually using the config values in merge_blocklists() --- src/fediblockhole/__init__.py | 10 +++++++++- tests/test_configfile.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/src/fediblockhole/__init__.py b/src/fediblockhole/__init__.py index 465b08d..8a0823c 100755 --- a/src/fediblockhole/__init__.py +++ b/src/fediblockhole/__init__.py @@ -71,7 +71,7 @@ def sync_blocklists(conf: argparse.Namespace): import_fields, conf.save_intermediate, conf.savedir, export_fields)) # Merge blocklists into an update dict - merged = merge_blocklists(blocklists, conf.mergeplan) + merged = merge_blocklists(blocklists, conf.mergeplan, conf.merge_threshold, conf.merge_threshold_type) # Remove items listed in allowlists, if any allowlists = fetch_allowlists(conf) @@ -710,6 +710,12 @@ def augment_args(args, tomldata: str=None): if not args.mergeplan: args.mergeplan = conf.get('mergeplan', 'max') + if not args.merge_threshold: + args.merge_threshold = conf.get('merge_threshold', 0) + + if not args.merge_threshold_type: + args.merge_threshold_type = conf.get('merge_threshold_type', 'count') + 
args.blocklist_url_sources = conf.get('blocklist_url_sources', []) args.blocklist_instance_sources = conf.get('blocklist_instance_sources', []) args.allowlist_url_sources = conf.get('allowlist_url_sources', []) @@ -731,6 +737,8 @@ def setup_argparse(): ap.add_argument('-S', '--save-intermediate', dest="save_intermediate", action='store_true', help="Save intermediate blocklists we fetch to local files.") ap.add_argument('-D', '--savedir', dest="savedir", help="Directory path to save intermediate lists.") ap.add_argument('-m', '--mergeplan', choices=['min', 'max'], help="Set mergeplan.") + ap.add_argument('--merge-threshold', type=int, help="Merge threshold value") + ap.add_argument('--merge-threshold-type', choices=['count', 'pct'], help="Type of merge threshold to use.") ap.add_argument('-I', '--import-field', dest='import_fields', action='append', help="Extra blocklist fields to import.") ap.add_argument('-E', '--export-field', dest='export_fields', action='append', help="Extra blocklist fields to export.") diff --git a/tests/test_configfile.py b/tests/test_configfile.py index 4b2c1e7..9e31c9d 100644 --- a/tests/test_configfile.py +++ b/tests/test_configfile.py @@ -49,3 +49,33 @@ allowlist_url_sources = [ { url='file:///path/to/allowlist', format='csv'} ] 'url': 'file:///path/to/allowlist', 'format': 'csv', }] + +def test_set_merge_thresold_default(): + tomldata = """ +""" + args = shim_argparse([], tomldata) + + assert args.mergeplan == 'max' + assert args.merge_threshold_type == 'count' + +def test_set_merge_thresold_count(): + tomldata = """# Add a merge threshold +merge_threshold_type = 'count' +merge_threshold = 2 +""" + args = shim_argparse([], tomldata) + + assert args.mergeplan == 'max' + assert args.merge_threshold_type == 'count' + assert args.merge_threshold == 2 + +def test_set_merge_thresold_pct(): + tomldata = """# Add a merge threshold +merge_threshold_type = 'pct' +merge_threshold = 35 +""" + args = shim_argparse([], tomldata) + + assert 
args.mergeplan == 'max' + assert args.merge_threshold_type == 'pct' + assert args.merge_threshold == 35