From 70b1ff32ff1f1fde3589fbf60ed5ef8756d02cd5 Mon Sep 17 00:00:00 2001 From: Justin Warren Date: Tue, 17 Jan 2023 09:04:34 +1100 Subject: [PATCH 1/5] Support a merge threshold level. Added a Blocklist object. Refactored tests to match changed code. --- src/fediblockhole/__init__.py | 70 +++++++++++++------ .../{blocklist_parser.py => blocklists.py} | 44 ++++++++++-- src/fediblockhole/const.py | 4 +- tests/test_mergeplan.py | 9 ++- tests/test_parser_csv.py | 54 +++++++------- tests/test_parser_json.py | 41 ++++++----- tests/test_parser_rapidblockcsv.py | 12 ++-- tests/test_parser_rapidblockjson.py | 32 ++++----- 8 files changed, 165 insertions(+), 101 deletions(-) rename src/fediblockhole/{blocklist_parser.py => blocklists.py} (87%) diff --git a/src/fediblockhole/__init__.py b/src/fediblockhole/__init__.py index 945e29c..893787d 100755 --- a/src/fediblockhole/__init__.py +++ b/src/fediblockhole/__init__.py @@ -11,7 +11,7 @@ import os.path import sys import urllib.request as urlr -from .blocklist_parser import parse_blocklist +from .blocklists import Blocklist, parse_blocklist from .const import DomainBlock, BlockSeverity from importlib.metadata import version @@ -178,41 +178,71 @@ def fetch_from_instances(blocklists: dict, sources: dict, save_intermediate_blocklist(blocklists[itemsrc], domain, savedir, export_fields) return blocklists -def merge_blocklists(blocklists: dict, mergeplan: str='max') -> dict: +def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', threshold: int=0) -> dict: """Merge fetched remote blocklists into a bulk update @param blocklists: A dict of lists of DomainBlocks, keyed by source. Each value is a list of DomainBlocks @param mergeplan: An optional method of merging overlapping block definitions 'max' (the default) uses the highest severity block found 'min' uses the lowest severity block found + @param threshold: An integer percentage [0-100]. 
+ If a domain is not present in this pct or more of the blocklists, + it will not get merged into the final list. @param returns: A dict of DomainBlocks keyed by domain """ merged = {} - for key, blist in blocklists.items(): - log.debug(f"processing blocklist from: {key} ...") - for newblock in blist: - domain = newblock.domain - # If the domain has two asterisks in it, it's obfuscated - # and we can't really use it, so skip it and do the next one - if '*' in domain: + num_blocklists = len(blocklists) + + # Create a domain keyed list of blocks for each domain + domain_blocks = {} + + for bl in blocklists: + for block in bl.values(): + if '*' in block.domain: log.debug(f"Domain '{domain}' is obfuscated. Skipping it.") continue - - elif domain in merged: - log.debug(f"Overlapping block for domain {domain}. Merging...") - blockdata = apply_mergeplan(merged[domain], newblock, mergeplan) - + elif block.domain in domain_blocks: + domain_blocks[block.domain].append(block) else: - # New block - blockdata = newblock + domain_blocks[block.domain] = [block,] - # end if - log.debug(f"blockdata is: {blockdata}") - merged[domain] = blockdata - # end for + # Only merge items if there are more than `threshold` pct of them + for domain in domain_blocks: + pct = len(domain_blocks[domain]) / num_blocklists + if pct >= threshold: + # Add first block in the list to merged + merged[domain] = domain_blocks[domain][0] + # Merge the others with this record + for block in domain_blocks[domain][1:]: + merged[domain] = apply_mergeplan(merged[domain], block, mergeplan) + return merged + # for key, blist in blocklists.items(): + # log.debug(f"processing blocklist from: {key} ...") + # for newblock in blist: + # domain = newblock.domain + # # If the domain has two asterisks in it, it's obfuscated + # # and we can't really use it, so skip it and do the next one + # if '*' in domain: + # log.debug(f"Domain '{domain}' is obfuscated. 
Skipping it.") + # continue + + # elif domain in merged: + # log.debug(f"Overlapping block for domain {domain}. Merging...") + # blockdata = apply_mergeplan(merged[domain], newblock, mergeplan) + + # else: + # # New block + # blockdata = newblock + + # # end if + # log.debug(f"blockdata is: {blockdata}") + # merged[domain] = blockdata + # # end for + # return merged + def apply_mergeplan(oldblock: DomainBlock, newblock: DomainBlock, mergeplan: str='max') -> dict: """Use a mergeplan to decide how to merge two overlapping block definitions diff --git a/src/fediblockhole/blocklist_parser.py b/src/fediblockhole/blocklists.py similarity index 87% rename from src/fediblockhole/blocklist_parser.py rename to src/fediblockhole/blocklists.py index d5d8394..33b71a2 100644 --- a/src/fediblockhole/blocklist_parser.py +++ b/src/fediblockhole/blocklists.py @@ -1,14 +1,42 @@ """Parse various blocklist data formats """ -from typing import Iterable -from .const import DomainBlock, BlockSeverity - import csv import json +from typing import Iterable +from dataclasses import dataclass, field + +from .const import DomainBlock, BlockSeverity import logging log = logging.getLogger('fediblockhole') +@dataclass +class Blocklist: + """ A Blocklist object + + A Blocklist is a list of DomainBlocks from an origin + """ + origin: str = None + blocks: dict[str, DomainBlock] = field(default_factory=dict) + + def __len__(self): + return len(self.blocks) + + def __class_getitem__(cls, item): + return dict[str, DomainBlock] + + def __getitem__(self, item): + return self.blocks[item] + + def __iter__(self): + return self.blocks.__iter__() + + def items(self): + return self.blocks.items() + + def values(self): + return self.blocks.values() + class BlocklistParser(object): """ Base class for parsing blocklists @@ -30,7 +58,7 @@ class BlocklistParser(object): """ raise NotImplementedError - def parse_blocklist(self, blockdata) -> dict[DomainBlock]: + def parse_blocklist(self, blockdata, origin:str=None) 
-> Blocklist: """Parse an iterable of blocklist items @param blocklist: An Iterable of blocklist items @returns: A dict of DomainBlocks, keyed by domain @@ -38,9 +66,10 @@ class BlocklistParser(object): if self.preparse: blockdata = self.preparse(blockdata) - parsed_list = [] + parsed_list = Blocklist(origin) for blockitem in blockdata: - parsed_list.append(self.parse_item(blockitem)) + block = self.parse_item(blockitem) + parsed_list.blocks[block.domain] = block return parsed_list def parse_item(self, blockitem) -> DomainBlock: @@ -178,6 +207,7 @@ FORMAT_PARSERS = { # helper function to select the appropriate Parser def parse_blocklist( blockdata, + origin, format="csv", import_fields: list=['domain', 'severity'], max_severity: str='suspend'): @@ -185,4 +215,4 @@ def parse_blocklist( """ parser = FORMAT_PARSERS[format](import_fields, max_severity) log.debug(f"parsing {format} blocklist with import_fields: {import_fields}...") - return parser.parse_blocklist(blockdata) \ No newline at end of file + return parser.parse_blocklist(blockdata, origin) \ No newline at end of file diff --git a/src/fediblockhole/const.py b/src/fediblockhole/const.py index 93cf2ef..7ed9f48 100644 --- a/src/fediblockhole/const.py +++ b/src/fediblockhole/const.py @@ -123,7 +123,8 @@ class DomainBlock(object): reject_media: bool=False, reject_reports: bool=False, obfuscate: bool=False, - id: int=None): + id: int=None, + count: int=0): """Initialize the DomainBlock """ self.domain = domain @@ -134,6 +135,7 @@ class DomainBlock(object): self.reject_reports = reject_reports self.obfuscate = obfuscate self.id = id + self.count = 0 @property def severity(self): diff --git a/tests/test_mergeplan.py b/tests/test_mergeplan.py index 55f3914..42d2816 100644 --- a/tests/test_mergeplan.py +++ b/tests/test_mergeplan.py @@ -1,7 +1,7 @@ """Various mergeplan tests """ -from fediblockhole.blocklist_parser import parse_blocklist +from fediblockhole.blocklists import parse_blocklist from fediblockhole import 
merge_blocklists, merge_comments, apply_mergeplan from fediblockhole.const import SeverityLevel, DomainBlock @@ -22,20 +22,19 @@ import_fields = [ def load_test_blocklist_data(datafiles): - blocklists = {} + blocklists = [] for df in datafiles: with open(df) as fp: data = fp.read() - bl = parse_blocklist(data, 'csv', import_fields) - blocklists[df] = bl + bl = parse_blocklist(data, df, 'csv', import_fields) + blocklists.append(bl) return blocklists def test_mergeplan_max(): """Test 'max' mergeplan""" blocklists = load_test_blocklist_data([datafile01, datafile02]) - bl = merge_blocklists(blocklists, 'max') assert len(bl) == 13 diff --git a/tests/test_parser_csv.py b/tests/test_parser_csv.py index c817e16..703fe95 100644 --- a/tests/test_parser_csv.py +++ b/tests/test_parser_csv.py @@ -1,22 +1,24 @@ """Tests of the CSV parsing """ -from fediblockhole.blocklist_parser import BlocklistParserCSV, parse_blocklist -from fediblockhole.const import DomainBlock, BlockSeverity, SeverityLevel +from fediblockhole.blocklists import BlocklistParserCSV, parse_blocklist +from fediblockhole.const import SeverityLevel def test_single_line(): csvdata = "example.org" + origin = "csvfile" parser = BlocklistParserCSV() - bl = parser.parse_blocklist(csvdata) + bl = parser.parse_blocklist(csvdata, origin) assert len(bl) == 0 def test_header_only(): csvdata = "domain,severity,public_comment" + origin = "csvfile" parser = BlocklistParserCSV() - bl = parser.parse_blocklist(csvdata) + bl = parser.parse_blocklist(csvdata, origin) assert len(bl) == 0 def test_2_blocks(): @@ -24,12 +26,13 @@ def test_2_blocks(): example.org,silence example2.org,suspend """ + origin = "csvfile" parser = BlocklistParserCSV() - bl = parser.parse_blocklist(csvdata) + bl = parser.parse_blocklist(csvdata, origin) assert len(bl) == 2 - assert bl[0].domain == 'example.org' + assert 'example.org' in bl def test_4_blocks(): csvdata = """domain,severity,public_comment @@ -38,20 +41,21 @@ example2.org,suspend,"test 2" 
example3.org,noop,"test 3" example4.org,suspend,"test 4" """ + origin = "csvfile" parser = BlocklistParserCSV() - bl = parser.parse_blocklist(csvdata) + bl = parser.parse_blocklist(csvdata, origin) assert len(bl) == 4 - assert bl[0].domain == 'example.org' - assert bl[1].domain == 'example2.org' - assert bl[2].domain == 'example3.org' - assert bl[3].domain == 'example4.org' + assert 'example.org' in bl + assert 'example2.org' in bl + assert 'example3.org' in bl + assert 'example4.org' in bl - assert bl[0].severity.level == SeverityLevel.SILENCE - assert bl[1].severity.level == SeverityLevel.SUSPEND - assert bl[2].severity.level == SeverityLevel.NONE - assert bl[3].severity.level == SeverityLevel.SUSPEND + assert bl['example.org'].severity.level == SeverityLevel.SILENCE + assert bl['example2.org'].severity.level == SeverityLevel.SUSPEND + assert bl['example3.org'].severity.level == SeverityLevel.NONE + assert bl['example4.org'].severity.level == SeverityLevel.SUSPEND def test_ignore_comments(): csvdata = """domain,severity,public_comment,private_comment @@ -60,18 +64,18 @@ example2.org,suspend,"test 2","ignote me also" example3.org,noop,"test 3","and me" example4.org,suspend,"test 4","also me" """ + origin = "csvfile" parser = BlocklistParserCSV() - bl = parser.parse_blocklist(csvdata) + bl = parser.parse_blocklist(csvdata, origin) assert len(bl) == 4 - assert bl[0].domain == 'example.org' - assert bl[1].domain == 'example2.org' - assert bl[2].domain == 'example3.org' - assert bl[3].domain == 'example4.org' + assert 'example.org' in bl + assert 'example2.org' in bl + assert 'example3.org' in bl + assert 'example4.org' in bl - assert bl[0].public_comment == '' - assert bl[0].private_comment == '' - - assert bl[2].public_comment == '' - assert bl[2].private_comment == '' \ No newline at end of file + assert bl['example.org'].public_comment == '' + assert bl['example.org'].private_comment == '' + assert bl['example3.org'].public_comment == '' + assert 
bl['example4.org'].private_comment == '' \ No newline at end of file diff --git a/tests/test_parser_json.py b/tests/test_parser_json.py index 8bf17df..b2fb0a1 100644 --- a/tests/test_parser_json.py +++ b/tests/test_parser_json.py @@ -1,8 +1,8 @@ """Tests of the CSV parsing """ -from fediblockhole.blocklist_parser import BlocklistParserJSON, parse_blocklist -from fediblockhole.const import DomainBlock, BlockSeverity, SeverityLevel +from fediblockhole.blocklists import BlocklistParserJSON, parse_blocklist +from fediblockhole.const import SeverityLevel datafile = 'data-mastodon.json' @@ -14,33 +14,32 @@ def test_json_parser(): data = load_data() parser = BlocklistParserJSON() - bl = parser.parse_blocklist(data) + bl = parser.parse_blocklist(data, 'test_json') assert len(bl) == 10 - assert bl[0].domain == 'example.org' - assert bl[1].domain == 'example2.org' - assert bl[2].domain == 'example3.org' - assert bl[3].domain == 'example4.org' + assert 'example.org' in bl + assert 'example2.org' in bl + assert 'example3.org' in bl + assert 'example4.org' in bl - assert bl[0].severity.level == SeverityLevel.SUSPEND - assert bl[1].severity.level == SeverityLevel.SILENCE - assert bl[2].severity.level == SeverityLevel.SUSPEND - assert bl[3].severity.level == SeverityLevel.NONE + assert bl['example.org'].severity.level == SeverityLevel.SUSPEND + assert bl['example2.org'].severity.level == SeverityLevel.SILENCE + assert bl['example3.org'].severity.level == SeverityLevel.SUSPEND + assert bl['example4.org'].severity.level == SeverityLevel.NONE def test_ignore_comments(): data = load_data() parser = BlocklistParserJSON() - bl = parser.parse_blocklist(data) + bl = parser.parse_blocklist(data, 'test_json') assert len(bl) == 10 - assert bl[0].domain == 'example.org' - assert bl[1].domain == 'example2.org' - assert bl[2].domain == 'example3.org' - assert bl[3].domain == 'example4.org' + assert 'example.org' in bl + assert 'example2.org' in bl + assert 'example3.org' in bl + assert 
'example4.org' in bl - assert bl[0].public_comment == '' - assert bl[0].private_comment == '' - - assert bl[2].public_comment == '' - assert bl[2].private_comment == '' \ No newline at end of file + assert bl['example.org'].public_comment == '' + assert bl['example.org'].private_comment == '' + assert bl['example3.org'].public_comment == '' + assert bl['example4.org'].private_comment == '' \ No newline at end of file diff --git a/tests/test_parser_rapidblockcsv.py b/tests/test_parser_rapidblockcsv.py index edb8d1e..65d579d 100644 --- a/tests/test_parser_rapidblockcsv.py +++ b/tests/test_parser_rapidblockcsv.py @@ -1,7 +1,7 @@ """Tests of the Rapidblock CSV parsing """ -from fediblockhole.blocklist_parser import RapidBlockParserCSV, parse_blocklist +from fediblockhole.blocklists import RapidBlockParserCSV, parse_blocklist from fediblockhole.const import DomainBlock, BlockSeverity, SeverityLevel csvdata = """example.org\r\nsubdomain.example.org\r\nanotherdomain.org\r\ndomain4.org\r\n""" @@ -11,13 +11,13 @@ def test_basic_rapidblock(): bl = parser.parse_blocklist(csvdata) assert len(bl) == 4 - assert bl[0].domain == 'example.org' - assert bl[1].domain == 'subdomain.example.org' - assert bl[2].domain == 'anotherdomain.org' - assert bl[3].domain == 'domain4.org' + assert 'example.org' in bl + assert 'subdomain.example.org' in bl + assert 'anotherdomain.org' in bl + assert 'domain4.org' in bl def test_severity_is_suspend(): bl = parser.parse_blocklist(csvdata) - for block in bl: + for block in bl.values(): assert block.severity.level == SeverityLevel.SUSPEND \ No newline at end of file diff --git a/tests/test_parser_rapidblockjson.py b/tests/test_parser_rapidblockjson.py index 8ccca0f..ad13811 100644 --- a/tests/test_parser_rapidblockjson.py +++ b/tests/test_parser_rapidblockjson.py @@ -1,6 +1,6 @@ """Test parsing the RapidBlock JSON format """ -from fediblockhole.blocklist_parser import parse_blocklist +from fediblockhole.blocklists import parse_blocklist from 
fediblockhole.const import SeverityLevel @@ -9,26 +9,26 @@ rapidblockjson = "data-rapidblock.json" def test_parse_rapidblock_json(): with open(rapidblockjson) as fp: data = fp.read() - bl = parse_blocklist(data, 'rapidblock.json') + bl = parse_blocklist(data, 'pytest', 'rapidblock.json') - assert bl[0].domain == '101010.pl' - assert bl[0].severity.level == SeverityLevel.SUSPEND - assert bl[0].public_comment == '' + assert '101010.pl' in bl + assert bl['101010.pl'].severity.level == SeverityLevel.SUSPEND + assert bl['101010.pl'].public_comment == '' - assert bl[10].domain == 'berserker.town' - assert bl[10].severity.level == SeverityLevel.SUSPEND - assert bl[10].public_comment == '' - assert bl[10].private_comment == '' + assert 'berserker.town' in bl + assert bl['berserker.town'].severity.level == SeverityLevel.SUSPEND + assert bl['berserker.town'].public_comment == '' + assert bl['berserker.town'].private_comment == '' def test_parse_with_comments(): with open(rapidblockjson) as fp: data = fp.read() - bl = parse_blocklist(data, 'rapidblock.json', ['domain', 'severity', 'public_comment', 'private_comment']) + bl = parse_blocklist(data, 'pytest', 'rapidblock.json', ['domain', 'severity', 'public_comment', 'private_comment']) - assert bl[0].domain == '101010.pl' - assert bl[0].severity.level == SeverityLevel.SUSPEND - assert bl[0].public_comment == 'cryptomining javascript, white supremacy' + assert '101010.pl' in bl + assert bl['101010.pl'].severity.level == SeverityLevel.SUSPEND + assert bl['101010.pl'].public_comment == 'cryptomining javascript, white supremacy' - assert bl[10].domain == 'berserker.town' - assert bl[10].severity.level == SeverityLevel.SUSPEND - assert bl[10].public_comment == 'freeze peach' \ No newline at end of file + assert 'berserker.town' in bl + assert bl['berserker.town'].severity.level == SeverityLevel.SUSPEND + assert bl['berserker.town'].public_comment == 'freeze peach' \ No newline at end of file From 
e0207c437116883aa4b022fbb447cea5b3d0fec7 Mon Sep 17 00:00:00 2001 From: Justin Warren Date: Sun, 22 Jan 2023 09:01:30 +1100 Subject: [PATCH 2/5] Remove unnecessary `count` from DomainBlock object. --- src/fediblockhole/const.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/fediblockhole/const.py b/src/fediblockhole/const.py index 7ed9f48..93cf2ef 100644 --- a/src/fediblockhole/const.py +++ b/src/fediblockhole/const.py @@ -123,8 +123,7 @@ class DomainBlock(object): reject_media: bool=False, reject_reports: bool=False, obfuscate: bool=False, - id: int=None, - count: int=0): + id: int=None): """Initialize the DomainBlock """ self.domain = domain @@ -135,7 +134,6 @@ class DomainBlock(object): self.reject_reports = reject_reports self.obfuscate = obfuscate self.id = id - self.count = 0 @property def severity(self): From c018ebdf35be73eacf71887206b393612b64fb42 Mon Sep 17 00:00:00 2001 From: Justin Warren Date: Sun, 22 Jan 2023 09:02:03 +1100 Subject: [PATCH 3/5] Add merge thresholds to merge_blocklists() --- src/fediblockhole/__init__.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/fediblockhole/__init__.py b/src/fediblockhole/__init__.py index e95aa97..9be1b78 100755 --- a/src/fediblockhole/__init__.py +++ b/src/fediblockhole/__init__.py @@ -180,16 +180,23 @@ def fetch_from_instances(blocklists: dict, sources: dict, save_intermediate_blocklist(blocklists[itemsrc], domain, savedir, export_fields) return blocklists -def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', threshold: int=0) -> dict: +def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', + threshold: int=0, + threshold_type: str='count') -> dict: """Merge fetched remote blocklists into a bulk update @param blocklists: A dict of lists of DomainBlocks, keyed by source. 
Each value is a list of DomainBlocks @param mergeplan: An optional method of merging overlapping block definitions 'max' (the default) uses the highest severity block found 'min' uses the lowest severity block found - @param threshold: An integer percentage [0-100]. - If a domain is not present in this pct or more of the blocklists, + @param threshold: An integer used in the threshold mechanism. + If a domain is not present in this number/pct or more of the blocklists, it will not get merged into the final list. + @param threshold_type: choice of ['count', 'pct'] + If `count`, threshold is met if block is present in `threshold` + or more blocklists. + If `pct`, threshold is met if block is present in + count_of_mentions / number_of_blocklists. @param returns: A dict of DomainBlocks keyed by domain """ merged = {} @@ -209,10 +216,16 @@ def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', threshol else: domain_blocks[block.domain] = [block,] - # Only merge items if there are more than `threshold` pct of them + # Only merge items if `threshold` is met or exceeded for domain in domain_blocks: - pct = len(domain_blocks[domain]) / num_blocklists - if pct >= threshold: + if threshold_type == 'count': + domain_threshold_level = len(domain_blocks[domain]) + elif threshold_type == 'pct': + domain_threshold_level = len(domain_blocks[domain]) / num_blocklists + else: + raise ValueError(f"Unsupported threshold type '{threshold_type}'. Supported values are: 'count', 'pct'") + + if domain_threshold_level >= threshold: # Add first block in the list to merged merged[domain] = domain_blocks[domain][0] # Merge the others with this record From abc53c9148fd0d5f5b293e56eec301121da768bb Mon Sep 17 00:00:00 2001 From: Justin Warren Date: Sun, 22 Jan 2023 13:03:32 +1100 Subject: [PATCH 4/5] Updated documentation and examples for allowlists. 
--- README.md | 2 ++ samples/demo-allowlist-01.csv | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 44a9864..882fca8 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,8 @@ appropriate. - Provides (hopefully) sensible defaults to minimise first-time setup. - Global and fine-grained configuration options available for those complex situations that crop up sometimes. + - Allowlists to override blocks in blocklists to ensure you never block instances you want to keep. + - Blocklist thresholds if you want to only block when an instance shows up in multiple blocklists. ## Installing diff --git a/samples/demo-allowlist-01.csv b/samples/demo-allowlist-01.csv index 6ee7744..665ff6a 100644 --- a/samples/demo-allowlist-01.csv +++ b/samples/demo-allowlist-01.csv @@ -1,3 +1,4 @@ "domain","severity","private_comment","public_comment","reject_media","reject_reports","obfuscate" -"eigenmagic.net","noop","Never block me","Only the domain field matters",False,False,False -"example.org","noop","Never block me either","The severity is ignored as are all other fields",False,False,False +"eigenmagic.net","noop","Never block me","Only the domain field matters for allowlists",False,False,False +"example.org","noop","Never block me either","The severity is ignored in allowlists as are all other fields",False,False,False +"demo01.example.org","noop","Never block me either","But you can use them to leave yourself or others notes on why the item is here",False,False,False From 3b4cdcbcdf964a71bfe15e2299c84b08b6e77bb5 Mon Sep 17 00:00:00 2001 From: Justin Warren Date: Sun, 22 Jan 2023 13:05:44 +1100 Subject: [PATCH 5/5] Refactored the code to use Blocklist data structure for allowlists. Fixed bug in parse_blocklist() for preparse selection. Updated test cases for allowlists. Removed some code that is no longer needed. 
--- src/fediblockhole/__init__.py | 163 ++++++++++++++------------------ src/fediblockhole/blocklists.py | 18 ++-- tests/test_allowlist.py | 37 ++++---- 3 files changed, 100 insertions(+), 118 deletions(-) diff --git a/src/fediblockhole/__init__.py b/src/fediblockhole/__init__.py index 9be1b78..67b1f06 100755 --- a/src/fediblockhole/__init__.py +++ b/src/fediblockhole/__init__.py @@ -59,16 +59,16 @@ def sync_blocklists(conf: argparse.Namespace): # Add extra export fields if defined in config export_fields.extend(conf.export_fields) - blocklists = {} + blocklists = [] # Fetch blocklists from URLs if not conf.no_fetch_url: - blocklists = fetch_from_urls(blocklists, conf.blocklist_url_sources, - import_fields, conf.save_intermediate, conf.savedir, export_fields) + blocklists.extend(fetch_from_urls(conf.blocklist_url_sources, + import_fields, conf.save_intermediate, conf.savedir, export_fields)) # Fetch blocklists from remote instances if not conf.no_fetch_instance: - blocklists = fetch_from_instances(blocklists, conf.blocklist_instance_sources, - import_fields, conf.save_intermediate, conf.savedir, export_fields) + blocklists.extend(fetch_from_instances(conf.blocklist_instance_sources, + import_fields, conf.save_intermediate, conf.savedir, export_fields)) # Merge blocklists into an update dict merged = merge_blocklists(blocklists, conf.mergeplan) @@ -80,48 +80,48 @@ def sync_blocklists(conf: argparse.Namespace): # Save the final mergelist, if requested if conf.blocklist_savefile: log.info(f"Saving merged blocklist to {conf.blocklist_savefile}") - save_blocklist_to_file(merged.values(), conf.blocklist_savefile, export_fields) + save_blocklist_to_file(merged, conf.blocklist_savefile, export_fields) # Push the blocklist to destination instances if not conf.no_push_instance: log.info("Pushing domain blocks to instances...") for dest in conf.blocklist_instance_destinations: - domain = dest['domain'] + target = dest['domain'] token = dest['token'] scheme = 
dest.get('scheme', 'https') max_followed_severity = BlockSeverity(dest.get('max_followed_severity', 'silence')) - push_blocklist(token, domain, merged.values(), conf.dryrun, import_fields, max_followed_severity, scheme) + push_blocklist(token, target, merged, conf.dryrun, import_fields, max_followed_severity, scheme) -def apply_allowlists(merged: dict, conf: argparse.Namespace, allowlists: dict): +def apply_allowlists(merged: Blocklist, conf: argparse.Namespace, allowlists: dict): """Apply allowlists """ # Apply allows specified on the commandline for domain in conf.allow_domains: log.info(f"'{domain}' allowed by commandline, removing any blocks...") - if domain in merged: - del merged[domain] + if domain in merged.blocks: + del merged.blocks[domain] # Apply allows from URLs lists log.info("Removing domains from URL allowlists...") - for key, alist in allowlists.items(): - log.debug(f"Processing allows from '{key}'...") - for allowed in alist: + for alist in allowlists: + log.debug(f"Processing allows from '{alist.origin}'...") + for allowed in alist.blocks.values(): domain = allowed.domain log.debug(f"Removing allowlisted domain '{domain}' from merged list.") - if domain in merged: - del merged[domain] + if domain in merged.blocks: + del merged.blocks[domain] return merged -def fetch_allowlists(conf: argparse.Namespace) -> dict: +def fetch_allowlists(conf: argparse.Namespace) -> Blocklist: """ """ if conf.allowlist_url_sources: - allowlists = fetch_from_urls({}, conf.allowlist_url_sources, ALLOWLIST_IMPORT_FIELDS) + allowlists = fetch_from_urls(conf.allowlist_url_sources, ALLOWLIST_IMPORT_FIELDS, conf.save_intermediate, conf.savedir) return allowlists - return {} + return Blocklist() -def fetch_from_urls(blocklists: dict, url_sources: dict, +def fetch_from_urls(url_sources: dict, import_fields: list=IMPORT_FIELDS, save_intermediate: bool=False, savedir: str=None, export_fields: list=EXPORT_FIELDS) -> dict: @@ -131,7 +131,7 @@ def fetch_from_urls(blocklists: dict, 
url_sources: dict, @returns: A dict of blocklists, same as input, but (possibly) modified """ log.info("Fetching domain blocks from URLs...") - + blocklists = [] for item in url_sources: url = item['url'] # If import fields are provided, they override the global ones passed in @@ -144,14 +144,14 @@ def fetch_from_urls(blocklists: dict, url_sources: dict, listformat = item.get('format', 'csv') with urlr.urlopen(url) as fp: rawdata = fp.read(URL_BLOCKLIST_MAXSIZE).decode('utf-8') - blocklists[url] = parse_blocklist(rawdata, listformat, import_fields, max_severity) - - if save_intermediate: - save_intermediate_blocklist(blocklists[url], url, savedir, export_fields) + bl = parse_blocklist(rawdata, url, listformat, import_fields, max_severity) + blocklists.append(bl) + if save_intermediate: + save_intermediate_blocklist(bl, savedir, export_fields) return blocklists -def fetch_from_instances(blocklists: dict, sources: dict, +def fetch_from_instances(sources: dict, import_fields: list=IMPORT_FIELDS, save_intermediate: bool=False, savedir: str=None, export_fields: list=EXPORT_FIELDS) -> dict: @@ -161,12 +161,13 @@ def fetch_from_instances(blocklists: dict, sources: dict, @returns: A dict of blocklists, same as input, but (possibly) modified """ log.info("Fetching domain blocks from instances...") + blocklists = [] for item in sources: domain = item['domain'] admin = item.get('admin', False) token = item.get('token', None) scheme = item.get('scheme', 'https') - itemsrc = f"{scheme}://{domain}/api" + # itemsrc = f"{scheme}://{domain}/api" # If import fields are provided, they override the global ones passed in source_import_fields = item.get('import_fields', None) @@ -174,15 +175,15 @@ def fetch_from_instances(blocklists: dict, sources: dict, # Ensure we always use the default fields import_fields = IMPORT_FIELDS.extend(source_import_fields) - # Add the blocklist with the domain as the source key - blocklists[itemsrc] = fetch_instance_blocklist(domain, token, admin, 
import_fields, scheme) + bl = fetch_instance_blocklist(domain, token, admin, import_fields, scheme) + blocklists.append(bl) if save_intermediate: - save_intermediate_blocklist(blocklists[itemsrc], domain, savedir, export_fields) + save_intermediate_blocklist(bl, savedir, export_fields) return blocklists def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', threshold: int=0, - threshold_type: str='count') -> dict: + threshold_type: str='count') -> Blocklist: """Merge fetched remote blocklists into a bulk update @param blocklists: A dict of lists of DomainBlocks, keyed by source. Each value is a list of DomainBlocks @@ -199,7 +200,7 @@ def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', count_of_mentions / number_of_blocklists. @param returns: A dict of DomainBlocks keyed by domain """ - merged = {} + merged = Blocklist('fediblockhole.merge_blocklists') num_blocklists = len(blocklists) @@ -209,7 +210,7 @@ def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', for bl in blocklists: for block in bl.values(): if '*' in block.domain: - log.debug(f"Domain '{domain}' is obfuscated. Skipping it.") + log.debug(f"Domain '{block.domain}' is obfuscated. Skipping it.") continue elif block.domain in domain_blocks: domain_blocks[block.domain].append(block) @@ -224,40 +225,17 @@ def merge_blocklists(blocklists: list[Blocklist], mergeplan: str='max', domain_threshold_level = len(domain_blocks[domain]) / num_blocklists else: raise ValueError(f"Unsupported threshold type '{threshold_type}'. 
Supported values are: 'count', 'pct'") - + if domain_threshold_level >= threshold: # Add first block in the list to merged - merged[domain] = domain_blocks[domain][0] + block = domain_blocks[domain][0] # Merge the others with this record - for block in domain_blocks[domain][1:]: - merged[domain] = apply_mergeplan(merged[domain], block, mergeplan) - + for newblock in domain_blocks[domain][1:]: + block = apply_mergeplan(block, newblock, mergeplan) + merged.blocks[block.domain] = block + return merged - # for key, blist in blocklists.items(): - # log.debug(f"processing blocklist from: {key} ...") - # for newblock in blist: - # domain = newblock.domain - # # If the domain has two asterisks in it, it's obfuscated - # # and we can't really use it, so skip it and do the next one - # if '*' in domain: - # log.debug(f"Domain '{domain}' is obfuscated. Skipping it.") - # continue - - # elif domain in merged: - # log.debug(f"Overlapping block for domain {domain}. Merging...") - # blockdata = apply_mergeplan(merged[domain], newblock, mergeplan) - - # else: - # # New block - # blockdata = newblock - - # # end if - # log.debug(f"blockdata is: {blockdata}") - # merged[domain] = blockdata - # # end for - # return merged - def apply_mergeplan(oldblock: DomainBlock, newblock: DomainBlock, mergeplan: str='max') -> dict: """Use a mergeplan to decide how to merge two overlapping block definitions @@ -282,10 +260,10 @@ def apply_mergeplan(oldblock: DomainBlock, newblock: DomainBlock, mergeplan: str # How do we override an earlier block definition? if mergeplan in ['max', None]: # Use the highest block level found (the default) - log.debug(f"Using 'max' mergeplan.") + # log.debug(f"Using 'max' mergeplan.") if newblock.severity > oldblock.severity: - log.debug(f"New block severity is higher. Using that.") + # log.debug(f"New block severity is higher. 
Using that.") blockdata['severity'] = newblock.severity # For 'reject_media', 'reject_reports', and 'obfuscate' if @@ -314,7 +292,7 @@ def apply_mergeplan(oldblock: DomainBlock, newblock: DomainBlock, mergeplan: str else: raise NotImplementedError(f"Mergeplan '{mergeplan}' not implemented.") - log.debug(f"Block severity set to {blockdata['severity']}") + # log.debug(f"Block severity set to {blockdata['severity']}") return DomainBlock(**blockdata) @@ -396,17 +374,19 @@ def fetch_instance_blocklist(host: str, token: str=None, admin: bool=False, url = f"{scheme}://{host}{api_path}" - blocklist = [] + blockdata = [] link = True - while link: response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT) if response.status_code != 200: log.error(f"Cannot fetch remote blocklist: {response.content}") raise ValueError("Unable to fetch domain block list: %s", response) - blocklist.extend( parse_blocklist(response.content, parse_format, import_fields) ) - + # Each block of returned data is a JSON list of dicts + # so we parse them and append them to the fetched list + # of JSON data we need to parse. 
+ + blockdata.extend(json.loads(response.content.decode('utf-8'))) # Parse the link header to find the next url to fetch # This is a weird and janky way of doing pagination but # hey nothing we can do about it we just have to deal @@ -424,6 +404,8 @@ def fetch_instance_blocklist(host: str, token: str=None, admin: bool=False, urlstring, rel = next.split('; ') url = urlstring.strip('<').rstrip('>') + blocklist = parse_blocklist(blockdata, url, parse_format, import_fields) + return blocklist def delete_block(token: str, host: str, id: int, scheme: str='https'): @@ -513,13 +495,9 @@ def update_known_block(token: str, host: str, block: DomainBlock, scheme: str='h """Update an existing domain block with information in blockdict""" api_path = "/api/v1/admin/domain_blocks/" - try: - id = block.id - blockdata = block._asdict() - del blockdata['id'] - except KeyError: - import pdb - pdb.set_trace() + id = block.id + blockdata = block._asdict() + del blockdata['id'] url = f"{scheme}://{host}{api_path}{id}" @@ -553,7 +531,7 @@ def add_block(token: str, host: str, blockdata: DomainBlock, scheme: str='https' raise ValueError(f"Something went wrong: {response.status_code}: {response.content}") -def push_blocklist(token: str, host: str, blocklist: list[dict], +def push_blocklist(token: str, host: str, blocklist: list[DomainBlock], dryrun: bool=False, import_fields: list=['domain', 'severity'], max_followed_severity:BlockSeverity=BlockSeverity('silence'), @@ -561,8 +539,7 @@ def push_blocklist(token: str, host: str, blocklist: list[dict], ): """Push a blocklist to a remote instance. - Merging the blocklist with the existing list the instance has, - updating existing entries if they exist. + Updates existing entries if they exist, creates new blocks if they don't. 
@param token: The Bearer token for OAUTH API authentication @param host: The instance host, FQDN or IP @@ -577,15 +554,16 @@ def push_blocklist(token: str, host: str, blocklist: list[dict], serverblocks = fetch_instance_blocklist(host, token, True, import_fields, scheme) # # Convert serverblocks to a dictionary keyed by domain name - knownblocks = {row.domain: row for row in serverblocks} + # knownblocks = {row.domain: row for row in serverblocks} - for newblock in blocklist: + for newblock in blocklist.values(): log.debug(f"Processing block: {newblock}") - oldblock = knownblocks.get(newblock.domain, None) - if oldblock: + if newblock.domain in serverblocks: log.debug(f"Block already exists for {newblock.domain}, checking for differences...") + oldblock = serverblocks[newblock.domain] + change_needed = is_change_needed(oldblock, newblock, import_fields) # Is the severity changing? @@ -644,15 +622,14 @@ def load_config(configfile: str): conf = toml.load(configfile) return conf -def save_intermediate_blocklist( - blocklist: list[dict], source: str, - filedir: str, +def save_intermediate_blocklist(blocklist: Blocklist, filedir: str, export_fields: list=['domain','severity']): """Save a local copy of a blocklist we've downloaded """ # Invent a filename based on the remote source # If the source was a URL, convert it to something less messy # If the source was a remote domain, just use the name of the domain + source = blocklist.origin log.debug(f"Saving intermediate blocklist from {source}") source = source.replace('/','-') filename = f"{source}.csv" @@ -660,7 +637,7 @@ def save_intermediate_blocklist( save_blocklist_to_file(blocklist, filepath, export_fields) def save_blocklist_to_file( - blocklist: list[DomainBlock], + blocklist: Blocklist, filepath: str, export_fields: list=['domain','severity']): """Save a blocklist we've downloaded from a remote source @@ -670,18 +647,22 @@ def save_blocklist_to_file( @param export_fields: Which fields to include in the export. 
""" try: - blocklist = sorted(blocklist, key=lambda x: x.domain) + sorted_list = sorted(blocklist.blocks.items()) except KeyError: log.error("Field 'domain' not found in blocklist.") - log.debug(f"blocklist is: {blocklist}") + log.debug(f"blocklist is: {sorted_list}") + except AttributeError: + log.error("Attribute error!") + import pdb + pdb.set_trace() log.debug(f"export fields: {export_fields}") with open(filepath, "w") as fp: writer = csv.DictWriter(fp, export_fields, extrasaction='ignore') writer.writeheader() - for item in blocklist: - writer.writerow(item._asdict()) + for key, value in sorted_list: + writer.writerow(value) def augment_args(args, tomldata: str=None): """Augment commandline arguments with config file parameters diff --git a/src/fediblockhole/blocklists.py b/src/fediblockhole/blocklists.py index f79f3d2..7a9e44f 100644 --- a/src/fediblockhole/blocklists.py +++ b/src/fediblockhole/blocklists.py @@ -41,7 +41,7 @@ class BlocklistParser(object): """ Base class for parsing blocklists """ - preparse = False + do_preparse = False def __init__(self, import_fields: list=['domain', 'severity'], max_severity: str='suspend'): @@ -63,7 +63,7 @@ class BlocklistParser(object): @param blocklist: An Iterable of blocklist items @returns: A dict of DomainBlocks, keyed by domain """ - if self.preparse: + if self.do_preparse: blockdata = self.preparse(blockdata) parsed_list = Blocklist(origin) @@ -82,12 +82,13 @@ class BlocklistParser(object): class BlocklistParserJSON(BlocklistParser): """Parse a JSON formatted blocklist""" - preparse = True + do_preparse = True def preparse(self, blockdata) -> Iterable: - """Parse the blockdata as JSON - """ - return json.loads(blockdata) + """Parse the blockdata as JSON if needed""" + if type(blockdata) == type(''): + return json.loads(blockdata) + return blockdata def parse_item(self, blockitem: dict) -> DomainBlock: # Remove fields we don't want to import @@ -131,7 +132,7 @@ class BlocklistParserCSV(BlocklistParser): The 
parser expects the CSV data to include a header with the field names. """ - preparse = True + do_preparse = True def preparse(self, blockdata) -> Iterable: """Use a csv.DictReader to create an iterable from the blockdata @@ -237,6 +238,7 @@ def parse_blocklist( max_severity: str='suspend'): """Parse a blocklist in the given format """ - parser = FORMAT_PARSERS[format](import_fields, max_severity) log.debug(f"parsing {format} blocklist with import_fields: {import_fields}...") + + parser = FORMAT_PARSERS[format](import_fields, max_severity) return parser.parse_blocklist(blockdata, origin) \ No newline at end of file diff --git a/tests/test_allowlist.py b/tests/test_allowlist.py index 902b301..ddd53b9 100644 --- a/tests/test_allowlist.py +++ b/tests/test_allowlist.py @@ -4,6 +4,7 @@ import pytest from util import shim_argparse from fediblockhole.const import DomainBlock +from fediblockhole.blocklists import Blocklist from fediblockhole import fetch_allowlists, apply_allowlists def test_cmdline_allow_removes_domain(): @@ -11,17 +12,13 @@ def test_cmdline_allow_removes_domain(): """ conf = shim_argparse(['-A', 'removeme.org']) - merged = { + merged = Blocklist('test_allowlist.merged', { 'example.org': DomainBlock('example.org'), 'example2.org': DomainBlock('example2.org'), 'removeme.org': DomainBlock('removeme.org'), 'keepblockingme.org': DomainBlock('keepblockingme.org'), - } + }) - # allowlists = { - # 'testlist': [ DomainBlock('removeme.org', 'noop'), ] - # } - merged = apply_allowlists(merged, conf, {}) with pytest.raises(KeyError): @@ -32,16 +29,18 @@ def test_allowlist_removes_domain(): """ conf = shim_argparse() - merged = { + merged = Blocklist('test_allowlist.merged', { 'example.org': DomainBlock('example.org'), 'example2.org': DomainBlock('example2.org'), 'removeme.org': DomainBlock('removeme.org'), 'keepblockingme.org': DomainBlock('keepblockingme.org'), - } + }) - allowlists = { - 'testlist': [ DomainBlock('removeme.org', 'noop'), ] - } + allowlists = [ + 
Blocklist('test_allowlist', { + 'removeme.org': DomainBlock('removeme.org', 'noop'), + }) + ] merged = apply_allowlists(merged, conf, allowlists) @@ -53,19 +52,19 @@ def test_allowlist_removes_tld(): """ conf = shim_argparse() - merged = { + merged = Blocklist('test_allowlist.merged', { '.cf': DomainBlock('.cf'), 'example.org': DomainBlock('example.org'), '.tk': DomainBlock('.tk'), 'keepblockingme.org': DomainBlock('keepblockingme.org'), - } + }) - allowlists = { - 'list1': [ - DomainBlock('.cf', 'noop'), - DomainBlock('.tk', 'noop'), - ] - } + allowlists = [ + Blocklist('test_allowlist.list1', { + '.cf': DomainBlock('.cf', 'noop'), + '.tk': DomainBlock('.tk', 'noop'), + }) + ] merged = apply_allowlists(merged, conf, allowlists)