More advanced mergeplan handling.

Support for 'noop' severity for removing a block. Support for mergeplan as commandline parameter. Support for mergeplan as config file option. Added --dryrun commandline option.
2022-12-20 14:29:23 +11:00 · 2022-12-20 14:29:23 +11:00 · 8fc8160f82
parent ea58d34c14
commit 8fc8160f82
2 changed files with 161 additions and 77 deletions
--- a/bin/fediblock_sync.py
+++ b/bin/fediblock_sync.py
@ -21,6 +21,27 @@ log = logging.getLogger('fediblock_sync')

 CONFIGFILE = "/home/mastodon/etc/admin.conf"

+# The relative severity levels of blocks; a higher number is a harsher block
+SEVERITY = {
+    'noop': 0,
+    'silence': 1,
+    'suspend': 2,
+}
+
+# Default for 'reject_media' when a block definition does not set it, per severity level
+REJECT_MEDIA_DEFAULT = {
+    'noop': False,
+    'silence': True,
+    'suspend': True,
+}
+
+# Default for 'reject_reports' when a block definition does not set it, per severity level
+REJECT_REPORTS_DEFAULT = {
+    'noop': False,
+    'silence': True,
+    'suspend': True,
+}
+
 def sync_blocklists(conf: dict):
    """Sync instance blocklists from remote sources.

@ -39,6 +60,10 @@ def sync_blocklists(conf: dict):
                rawdata = fp.read(URL_BLOCKLIST_MAXSIZE).decode('utf-8')
                reader = csv.DictReader(rawdata.split('\n'))
                for row in reader:
+                    # Coerce booleans from string to Python bool
+                    for boolkey in ['reject_media', 'reject_reports', 'obfuscate']:
+                        if boolkey in row:
+                            row[boolkey] = str2bool(row[boolkey])
                    blocklists[listurl].append(row)
            if conf.save_intermediate:
                save_intermediate_blocklist(blocklists[listurl], listurl, conf.savedir)
@ -54,7 +79,7 @@ def sync_blocklists(conf: dict):
                save_intermediate_blocklist(blocklists[domain], domain, conf.savedir)

    # Merge blocklists into an update dict
-    merged = merge_blocklists(blocklists)
+    merged = merge_blocklists(blocklists, conf.mergeplan)
    if conf.blocklist_savefile:
        log.info(f"Saving merged blocklist to {conf.blocklist_savefile}")
        save_blocklist_to_file(merged.values(), conf.blocklist_savefile)
@ -65,7 +90,7 @@ def sync_blocklists(conf: dict):
        for dest in conf.blocklist_instance_destinations:
            domain = dest['domain']
            token = dest['token']
-            push_blocklist(token, domain, merged.values())
+            push_blocklist(token, domain, merged.values(), conf.dryrun)

 def merge_blocklists(blocklists: dict, mergeplan: str='max') -> dict:
    """Merge fetched remote blocklists into a bulk update
@ -77,63 +102,90 @@ def merge_blocklists(blocklists: dict, mergeplan: str='max') -> dict:
    merged = {}

    for key, blist in blocklists.items():
-        log.debug(f"Merging blocks from {key} ...")
-        for blockdef in blist:
-            # log.debug(f"Checking blockdef {blockdef} ...")
-            domain = blockdef['domain']
+        log.debug(f"processing key {key} blist...")
+        for newblock in blist:
+            domain = newblock['domain']
            if domain in merged:
-                blockdata = merged[domain]
-
-                # If the public or private comment is different,
-                # append it to the existing comment, joined with a newline
-                if blockdef['public_comment'] != blockdata['public_comment'] and blockdata['public_comment'] != '':
-                    blockdata['public_comment'] = '\n'.join([blockdef['public_comment'], blockdata['public_comment']])
-
-                if blockdef['private_comment'] != blockdata['private_comment'] and blockdata['private_comment'] != '':
-                    blockdata['private_comment'] = '\n'.join([blockdef['private_comment'], blockdata['private_comment']])
-
-                # How do we override an earlier block definition?
-                if mergeplan in ['max', None]:
-                    # Use the highest block level found (the default)
-                    if blockdef['severity'] == 'suspend':
-                        blockdata['severity'] = 'suspend'
-
-                    if blockdef['reject_media'] == True:
-                        blockdata['reject_media'] = True
-
-                    if blockdef['reject_reports'] == True:
-                        blockdata['reject_reports'] = True
-
-                elif mergeplan in ['min']:
-                    # Use the lowest block level found
-                    if blockdef['severity'] == 'silence':
-                        blockdata['severity'] = 'silence'
-
-                    if blockdef['reject_media'] == False:
-                        blockdata['reject_media'] = False
-
-                    if blockdef['reject_reports'] == False:
-                        blockdata['reject_reports'] = False
-
-                else:
-                    raise NotImplementedError(f"Mergeplan '{mergeplan}' not implemented.")
-
+                log.debug(f"Overlapping block for domain {domain}. Merging...")
+                blockdata = apply_mergeplan(merged[domain], newblock, mergeplan)
            else:
                # New block
                blockdata = {
-                    'domain': blockdef['domain'],
+                    'domain': newblock['domain'],
                    # Default to Silence if nothing is specified
-                    'severity': blockdef.get('severity', 'silence'),
-                    'public_comment': blockdef['public_comment'],
-                    'private_comment': blockdef['private_comment'],
-                    'reject_media': blockdef.get('reject_media', False),
-                    'reject_reports': blockdef.get('reject_reports', False),
-                    'obfuscate': blockdef.get('obfuscate', False),
+                    'severity': newblock.get('severity', 'silence'),
+                    'public_comment': newblock.get('public_comment', ''),
+                    'private_comment': newblock.get('private_comment', ''),
+                    'obfuscate': newblock.get('obfuscate', True), # default obfuscate to True
                }
+                sev = blockdata['severity'] # convenience variable
+                blockdata['reject_media'] = newblock.get('reject_media', REJECT_MEDIA_DEFAULT[sev])
+                blockdata['reject_reports'] = newblock.get('reject_reports', REJECT_REPORTS_DEFAULT[sev])
+            # end if
+            log.debug(f"blockdata is: {blockdata}")
            merged[domain] = blockdata
-
+        # end for
    return merged

+def apply_mergeplan(oldblock: dict, newblock: dict, mergeplan: str='max') -> dict:
+    """Use a mergeplan to decide how to merge two overlapping block definitions
+
+    @param oldblock: The existing block definition.
+    @param newblock: The new block definition we want to merge in.
+    @param mergeplan: How to merge. Choices are 'max', the default, and 'min'.
+    @returns: A new merged block definition; oldblock itself is not modified.
+    @raises NotImplementedError: If mergeplan is not 'max', 'min' or None.
+    """
+    # Default to the existing block definition
+    blockdata = oldblock.copy()
+
+    # If the new public or private comment is different, append it to the
+    # existing comment, joined with a newline. Empty or missing new comments
+    # are ignored, and an empty existing comment is simply replaced.
+    for key in ['public_comment', 'private_comment']:
+        oldval, newval = oldblock.get(key), newblock.get(key)
+        if newval not in ['', None] and newval != oldval:
+            blockdata[key] = '\n'.join([oldval, newval]) if oldval else newval
+
+    # How do we override an earlier block definition?
+    if mergeplan in ['max', None]:
+        # Use the highest block level found (the default)
+        log.debug("Using 'max' mergeplan.")
+
+        if SEVERITY[newblock['severity']] > SEVERITY[oldblock['severity']]:
+            log.debug("New block severity is higher. Using that.")
+            blockdata['severity'] = newblock['severity']
+
+        # If obfuscate is set and is True for the domain in
+        # any blocklist then obfuscate is set to True.
+        if newblock.get('obfuscate', False):
+            blockdata['obfuscate'] = True
+
+    elif mergeplan in ['min']:
+        # Use the lowest block level found
+        log.debug("Using 'min' mergeplan.")
+
+        if SEVERITY[newblock['severity']] < SEVERITY[oldblock['severity']]:
+            blockdata['severity'] = newblock['severity']
+
+        # If obfuscate is set and is False for the domain in
+        # any blocklist then obfuscate is set to False.
+        if not newblock.get('obfuscate', True):
+            blockdata['obfuscate'] = False
+
+    else:
+        raise NotImplementedError(f"Mergeplan '{mergeplan}' not implemented.")
+
+    log.debug(f"Block severity set to {blockdata['severity']}")
+    # Use the severity level to set rejections, if not defined in newblock.
+    # For 'suspend' the 'reject_media'/'reject_reports' settings don't matter.
+    blockdata['reject_media'] = newblock.get('reject_media', REJECT_MEDIA_DEFAULT[blockdata['severity']])
+    blockdata['reject_reports'] = newblock.get('reject_reports', REJECT_REPORTS_DEFAULT[blockdata['severity']])
+
+    log.debug(f"set reject_media to: {blockdata['reject_media']}")
+    log.debug(f"set reject_reports to: {blockdata['reject_reports']}")
+    return blockdata
+
 def fetch_instance_blocklist(token: str, host: str) -> list:
    """Fetch existing block list from server

@ -223,7 +275,7 @@ def add_block(token: str, host: str, blockdata: dict):
    if response.status_code != 200:
        raise ValueError(f"Something went wrong: {response.status_code}: {response.content}")

-def push_blocklist(token: str, host: str, blocklist: list[dict]):
+def push_blocklist(token: str, host: str, blocklist: list[dict], dryrun: bool=False):
    """Push a blocklist to a remote instance.
    
    Merging the blocklist with the existing list the instance has,
@ -240,14 +292,12 @@ def push_blocklist(token: str, host: str, blocklist: list[dict]):
    # Convert serverblocks to a dictionary keyed by domain name
    knownblocks = {row['domain']: row for row in serverblocks}

-    for row in blocklist:
-        # log.debug(f"Importing definition: {row}")
-
-        if 'id' in row: del row['id']
+    for newblock in blocklist:

+        log.debug(f"applying newblock: {newblock}")
        try:
-            blockdict = knownblocks[row['domain']]
-            log.debug(f"Block already exists for {row['domain']}, merging data...")
+            oldblock = knownblocks[newblock['domain']]
+            log.debug(f"Block already exists for {newblock['domain']}, merging data...")

            # Check if anything is actually different and needs updating
            change_needed = False
@ -260,38 +310,53 @@ def push_blocklist(token: str, host: str, blocklist: list[dict]):
                'obfuscate',
                ]:
                try:
-                    if blockdict[key] != knownblocks[key]:
+                    log.debug(f"Compare {key} '{oldblock[key]}' <> '{newblock[key]}'")
+                    oldval = oldblock[key]
+                    newval = newblock[key]
+                    if oldval != newval:
+                        log.debug("Difference detected. Change needed.")
                        change_needed = True
                        break
+
                except KeyError:
-                    break
+                    log.debug(f"KeyError comparing {key}")
+                    continue
            
            if change_needed:
-                log.debug(f"Change detected. Updating domain block for {row['domain']}")
-                blockdict.update(row)
-                update_known_block(token, host, blockdict)
-                # add a pause here so we don't melt the instance
-                time.sleep(1)
+                log.info(f"Change detected. Updating domain block for {oldblock['domain']}")
+                blockdata = oldblock.copy()
+                blockdata.update(newblock)
+                if not dryrun:
+                    update_known_block(token, host, blockdata)
+                    # add a pause here so we don't melt the instance
+                    time.sleep(1)
+                else:
+                    log.info("Dry run selected. Not applying changes.")

            else:
                log.debug("No differences detected. Not updating.")
+                pass

        except KeyError:
-            # domain doesn't have an entry, so we need to create one
+            # This is a new block for the target instance, so we
+            # need to add a block rather than update an existing one
            blockdata = {
-                'domain': row['domain'],
+                'domain': newblock['domain'],
                # Default to Silence if nothing is specified
-                'severity': row.get('severity', 'silence'),
-                'public_comment': row['public_comment'],
-                'private_comment': row['private_comment'],
-                'reject_media': row.get('reject_media', False),
-                'reject_reports': row.get('reject_reports', False),
-                'obfuscate': row.get('obfuscate', False),
+                'severity': newblock.get('severity', 'silence'),
+                'public_comment': newblock.get('public_comment', ''),
+                'private_comment': newblock.get('private_comment', ''),
+                'reject_media': newblock.get('reject_media', False),
+                'reject_reports': newblock.get('reject_reports', False),
+                'obfuscate': newblock.get('obfuscate', False),
            }
            log.info(f"Adding new block for {blockdata['domain']}...")
-            add_block(token, host, blockdata)
-            # add a pause here so we don't melt the instance
-            time.sleep(1)
+            if not dryrun:
+                add_block(token, host, blockdata)
+                # add a pause here so we don't melt the instance
+                time.sleep(1)
+            else:
+                log.info("Dry run selected. Not adding block.")

 def load_config(configfile: str):
    """Augment commandline arguments with config file parameters
@ -359,6 +424,17 @@ def augment_args(args):

    return args

+def str2bool(boolstring: str) -> bool:
+    """Convert a boolean string to a Python bool, ignoring case and whitespace.
+    Raises ValueError if the value is not a recognised boolean string.
+    """
+    # Strip first: CSV cells can carry stray spaces or a trailing '\r'
+    boolstring = boolstring.strip().lower()
+    if boolstring in ['true', 't', '1', 'y', 'yes']:
+        return True
+    if boolstring in ['false', 'f', '0', 'n', 'no']:
+        return False
+    raise ValueError(f"Cannot parse value '{boolstring}' as boolean")
+
 if __name__ == '__main__':

    ap = argparse.ArgumentParser(description="Bulk blocklist tool",
@ -368,12 +444,14 @@ if __name__ == '__main__':
    ap.add_argument('-o', '--outfile', dest="blocklist_savefile", help="Save merged blocklist to a local file.")
    ap.add_argument('-S', '--save-intermediate', dest="save_intermediate", action='store_true', help="Save intermediate blocklists we fetch to local files.")
    ap.add_argument('-D', '--savedir', dest="savedir", help="Directory path to save intermediate lists.")
-
+    ap.add_argument('-m', '--mergeplan', choices=['min', 'max'], default='max', help="Set mergeplan.")
+    
    ap.add_argument('--no-fetch-url', dest='no_fetch_url', action='store_true', help="Don't fetch from URLs, even if configured.")
    ap.add_argument('--no-fetch-instance', dest='no_fetch_instance', action='store_true', help="Don't fetch from instances, even if configured.")
    ap.add_argument('--no-push-instance', dest='no_push_instance', action='store_true', help="Don't push to instances, even if configured.")

    ap.add_argument('--loglevel', choices=['debug', 'info', 'warning', 'error', 'critical'], help="Set log output level.")
+    ap.add_argument('--dryrun', action='store_true', help="Don't actually push updates, just show what would happen.")

    args = ap.parse_args()
    if args.loglevel is not None:
--- a/etc/sample.fediblockhole.conf.toml
+++ b/etc/sample.fediblockhole.conf.toml
@ -32,4 +32,10 @@ blocklist_instance_destinations = [
 # no_fetch_url = false

 ## Don't fetch blocklists from instances, even if they're defined above
-# no_fetch_instance = false
+# no_fetch_instance = false
+
+## Set the mergeplan to use when dealing with overlaps between blocklists
+# The default 'max' mergeplan will use the harshest severity block found for a domain.
+# The 'min' mergeplan will use the lightest severity block found for a domain.
+# mergeplan = 'max'
+