fediblockhole-misskey/bin/fediblock_sync.py

#!/usr/bin/python3
# Export and import blocklists via API

import argparse
import toml
import csv
import requests
import json
import csv
import time

import logging
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s')
import pprint

log = logging.getLogger('fediblock_sync')

CONFIGFILE = "/home/mastodon/etc/admin.conf"

def sync_blocklists(conf: dict):
    """Sync instance blocklists from remote sources.
    """
    # Build a dict of blocklists we retrieve from remote sources.
    # We will merge these later using a merge algorithm we choose.

    blocklists = {}
    # Fetch blocklists from URLs
    # for listurl in conf.blocklist_csv_sources:
    #     blocklists[listurl] = {}
    #     response = requests.get(url)
    #     log.debug(f"Fetched blocklist CSV file: {response.content}")

    # Fetch blocklists from remote instances
    for blocklist_src in conf['blocklist_instance_sources']:
        domain = blocklist_src['domain']
        token = blocklist_src['token']
        blocklists[domain] = fetch_instance_blocklist(token, domain)

    # Merge blocklists into an update dict
    merged = merge_blocklists(blocklists)

    # log.debug(f"Merged blocklist ready:\n")
    # pprint.pp(merged)

    # Push the blocklist to destination instances
    for dest in conf['blocklist_instance_destinations']:
        domain = dest['domain']
        token = dest['token']
        push_blocklist(token, domain, merged.values())

def merge_blocklists(blocklists: dict, mergeplan: str='max') -> dict:
    """Merge fetched remote blocklists into a bulk update

    @param mergeplan: An optional method of merging overlapping block definitions
        'max' (the default) uses the highest severity block found
        'min' uses the lowest severity block found
    """
    # log.debug(f"Merging blocklists {blocklists} ...")
    # Remote blocklists may have conflicting overlaps. We need to
    # decide whether or not to override earlier entries with later
    # ones or not. How to choose which entry is 'correct'?
    merged = {}

    for key, blist in blocklists.items():
        log.debug(f"Adding blocks from {key} ...")
        for blockdef in blist:
            # log.debug(f"Checking blockdef {blockdef} ...")
            domain = blockdef['domain']
            if domain in merged:
                blockdata = merged[domain]

                # If the public or private comment is different,
                # append it to the existing comment, joined with a newline
                if blockdef['public_comment'] != blockdata['public_comment'] and blockdata['public_comment'] != '':
                    blockdata['public_comment'] = '\n'.join([blockdef['public_comment'], blockdata['public_comment']])

                if blockdef['private_comment'] != blockdata['private_comment'] and blockdata['private_comment'] != '':
                    blockdata['private_comment'] = '\n'.join([blockdef['private_comment'], blockdata['private_comment']])

                # How do we override an earlier block definition?
                if mergeplan in ['max', None]:
                    # Use the highest block level found (the default)
                    if blockdef['severity'] == 'suspend':
                        blockdata['severity'] = 'suspend'

                    if blockdef['reject_media'] == True:
                        blockdata['reject_media'] = True

                    if blockdef['reject_reports'] == True:
                        blockdata['reject_reports'] = True

                elif mergeplan in ['min']:
                    # Use the lowest block level found
                    if blockdef['severity'] == 'silence':
                        blockdata['severity'] = 'silence'

                    if blockdef['reject_media'] == False:
                        blockdata['reject_media'] = False

                    if blockdef['reject_reports'] == False:
                        blockdata['reject_reports'] = False

                else:
                    raise NotImplementedError(f"Mergeplan '{mergeplan}' not implemented.")

            else:
                # New block
                blockdata = {
                    'domain': blockdef['domain'],
                    # Default to Silence if nothing is specified
                    'severity': blockdef.get('severity', 'silence'),
                    'public_comment': blockdef['public_comment'],
                    'private_comment': blockdef['private_comment'],
                    'reject_media': blockdef.get('reject_media', False),
                    'reject_reports': blockdef.get('reject_reports', False),
                    'obfuscate': blockdef.get('obfuscate', False),
                }
            merged[domain] = blockdata

    return merged

def fetch_instance_blocklist(token: str, host: str) -> list:
    """Fetch existing block list from server
    """
    api_path = "/api/v1/admin/domain_blocks"

    url = f"https://{host}{api_path}"

    domain_blocks = []
    link = True

    while link:
        response = requests.get(url, headers={'Authorization': f"Bearer {token}"})
        if response.status_code != 200:
            log.error(f"Cannot fetch remote blocklist: {response.content}")
            raise ValueError("Unable to fetch domain block list: %s", response)
        domain_blocks.extend(json.loads(response.content))
        
        # Parse the link header to find the next url to fetch
        # This is a weird and janky way of doing pagination but
        # hey nothing we can do about it we just have to deal
        link = response.headers['Link']
        pagination = link.split(', ')
        if len(pagination) != 2:
            link = None
            break
        else:
            next = pagination[0]
            prev = pagination[1]
        
            urlstring, rel = next.split('; ')
            url = urlstring.strip('<').rstrip('>')

    log.debug(f"Found {len(domain_blocks)} existing domain blocks.")
    return domain_blocks

def export_blocklist(token: str, host: str, outfile: str):
    """Export current server blocklist to a csv file"""
    blocklist = fetch_instance_blocklist(token, host)
    fieldnames = ['id', 'domain', 'severity', 'reject_media', 'reject_reports', 'private_comment', 'public_comment', 'obfuscate']

    blocklist = sorted(blocklist, key=lambda x: int(x['id']))

    with open(outfile, "w") as fp:
        writer = csv.DictWriter(fp, fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(blocklist)

def delete_blocklist(token: str, host: str, blockfile: str):
    """Delete domain blocks listed in blockfile"""
    with open(blockfile) as fp:
        reader = csv.DictReader(fp)
        for row in reader:
            domain = row['domain']
            id = row['id']
            log.debug(f"Deleting {domain} (id: {id}) from blocklist...")
            delete_block(token, host, id)

def delete_block(token: str, host: str, id: int):
    """Remove a domain block"""
    log.debug(f"Removing domain block {id} at {host}...")
    api_path = "/api/v1/admin/domain_blocks/"

    url = f"https://{host}{api_path}{id}"

    response = requests.delete(url,
        headers={'Authorization': f"Bearer {token}"}
    )
    if response.status_code != 200:
        if response.status_code == 404:
            log.warn(f"No such domain block: {id}")
            return

        raise ValueError(f"Something went wrong: {response.status_code}: {response.content}")

def update_known_block(token: str, host: str, blockdict: dict):
    """Update an existing domain block with information in blockdict"""
    api_path = "/api/v1/admin/domain_blocks/"

    id = blockdict['id']
    blockdata = blockdict.copy()
    del blockdata['id']

    url = f"https://{host}{api_path}{id}"

    response = requests.put(url,
        headers={'Authorization': f"Bearer {token}"},
        data=blockdata
    )
    if response.status_code != 200:
        raise ValueError(f"Something went wrong: {response.status_code}: {response.content}")

def add_block(token: str, host: str, blockdata: dict):
    """Block a domain on Mastodon host
    """
    log.debug(f"Blocking domain {blockdata['domain']} at {host}...")
    api_path = "/api/v1/admin/domain_blocks"

    url = f"https://{host}{api_path}"

    response = requests.post(url,
        headers={'Authorization': f"Bearer {token}"},
        data=blockdata
    )
    if response.status_code != 200:
        raise ValueError(f"Something went wrong: {response.status_code}: {response.content}")

def push_blocklist(token: str, host: str, blocklist: list[dict]):
    """Push a blocklist to a remote instance.
    
    Merging the blocklist with the existing list the instance has,
    updating existing entries if they exist.

    @param token: The Bearer token for OAUTH API authentication
    @param host: The instance host, FQDN or IP
    @param blocklist: A list of block definitions. They must include the domain.
    """
    # Fetch the existing blocklist from the instance
    serverblocks = fetch_instance_blocklist(token, host)

    # Convert serverblocks to a dictionary keyed by domain name
    knownblocks = {row['domain']: row for row in serverblocks}

    for row in blocklist:
        # log.debug(f"Importing definition: {row}")

        if 'id' in row: del row['id']

        try:
            blockdict = knownblocks[row['domain']]
            log.info(f"Block already exists for {row['domain']}, merging data...")

            # Check if anything is actually different and needs updating
            change_needed = False
            for key in [
                'severity',
                'public_comment',
                'private_comment',
                'reject_media',
                'reject_reports',
                'obfuscate',
                ]:
                try:
                    if blockdict[key] != knownblocks[key]:
                        change_needed = True
                        break
                except KeyError:
                    break
            
            if change_needed:
                log.debug(f"Change detected. Updating domain block for {row['domain']}")
                blockdict.update(row)
                update_known_block(token, host, blockdict)
                # add a pause here so we don't melt the instance
                time.sleep(1)

            else:
                log.info("No differences detected. Not updating.")

        except KeyError:
            # domain doesn't have an entry, so we need to create one
            blockdata = {
                'domain': row['domain'],
                # Default to Silence if nothing is specified
                'severity': row.get('severity', 'silence'),
                'public_comment': row['public_comment'],
                'private_comment': row['private_comment'],
                'reject_media': row.get('reject_media', False),
                'reject_reports': row.get('reject_reports', False),
                'obfuscate': row.get('obfuscate', False),
            }
            log.info(f"Adding new block for {blockdata['domain']}...")
            add_block(token, host, blockdata)
            # add a pause here so we don't melt the instance
            time.sleep(1)

def load_config(configfile: str):
    """Augment commandline arguments with config file parameters
    
    Config file is expected to be in TOML format
    """
    conf = toml.load(configfile)
    return conf

def save_intermediate(blocklist: list, source: str, filedir: str):
    """Save a blocklist we've downloaded from a remote source

    Save a local copy of the remote blocklist after downloading it.
    """


if __name__ == '__main__':

    ap = argparse.ArgumentParser(description="Bulk blocklist tool",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    ap.add_argument('-c', '--config', default='/etc/default/fediblockhole.conf.toml', help="Config file")

    ap.add_argument('--loglevel', choices=['debug', 'info', 'warning', 'error', 'critical'], help="Set log output level.")

    args = ap.parse_args()
    if args.loglevel is not None:
        levelname = args.loglevel.upper()
        log.setLevel(getattr(logging, levelname))

    # Load the configuration file
    conf = load_config(args.config)
    
    # Do the work of syncing
    sync_blocklists(conf)
First working version. Only deals with instances directly, not files. Includes basic instructions on how to configure and use. Includes example config file. 2022-12-19 20:53:28 +00:00			`#!/usr/bin/python3`
			`# Export and import blocklists via API`

			`import argparse`
			`import toml`
			`import csv`
			`import requests`
			`import json`
			`import csv`
			`import time`

			`import logging`
			`logging.basicConfig(level=logging.DEBUG,`
			`format='%(asctime)s %(levelname)s %(message)s')`
			`import pprint`

			`log = logging.getLogger('fediblock_sync')`

			`CONFIGFILE = "/home/mastodon/etc/admin.conf"`

			`def sync_blocklists(conf: dict):`
			`"""Sync instance blocklists from remote sources.`
			`"""`
			`# Build a dict of blocklists we retrieve from remote sources.`
			`# We will merge these later using a merge algorithm we choose.`

			`blocklists = {}`
			`# Fetch blocklists from URLs`
			`# for listurl in conf.blocklist_csv_sources:`
			`# blocklists[listurl] = {}`
			`# response = requests.get(url)`
			`# log.debug(f"Fetched blocklist CSV file: {response.content}")`

			`# Fetch blocklists from remote instances`
			`for blocklist_src in conf['blocklist_instance_sources']:`
			`domain = blocklist_src['domain']`
			`token = blocklist_src['token']`
			`blocklists[domain] = fetch_instance_blocklist(token, domain)`

			`# Merge blocklists into an update dict`
			`merged = merge_blocklists(blocklists)`

			`# log.debug(f"Merged blocklist ready:\n")`
			`# pprint.pp(merged)`

			`# Push the blocklist to destination instances`
			`for dest in conf['blocklist_instance_destinations']:`
			`domain = dest['domain']`
			`token = dest['token']`
			`push_blocklist(token, domain, merged.values())`

			`def merge_blocklists(blocklists: dict, mergeplan: str='max') -> dict:`
			`"""Merge fetched remote blocklists into a bulk update`

			`@param mergeplan: An optional method of merging overlapping block definitions`
			`'max' (the default) uses the highest severity block found`
			`'min' uses the lowest severity block found`
			`"""`
			`# log.debug(f"Merging blocklists {blocklists} ...")`
			`# Remote blocklists may have conflicting overlaps. We need to`
			`# decide whether or not to override earlier entries with later`
			`# ones or not. How to choose which entry is 'correct'?`
			`merged = {}`

			`for key, blist in blocklists.items():`
			`log.debug(f"Adding blocks from {key} ...")`
			`for blockdef in blist:`
			`# log.debug(f"Checking blockdef {blockdef} ...")`
			`domain = blockdef['domain']`
			`if domain in merged:`
			`blockdata = merged[domain]`

			`# If the public or private comment is different,`
			`# append it to the existing comment, joined with a newline`
			`if blockdef['public_comment'] != blockdata['public_comment'] and blockdata['public_comment'] != '':`
			`blockdata['public_comment'] = '\n'.join([blockdef['public_comment'], blockdata['public_comment']])`

			`if blockdef['private_comment'] != blockdata['private_comment'] and blockdata['private_comment'] != '':`
			`blockdata['private_comment'] = '\n'.join([blockdef['private_comment'], blockdata['private_comment']])`

			`# How do we override an earlier block definition?`
			`if mergeplan in ['max', None]:`
			`# Use the highest block level found (the default)`
			`if blockdef['severity'] == 'suspend':`
			`blockdata['severity'] = 'suspend'`

			`if blockdef['reject_media'] == True:`
			`blockdata['reject_media'] = True`

			`if blockdef['reject_reports'] == True:`
			`blockdata['reject_reports'] = True`

			`elif mergeplan in ['min']:`
			`# Use the lowest block level found`
			`if blockdef['severity'] == 'silence':`
			`blockdata['severity'] = 'silence'`

			`if blockdef['reject_media'] == False:`
			`blockdata['reject_media'] = False`

			`if blockdef['reject_reports'] == False:`
			`blockdata['reject_reports'] = False`

			`else:`
			`raise NotImplementedError(f"Mergeplan '{mergeplan}' not implemented.")`

			`else:`
			`# New block`
			`blockdata = {`
			`'domain': blockdef['domain'],`
			`# Default to Silence if nothing is specified`
			`'severity': blockdef.get('severity', 'silence'),`
			`'public_comment': blockdef['public_comment'],`
			`'private_comment': blockdef['private_comment'],`
			`'reject_media': blockdef.get('reject_media', False),`
			`'reject_reports': blockdef.get('reject_reports', False),`
			`'obfuscate': blockdef.get('obfuscate', False),`
			`}`
			`merged[domain] = blockdata`

			`return merged`

			`def fetch_instance_blocklist(token: str, host: str) -> list:`
			`"""Fetch existing block list from server`
			`"""`
			`api_path = "/api/v1/admin/domain_blocks"`

			`url = f"https://{host}{api_path}"`

			`domain_blocks = []`
			`link = True`

			`while link:`
			`response = requests.get(url, headers={'Authorization': f"Bearer {token}"})`
			`if response.status_code != 200:`
			`log.error(f"Cannot fetch remote blocklist: {response.content}")`
			`raise ValueError("Unable to fetch domain block list: %s", response)`
			`domain_blocks.extend(json.loads(response.content))`

			`# Parse the link header to find the next url to fetch`
			`# This is a weird and janky way of doing pagination but`
			`# hey nothing we can do about it we just have to deal`
			`link = response.headers['Link']`
			`pagination = link.split(', ')`
			`if len(pagination) != 2:`
			`link = None`
			`break`
			`else:`
			`next = pagination[0]`
			`prev = pagination[1]`

			`urlstring, rel = next.split('; ')`
			`url = urlstring.strip('<').rstrip('>')`

			`log.debug(f"Found {len(domain_blocks)} existing domain blocks.")`
			`return domain_blocks`

			`def export_blocklist(token: str, host: str, outfile: str):`
			`"""Export current server blocklist to a csv file"""`
			`blocklist = fetch_instance_blocklist(token, host)`
			`fieldnames = ['id', 'domain', 'severity', 'reject_media', 'reject_reports', 'private_comment', 'public_comment', 'obfuscate']`

			`blocklist = sorted(blocklist, key=lambda x: int(x['id']))`

			`with open(outfile, "w") as fp:`
			`writer = csv.DictWriter(fp, fieldnames, extrasaction='ignore')`
			`writer.writeheader()`
			`writer.writerows(blocklist)`

			`def delete_blocklist(token: str, host: str, blockfile: str):`
			`"""Delete domain blocks listed in blockfile"""`
			`with open(blockfile) as fp:`
			`reader = csv.DictReader(fp)`
			`for row in reader:`
			`domain = row['domain']`
			`id = row['id']`
			`log.debug(f"Deleting {domain} (id: {id}) from blocklist...")`
			`delete_block(token, host, id)`

			`def delete_block(token: str, host: str, id: int):`
			`"""Remove a domain block"""`
			`log.debug(f"Removing domain block {id} at {host}...")`
			`api_path = "/api/v1/admin/domain_blocks/"`

			`url = f"https://{host}{api_path}{id}"`

			`response = requests.delete(url,`
			`headers={'Authorization': f"Bearer {token}"}`
			`)`
			`if response.status_code != 200:`
			`if response.status_code == 404:`
			`log.warn(f"No such domain block: {id}")`
			`return`

			`raise ValueError(f"Something went wrong: {response.status_code}: {response.content}")`

			`def update_known_block(token: str, host: str, blockdict: dict):`
			`"""Update an existing domain block with information in blockdict"""`
			`api_path = "/api/v1/admin/domain_blocks/"`

			`id = blockdict['id']`
			`blockdata = blockdict.copy()`
			`del blockdata['id']`

			`url = f"https://{host}{api_path}{id}"`

			`response = requests.put(url,`
			`headers={'Authorization': f"Bearer {token}"},`
			`data=blockdata`
			`)`
			`if response.status_code != 200:`
			`raise ValueError(f"Something went wrong: {response.status_code}: {response.content}")`

			`def add_block(token: str, host: str, blockdata: dict):`
			`"""Block a domain on Mastodon host`
			`"""`
			`log.debug(f"Blocking domain {blockdata['domain']} at {host}...")`
			`api_path = "/api/v1/admin/domain_blocks"`

			`url = f"https://{host}{api_path}"`

			`response = requests.post(url,`
			`headers={'Authorization': f"Bearer {token}"},`
			`data=blockdata`
			`)`
			`if response.status_code != 200:`
			`raise ValueError(f"Something went wrong: {response.status_code}: {response.content}")`

			`def push_blocklist(token: str, host: str, blocklist: list[dict]):`
			`"""Push a blocklist to a remote instance.`

			`Merging the blocklist with the existing list the instance has,`
			`updating existing entries if they exist.`

			`@param token: The Bearer token for OAUTH API authentication`
			`@param host: The instance host, FQDN or IP`
			`@param blocklist: A list of block definitions. They must include the domain.`
			`"""`
			`# Fetch the existing blocklist from the instance`
			`serverblocks = fetch_instance_blocklist(token, host)`

			`# Convert serverblocks to a dictionary keyed by domain name`
			`knownblocks = {row['domain']: row for row in serverblocks}`

			`for row in blocklist:`
			`# log.debug(f"Importing definition: {row}")`

			`if 'id' in row: del row['id']`

			`try:`
			`blockdict = knownblocks[row['domain']]`
			`log.info(f"Block already exists for {row['domain']}, merging data...")`

			`# Check if anything is actually different and needs updating`
			`change_needed = False`
			`for key in [`
			`'severity',`
			`'public_comment',`
			`'private_comment',`
			`'reject_media',`
			`'reject_reports',`
			`'obfuscate',`
			`]:`
			`try:`
			`if blockdict[key] != knownblocks[key]:`
			`change_needed = True`
			`break`
			`except KeyError:`
			`break`

			`if change_needed:`
			`log.debug(f"Change detected. Updating domain block for {row['domain']}")`
			`blockdict.update(row)`
			`update_known_block(token, host, blockdict)`
			`# add a pause here so we don't melt the instance`
			`time.sleep(1)`

			`else:`
			`log.info("No differences detected. Not updating.")`

			`except KeyError:`
			`# domain doesn't have an entry, so we need to create one`
			`blockdata = {`
			`'domain': row['domain'],`
			`# Default to Silence if nothing is specified`
			`'severity': row.get('severity', 'silence'),`
			`'public_comment': row['public_comment'],`
			`'private_comment': row['private_comment'],`
			`'reject_media': row.get('reject_media', False),`
			`'reject_reports': row.get('reject_reports', False),`
			`'obfuscate': row.get('obfuscate', False),`
			`}`
			`log.info(f"Adding new block for {blockdata['domain']}...")`
			`add_block(token, host, blockdata)`
			`# add a pause here so we don't melt the instance`
			`time.sleep(1)`

			`def load_config(configfile: str):`
			`"""Augment commandline arguments with config file parameters`

			`Config file is expected to be in TOML format`
			`"""`
			`conf = toml.load(configfile)`
			`return conf`

			`def save_intermediate(blocklist: list, source: str, filedir: str):`
			`"""Save a blocklist we've downloaded from a remote source`

			`Save a local copy of the remote blocklist after downloading it.`
			`"""`


			`if __name__ == '__main__':`

			`ap = argparse.ArgumentParser(description="Bulk blocklist tool",`
			`formatter_class=argparse.ArgumentDefaultsHelpFormatter)`
			`ap.add_argument('-c', '--config', default='/etc/default/fediblockhole.conf.toml', help="Config file")`

			`ap.add_argument('--loglevel', choices=['debug', 'info', 'warning', 'error', 'critical'], help="Set log output level.")`

			`args = ap.parse_args()`
			`if args.loglevel is not None:`
			`levelname = args.loglevel.upper()`
			`log.setLevel(getattr(logging, levelname))`

			`# Load the configuration file`
			`conf = load_config(args.config)`

			`# Do the work of syncing`
			`sync_blocklists(conf)`