From 96a48ec633beaad6658c75a1135c2e40a782ec0a Mon Sep 17 00:00:00 2001 From: Justin Warren Date: Tue, 20 Dec 2022 10:10:35 +1100 Subject: [PATCH] Added ability to fetch blocklist CSVs via URL. Added ability to save blocklists to file. Added ability to skip fetch and push actions. --- bin/fediblock_sync.py | 157 +++++++++++++++++++---------- etc/sample.fediblockhole.conf.toml | 41 +++++--- samples/demo-blocklist-01.csv | 10 ++ 3 files changed, 143 insertions(+), 65 deletions(-) create mode 100644 samples/demo-blocklist-01.csv diff --git a/bin/fediblock_sync.py b/bin/fediblock_sync.py index b8f565a..4da4404 100755 --- a/bin/fediblock_sync.py +++ b/bin/fediblock_sync.py @@ -6,13 +6,16 @@ import toml import csv import requests import json -import csv import time +import os.path +import urllib.request as urlr import logging -logging.basicConfig(level=logging.DEBUG, +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') -import pprint + +# Max size of a URL-fetched blocklist +URL_BLOCKLIST_MAXSIZE = 1024 ** 3 log = logging.getLogger('fediblock_sync') @@ -20,34 +23,49 @@ CONFIGFILE = "/home/mastodon/etc/admin.conf" def sync_blocklists(conf: dict): """Sync instance blocklists from remote sources. + + @param conf: A configuration dictionary """ # Build a dict of blocklists we retrieve from remote sources. # We will merge these later using a merge algorithm we choose. 
blocklists = {} # Fetch blocklists from URLs - # for listurl in conf.blocklist_csv_sources: - # blocklists[listurl] = {} - # response = requests.get(url) - # log.debug(f"Fetched blocklist CSV file: {response.content}") + if not conf.no_fetch_url: + log.info("Fetching domain blocks from URLs...") + for listurl in conf.blocklist_url_sources: + blocklists[listurl] = [] + with urlr.urlopen(listurl) as fp: + rawdata = fp.read(URL_BLOCKLIST_MAXSIZE).decode('utf-8') + reader = csv.DictReader(rawdata.split('\n')) + for row in reader: + blocklists[listurl].append(row) + if conf.save_intermediate: + save_intermediate_blocklist(blocklists[listurl], listurl, conf.savedir) # Fetch blocklists from remote instances - for blocklist_src in conf['blocklist_instance_sources']: - domain = blocklist_src['domain'] - token = blocklist_src['token'] - blocklists[domain] = fetch_instance_blocklist(token, domain) + if not conf.no_fetch_instance: + log.info("Fetching domain blocks from instances...") + for blocklist_src in conf.blocklist_instance_sources: + domain = blocklist_src['domain'] + token = blocklist_src['token'] + blocklists[domain] = fetch_instance_blocklist(token, domain) + if conf.save_intermediate: + save_intermediate_blocklist(blocklists[domain], domain, conf.savedir) # Merge blocklists into an update dict merged = merge_blocklists(blocklists) - - # log.debug(f"Merged blocklist ready:\n") - # pprint.pp(merged) + if conf.blocklist_savefile: + log.info(f"Saving merged blocklist to {conf.blocklist_savefile}") + save_blocklist_to_file(merged.values(), conf.blocklist_savefile) # Push the blocklist to destination instances - for dest in conf['blocklist_instance_destinations']: - domain = dest['domain'] - token = dest['token'] - push_blocklist(token, domain, merged.values()) + if not conf.no_push_instance: + log.info("Pushing domain blocks to instances...") + for dest in conf.blocklist_instance_destinations: + domain = dest['domain'] + token = dest['token'] + push_blocklist(token, 
domain, merged.values()) def merge_blocklists(blocklists: dict, mergeplan: str='max') -> dict: """Merge fetched remote blocklists into a bulk update @@ -56,14 +74,10 @@ def merge_blocklists(blocklists: dict, mergeplan: str='max') -> dict: 'max' (the default) uses the highest severity block found 'min' uses the lowest severity block found """ - # log.debug(f"Merging blocklists {blocklists} ...") - # Remote blocklists may have conflicting overlaps. We need to - # decide whether or not to override earlier entries with later - # ones or not. How to choose which entry is 'correct'? merged = {} for key, blist in blocklists.items(): - log.debug(f"Adding blocks from {key} ...") + log.debug(f"Merging blocks from {key} ...") for blockdef in blist: # log.debug(f"Checking blockdef {blockdef} ...") domain = blockdef['domain'] @@ -122,7 +136,12 @@ def merge_blocklists(blocklists: dict, mergeplan: str='max') -> dict: def fetch_instance_blocklist(token: str, host: str) -> list: """Fetch existing block list from server + + @param token: The OAuth Bearer token to authenticate with. + @param host: The remote host to connect to. + @returns: A list of the admin domain blocks from the instance. 
""" + log.info(f"Fetching instance blocklist from {host} ...") api_path = "/api/v1/admin/domain_blocks" url = f"https://{host}{api_path}" @@ -155,28 +174,6 @@ def fetch_instance_blocklist(token: str, host: str) -> list: log.debug(f"Found {len(domain_blocks)} existing domain blocks.") return domain_blocks -def export_blocklist(token: str, host: str, outfile: str): - """Export current server blocklist to a csv file""" - blocklist = fetch_instance_blocklist(token, host) - fieldnames = ['id', 'domain', 'severity', 'reject_media', 'reject_reports', 'private_comment', 'public_comment', 'obfuscate'] - - blocklist = sorted(blocklist, key=lambda x: int(x['id'])) - - with open(outfile, "w") as fp: - writer = csv.DictWriter(fp, fieldnames, extrasaction='ignore') - writer.writeheader() - writer.writerows(blocklist) - -def delete_blocklist(token: str, host: str, blockfile: str): - """Delete domain blocks listed in blockfile""" - with open(blockfile) as fp: - reader = csv.DictReader(fp) - for row in reader: - domain = row['domain'] - id = row['id'] - log.debug(f"Deleting {domain} (id: {id}) from blocklist...") - delete_block(token, host, id) - def delete_block(token: str, host: str, id: int): """Remove a domain block""" log.debug(f"Removing domain block {id} at {host}...") @@ -236,6 +233,7 @@ def push_blocklist(token: str, host: str, blocklist: list[dict]): @param host: The instance host, FQDN or IP @param blocklist: A list of block definitions. They must include the domain. 
""" + log.info(f"Pushing blocklist to host {host} ...") # Fetch the existing blocklist from the instance serverblocks = fetch_instance_blocklist(token, host) @@ -249,7 +247,7 @@ def push_blocklist(token: str, host: str, blocklist: list[dict]): try: blockdict = knownblocks[row['domain']] - log.info(f"Block already exists for {row['domain']}, merging data...") + log.debug(f"Block already exists for {row['domain']}, merging data...") # Check if anything is actually different and needs updating change_needed = False @@ -276,7 +274,7 @@ def push_blocklist(token: str, host: str, blocklist: list[dict]): time.sleep(1) else: - log.info("No differences detected. Not updating.") + log.debug("No differences detected. Not updating.") except KeyError: # domain doesn't have an entry, so we need to create one @@ -303,12 +301,59 @@ def load_config(configfile: str): conf = toml.load(configfile) return conf -def save_intermediate(blocklist: list, source: str, filedir: str): +def save_intermediate_blocklist(blocklist: list[dict], source: str, filedir: str): + """Save a local copy of a blocklist we've downloaded + """ + # Invent a filename based on the remote source + # If the source was a URL, convert it to something less messy + # If the source was a remote domain, just use the name of the domain + log.debug(f"Saving intermediate blocklist from {source}") + source = source.replace('/','-') + filename = f"{source}.csv" + filepath = os.path.join(filedir, filename) + save_blocklist_to_file(blocklist, filepath) + +def save_blocklist_to_file(blocklist: list[dict], filepath: str): """Save a blocklist we've downloaded from a remote source - Save a local copy of the remote blocklist after downloading it. + @param blocklist: A dictionary of block definitions, keyed by domain + @param filepath: The path to the file the list should be saved in. 
""" + blocklist = sorted(blocklist, key=lambda x: x['domain']) + fieldnames = ['domain', 'severity', 'private_comment', 'public_comment', 'reject_media', 'reject_reports', 'obfuscate'] + with open(filepath, "w") as fp: + writer = csv.DictWriter(fp, fieldnames, extrasaction='ignore') + writer.writeheader() + writer.writerows(blocklist) + +def augment_args(args): + """Augment commandline arguments with config file parameters""" + conf = toml.load(args.config) + + if not args.no_fetch_url: + args.no_fetch_url = conf.get('no_fetch_url', False) + + if not args.no_fetch_instance: + args.no_fetch_instance = conf.get('no_fetch_instance', False) + + if not args.no_push_instance: + args.no_push_instance = conf.get('no_push_instance', False) + + if not args.blocklist_savefile: + args.blocklist_savefile = conf.get('blocklist_savefile', None) + + if not args.save_intermediate: + args.save_intermediate = conf.get('save_intermediate', False) + + if not args.savedir: + args.savedir = conf.get('savedir', '/tmp') + + args.blocklist_url_sources = conf.get('blocklist_url_sources') + args.blocklist_instance_sources = conf.get('blocklist_instance_sources') + args.blocklist_instance_destinations = conf.get('blocklist_instance_destinations') + + return args if __name__ == '__main__': @@ -316,6 +361,14 @@ if __name__ == '__main__': formatter_class=argparse.ArgumentDefaultsHelpFormatter) ap.add_argument('-c', '--config', default='/etc/default/fediblockhole.conf.toml', help="Config file") + ap.add_argument('-o', '--outfile', dest="blocklist_savefile", help="Save merged blocklist to a local file.") + ap.add_argument('-S', '--save-intermediate', dest="save_intermediate", action='store_true', help="Save intermediate blocklists we fetch to local files.") + ap.add_argument('-D', '--savedir', dest="savedir", help="Directory path to save intermediate lists.") + + ap.add_argument('--no-fetch-url', dest='no_fetch_url', action='store_true', help="Don't fetch from URLs, even if configured.") + 
ap.add_argument('--no-fetch-instance', dest='no_fetch_instance', action='store_true', help="Don't fetch from instances, even if configured.") + ap.add_argument('--no-push-instance', dest='no_push_instance', action='store_true', help="Don't push to instances, even if configured.") + ap.add_argument('--loglevel', choices=['debug', 'info', 'warning', 'error', 'critical'], help="Set log output level.") args = ap.parse_args() @@ -324,7 +377,7 @@ if __name__ == '__main__': log.setLevel(getattr(logging, levelname)) # Load the configuration file - conf = load_config(args.config) - + args = augment_args(args) + # Do the work of syncing - sync_blocklists(conf) \ No newline at end of file + sync_blocklists(args) \ No newline at end of file diff --git a/etc/sample.fediblockhole.conf.toml b/etc/sample.fediblockhole.conf.toml index fff89e0..836e18c 100644 --- a/etc/sample.fediblockhole.conf.toml +++ b/etc/sample.fediblockhole.conf.toml @@ -1,20 +1,35 @@ # List of instances to read blocklists from, # with the Bearer token authorised by the instance blocklist_instance_sources = [ - { - domain = 'eigenmagic.net', - token = '' - }, - { - domain = 'jorts.horse', - token = '' - }, + { domain = 'eigenmagic.net', token = '' }, + { domain = 'jorts.horse', token = '' }, +] + +# List of URLs to read csv blocklists from +blocklist_url_sources = [ + 'file:///etc/fediblockhole/blocklist-01.csv', + 'https://github.com/fediblockhole/samples/demo-blocklist-01.csv', ] # List of instances to write blocklist to blocklist_instance_destinations = [ - { - domain = 'eigenmagic.net', - token = '' - }, -] \ No newline at end of file + { domain = 'eigenmagic.net', token = '' }, +] + +## Store a local copy of the remote blocklists after we fetch them +# save_intermediate = true + +## Directory to store the local blocklist copies +# savedir = '/tmp' + +## File to save the fully merged blocklist into +# blocklist_savefile = '/tmp/merged_blocklist.csv' + +## Don't push blocklist to instances, even if they're 
defined above +# no_push_instance = false + +## Don't fetch blocklists from URLs, even if they're defined above +# no_fetch_url = false + +## Don't fetch blocklists from instances, even if they're defined above +# no_fetch_instance = false \ No newline at end of file diff --git a/samples/demo-blocklist-01.csv b/samples/demo-blocklist-01.csv new file mode 100644 index 0000000..7b9345b --- /dev/null +++ b/samples/demo-blocklist-01.csv @@ -0,0 +1,10 @@ +"domain","severity","reject_media","reject_reports","private_comment","public_comment","obfuscate" +"qoto.org","suspend",TRUE,TRUE,,,TRUE +"sealion.club","suspend",TRUE,TRUE,,,TRUE +"develop.gab.com","suspend",TRUE,TRUE,,,TRUE +"gab.ai","suspend",TRUE,TRUE,,,TRUE +"gab.sleeck.eu","suspend",TRUE,TRUE,,,TRUE +"gab.com","suspend",TRUE,TRUE,,,TRUE +"kiwifarms.is","suspend",TRUE,TRUE,,,TRUE +"kiwifarms.net","suspend",TRUE,TRUE,,,TRUE +"gabfed.com","suspend",TRUE,TRUE,,,TRUE \ No newline at end of file