Added ability to fetch blocklist CSVs via URL.

Added ability to save blocklists to file.
Added ability to skip fetch and push actions.
Justin Warren 2022-12-20 10:10:35 +11:00
parent c8e8aa50dc
commit 96a48ec633
3 changed files with 143 additions and 65 deletions


@@ -6,13 +6,16 @@ import toml
 import csv
 import requests
 import json
-import csv
 import time
+import os.path
+import urllib.request as urlr

 import logging
-logging.basicConfig(level=logging.DEBUG,
+logging.basicConfig(level=logging.INFO,
     format='%(asctime)s %(levelname)s %(message)s')
-import pprint
+
+# Max size of a URL-fetched blocklist
+URL_BLOCKLIST_MAXSIZE = 1024 ** 3

 log = logging.getLogger('fediblock_sync')
@@ -20,34 +23,49 @@ CONFIGFILE = "/home/mastodon/etc/admin.conf"
 def sync_blocklists(conf: dict):
     """Sync instance blocklists from remote sources.
+
+    @param conf: A configuration dictionary
     """
     # Build a dict of blocklists we retrieve from remote sources.
     # We will merge these later using a merge algorithm we choose.
     blocklists = {}

     # Fetch blocklists from URLs
-    # for listurl in conf.blocklist_csv_sources:
-    #     blocklists[listurl] = {}
-    #     response = requests.get(url)
-    #     log.debug(f"Fetched blocklist CSV file: {response.content}")
+    if not conf.no_fetch_url:
+        log.info("Fetching domain blocks from URLs...")
+        for listurl in conf.blocklist_url_sources:
+            blocklists[listurl] = []
+            with urlr.urlopen(listurl) as fp:
+                rawdata = fp.read(URL_BLOCKLIST_MAXSIZE).decode('utf-8')
+                reader = csv.DictReader(rawdata.split('\n'))
+                for row in reader:
+                    blocklists[listurl].append(row)
+            if conf.save_intermediate:
+                save_intermediate_blocklist(blocklists[listurl], listurl, conf.savedir)

     # Fetch blocklists from remote instances
-    for blocklist_src in conf['blocklist_instance_sources']:
-        domain = blocklist_src['domain']
-        token = blocklist_src['token']
-        blocklists[domain] = fetch_instance_blocklist(token, domain)
+    if not conf.no_fetch_instance:
+        log.info("Fetching domain blocks from instances...")
+        for blocklist_src in conf.blocklist_instance_sources:
+            domain = blocklist_src['domain']
+            token = blocklist_src['token']
+            blocklists[domain] = fetch_instance_blocklist(token, domain)
+            if conf.save_intermediate:
+                save_intermediate_blocklist(blocklists[domain], domain, conf.savedir)

     # Merge blocklists into an update dict
     merged = merge_blocklists(blocklists)
-
-    # log.debug(f"Merged blocklist ready:\n")
-    # pprint.pp(merged)
+    if conf.blocklist_savefile:
+        log.info(f"Saving merged blocklist to {conf.blocklist_savefile}")
+        save_blocklist_to_file(merged.values(), conf.blocklist_savefile)

     # Push the blocklist to destination instances
-    for dest in conf['blocklist_instance_destinations']:
-        domain = dest['domain']
-        token = dest['token']
-        push_blocklist(token, domain, merged.values())
+    if not conf.no_push_instance:
+        log.info("Pushing domain blocks to instances...")
+        for dest in conf.blocklist_instance_destinations:
+            domain = dest['domain']
+            token = dest['token']
+            push_blocklist(token, domain, merged.values())
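
The URL fetch path added above boils down to: read the whole document from the URL, capped at URL_BLOCKLIST_MAXSIZE, then hand the text to csv.DictReader, which accepts any iterable of lines. A standalone sketch of the same technique, assuming Python 3.9+ (the helper name and the commented example URL are illustrative, not part of this commit):

import csv
import urllib.request as urlr

URL_BLOCKLIST_MAXSIZE = 1024 ** 3  # 1 GiB cap on fetched lists, as above

def fetch_csv_blocklist(listurl: str) -> list[dict]:
    """Fetch a CSV blocklist from a URL and parse it into one dict per row."""
    with urlr.urlopen(listurl) as fp:
        rawdata = fp.read(URL_BLOCKLIST_MAXSIZE).decode('utf-8')
    # DictReader takes any iterable of lines, so splitting the raw text
    # on newlines avoids writing a temporary file first.
    return list(csv.DictReader(rawdata.split('\n')))

# urlopen() also handles file:// URLs, which makes local testing easy:
# blocks = fetch_csv_blocklist('file:///etc/fediblockhole/blocklist-01.csv')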

 def merge_blocklists(blocklists: dict, mergeplan: str='max') -> dict:
     """Merge fetched remote blocklists into a bulk update

@@ -56,14 +74,10 @@ def merge_blocklists(blocklists: dict, mergeplan: str='max') -> dict:
     'max' (the default) uses the highest severity block found
     'min' uses the lowest severity block found
     """
-    # log.debug(f"Merging blocklists {blocklists} ...")
-
-    # Remote blocklists may have conflicting overlaps. We need to
-    # decide whether or not to override earlier entries with later
-    # ones or not. How to choose which entry is 'correct'?
     merged = {}

     for key, blist in blocklists.items():
-        log.debug(f"Adding blocks from {key} ...")
+        log.debug(f"Merging blocks from {key} ...")
         for blockdef in blist:
             # log.debug(f"Checking blockdef {blockdef} ...")
             domain = blockdef['domain']
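
The severity comparison itself sits outside this hunk. A minimal sketch of how a 'max'/'min' mergeplan can resolve two conflicting blocks, assuming Mastodon's three severities rank noop < silence < suspend (the ranking table and helper below are illustrative, not code from this commit):

# Assumed ordering of Mastodon block severities, mildest to harshest.
SEVERITY_RANK = {'noop': 0, 'silence': 1, 'suspend': 2}

def resolve_severity(current: str, candidate: str, mergeplan: str = 'max') -> str:
    """Pick between two conflicting severities according to the mergeplan."""
    pair = sorted((current, candidate), key=lambda s: SEVERITY_RANK[s])
    return pair[1] if mergeplan == 'max' else pair[0]

assert resolve_severity('silence', 'suspend') == 'suspend'         # 'max' keeps the harsher block
assert resolve_severity('silence', 'suspend', 'min') == 'silence'  # 'min' keeps the milder one
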
@@ -122,7 +136,12 @@ def merge_blocklists(blocklists: dict, mergeplan: str='max') -> dict:

 def fetch_instance_blocklist(token: str, host: str) -> list:
     """Fetch existing block list from server
+
+    @param token: The OAuth Bearer token to authenticate with.
+    @param host: The remote host to connect to.
+    @returns: A list of the admin domain blocks from the instance.
     """
+    log.info(f"Fetching instance blocklist from {host} ...")
     api_path = "/api/v1/admin/domain_blocks"
     url = f"https://{host}{api_path}"
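
For context, fetching that endpoint is an authenticated GET with the Bearer token. A minimal single-page sketch using requests, which the script already imports (pagination handling is elided here; the helper name is illustrative, not code from this commit):

import requests

def get_domain_blocks_page(token: str, host: str) -> list[dict]:
    """Fetch one page of admin domain blocks from a Mastodon instance."""
    url = f"https://{host}/api/v1/admin/domain_blocks"
    response = requests.get(url, headers={'Authorization': f"Bearer {token}"})
    response.raise_for_status()  # surface auth/permission errors early
    return response.json()
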
@@ -155,28 +174,6 @@ def fetch_instance_blocklist(token: str, host: str) -> list:
     log.debug(f"Found {len(domain_blocks)} existing domain blocks.")
     return domain_blocks

-def export_blocklist(token: str, host: str, outfile: str):
-    """Export current server blocklist to a csv file"""
-    blocklist = fetch_instance_blocklist(token, host)
-    fieldnames = ['id', 'domain', 'severity', 'reject_media', 'reject_reports', 'private_comment', 'public_comment', 'obfuscate']
-    blocklist = sorted(blocklist, key=lambda x: int(x['id']))
-    with open(outfile, "w") as fp:
-        writer = csv.DictWriter(fp, fieldnames, extrasaction='ignore')
-        writer.writeheader()
-        writer.writerows(blocklist)
-
-def delete_blocklist(token: str, host: str, blockfile: str):
-    """Delete domain blocks listed in blockfile"""
-    with open(blockfile) as fp:
-        reader = csv.DictReader(fp)
-        for row in reader:
-            domain = row['domain']
-            id = row['id']
-            log.debug(f"Deleting {domain} (id: {id}) from blocklist...")
-            delete_block(token, host, id)
-
 def delete_block(token: str, host: str, id: int):
     """Remove a domain block"""
     log.debug(f"Removing domain block {id} at {host}...")
@@ -236,6 +233,7 @@ def push_blocklist(token: str, host: str, blocklist: list[dict]):
     @param host: The instance host, FQDN or IP
     @param blocklist: A list of block definitions. They must include the domain.
     """
+    log.info(f"Pushing blocklist to host {host} ...")
     # Fetch the existing blocklist from the instance
     serverblocks = fetch_instance_blocklist(token, host)
@@ -249,7 +247,7 @@ def push_blocklist(token: str, host: str, blocklist: list[dict]):
         try:
             blockdict = knownblocks[row['domain']]
-            log.info(f"Block already exists for {row['domain']}, merging data...")
+            log.debug(f"Block already exists for {row['domain']}, merging data...")

             # Check if anything is actually different and needs updating
             change_needed = False
@@ -276,7 +274,7 @@ def push_blocklist(token: str, host: str, blocklist: list[dict]):
                 time.sleep(1)

             else:
-                log.info("No differences detected. Not updating.")
+                log.debug("No differences detected. Not updating.")

         except KeyError:
             # domain doesn't have an entry, so we need to create one
@@ -303,12 +301,59 @@ def load_config(configfile: str):
     conf = toml.load(configfile)
     return conf

-def save_intermediate(blocklist: list, source: str, filedir: str):
+def save_intermediate_blocklist(blocklist: list[dict], source: str, filedir: str):
+    """Save a local copy of a blocklist we've downloaded
+    """
+    # Invent a filename based on the remote source
+    # If the source was a URL, convert it to something less messy
+    # If the source was a remote domain, just use the name of the domain
+    log.debug(f"Saving intermediate blocklist from {source}")
+    source = source.replace('/','-')
+    filename = f"{source}.csv"
+    filepath = os.path.join(filedir, filename)
+    save_blocklist_to_file(blocklist, filepath)
+
+def save_blocklist_to_file(blocklist: list[dict], filepath: str):
     """Save a blocklist we've downloaded from a remote source
-    Save a local copy of the remote blocklist after downloading it.
+
+    @param blocklist: A dictionary of block definitions, keyed by domain
+    @param filepath: The path to the file the list should be saved in.
     """
+    blocklist = sorted(blocklist, key=lambda x: x['domain'])
+
+    fieldnames = ['domain', 'severity', 'private_comment', 'public_comment', 'reject_media', 'reject_reports', 'obfuscate']
+    with open(filepath, "w") as fp:
+        writer = csv.DictWriter(fp, fieldnames, extrasaction='ignore')
+        writer.writeheader()
+        writer.writerows(blocklist)
+
+def augment_args(args):
+    """Augment commandline arguments with config file parameters"""
+    conf = toml.load(args.config)
+
+    if not args.no_fetch_url:
+        args.no_fetch_url = conf.get('no_fetch_url', False)
+
+    if not args.no_fetch_instance:
+        args.no_fetch_instance = conf.get('no_fetch_instance', False)
+
+    if not args.no_push_instance:
+        args.no_push_instance = conf.get('no_push_instance', False)
+
+    if not args.blocklist_savefile:
+        args.blocklist_savefile = conf.get('blocklist_savefile', None)
+
+    if not args.save_intermediate:
+        args.save_intermediate = conf.get('save_intermediate', False)
+
+    if not args.savedir:
+        args.savedir = conf.get('savedir', '/tmp')
+
+    args.blocklist_url_sources = conf.get('blocklist_url_sources')
+    args.blocklist_instance_sources = conf.get('blocklist_instance_sources')
+    args.blocklist_instance_destinations = conf.get('blocklist_instance_destinations')
+
+    return args

 if __name__ == '__main__':
@@ -316,6 +361,14 @@ if __name__ == '__main__':
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     ap.add_argument('-c', '--config', default='/etc/default/fediblockhole.conf.toml', help="Config file")
+    ap.add_argument('-o', '--outfile', dest="blocklist_savefile", help="Save merged blocklist to a local file.")
+    ap.add_argument('-S', '--save-intermediate', dest="save_intermediate", action='store_true', help="Save intermediate blocklists we fetch to local files.")
+    ap.add_argument('-D', '--savedir', dest="savedir", help="Directory path to save intermediate lists.")
+
+    ap.add_argument('--no-fetch-url', dest='no_fetch_url', action='store_true', help="Don't fetch from URLs, even if configured.")
+    ap.add_argument('--no-fetch-instance', dest='no_fetch_instance', action='store_true', help="Don't fetch from instances, even if configured.")
+    ap.add_argument('--no-push-instance', dest='no_push_instance', action='store_true', help="Don't push to instances, even if configured.")
+
     ap.add_argument('--loglevel', choices=['debug', 'info', 'warning', 'error', 'critical'], help="Set log output level.")

     args = ap.parse_args()
@@ -324,7 +377,7 @@ if __name__ == '__main__':
         log.setLevel(getattr(logging, levelname))

     # Load the configuration file
-    conf = load_config(args.config)
+    args = augment_args(args)

     # Do the work of syncing
-    sync_blocklists(conf)
+    sync_blocklists(args)
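
The net effect of augment_args() is that an explicit commandline value wins, otherwise the config file value applies, otherwise a built-in default (False for the skip flags, '/tmp' for savedir). A toy demonstration of that precedence (the helper and the example path are illustrative, not code from this commit):

from types import SimpleNamespace

def resolve_option(cli_value, conf: dict, key: str, default):
    """Commandline beats config file beats built-in default, as augment_args() does."""
    if cli_value:  # a falsy value means the option wasn't given on the commandline
        return cli_value
    return conf.get(key, default)

args = SimpleNamespace(savedir=None)          # no -D/--savedir on the commandline
conf = {'savedir': '/var/lib/fediblockhole'}  # hypothetical config file value
print(resolve_option(args.savedir, conf, 'savedir', '/tmp'))  # -> /var/lib/fediblockhole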


@@ -1,20 +1,35 @@
 # List of instances to read blocklists from,
 # with the Bearer token authorised by the instance
 blocklist_instance_sources = [
-    {
-        domain = 'eigenmagic.net',
-        token = '<a_token_with_read_auth>'
-    },
-    {
-        domain = 'jorts.horse',
-        token = '<a_different_token>'
-    },
+    { domain = 'eigenmagic.net', token = '<a_token_with_read_auth>' },
+    { domain = 'jorts.horse', token = '<a_different_token>' },
 ]

+# List of URLs to read csv blocklists from
+blocklist_url_sources = [
+    'file:///etc/fediblockhole/blocklist-01.csv',
+    'https://github.com/fediblockhole/samples/demo-blocklist-01.csv',
+]
+
 # List of instances to write blocklist to
 blocklist_instance_destinations = [
-    {
-        domain = 'eigenmagic.net',
-        token = '<read_write_token>'
-    },
+    { domain = 'eigenmagic.net', token = '<read_write_token>' },
 ]
+
+## Store a local copy of the remote blocklists after we fetch them
+# save_intermediate = true
+
+## Directory to store the local blocklist copies
+# savedir = '/tmp'
+
+## File to save the fully merged blocklist into
+# blocklist_savefile = '/tmp/merged_blocklist.csv'
+
+## Don't push blocklist to instances, even if they're defined above
+# no_push_instance = false
+
+## Don't fetch blocklists from URLs, even if they're defined above
+# no_fetch_url = false
+
+## Don't fetch blocklists from instances, even if they're defined above
+# no_fetch_instance = false
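
Because the new options are plain top-level keys, they read back with toml.load exactly the way augment_args() consumes them. A quick sanity check, assuming the config lives at the script's default path (keys left commented out are simply absent, so .get() supplies the defaults):

import toml

conf = toml.load('/etc/default/fediblockhole.conf.toml')
print(conf.get('no_push_instance', False))         # False unless uncommented
print(conf.get('savedir', '/tmp'))                 # '/tmp' unless overridden
print(len(conf.get('blocklist_url_sources', [])))  # number of URL sources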


@@ -0,0 +1,10 @@
+"domain","severity","reject_media","reject_reports","private_comment","public_comment","obfuscate"
+"qoto.org","suspend",TRUE,TRUE,,,TRUE
+"sealion.club","suspend",TRUE,TRUE,,,TRUE
+"develop.gab.com","suspend",TRUE,TRUE,,,TRUE
+"gab.ai","suspend",TRUE,TRUE,,,TRUE
+"gab.sleeck.eu","suspend",TRUE,TRUE,,,TRUE
+"gab.com","suspend",TRUE,TRUE,,,TRUE
+"kiwifarms.is","suspend",TRUE,TRUE,,,TRUE
+"kiwifarms.net","suspend",TRUE,TRUE,,,TRUE
+"gabfed.com","suspend",TRUE,TRUE,,,TRUE