#!/bin/bash
# Misskey Safety Scan: verify-safety
#
# Re-checks the instance descriptions flagged by the federated
# instance scan, using an LLM to determine whether each instance is
# likely to host material considered inappropriate by most people.
#
# This doesn't mean the content on other instances is illegal or
# morally wrong, but it does mean that the instance is a good
# candidate for manual analysis by the local instance administrator.
#
# Usage Instructions:
#
# 1. Run the scan-federated-instances script first; it produces
#    unsafe.csv.
# 2. Execute this script like "verify-safety <unsafe.csv> <model>"
#    (see the example invocation at the end of this header).
# 3. Wait a really long time.
#
# Output:
# - unsafe-verified.csv: Instances whose descriptions were judged
#   inappropriate on every repeated evaluation.
#
# What to do with output:
#
# 1. Take appropriate action against instances that you feel promote
#    content that is against your instance's terms of service, or is
#    illegal.
#
# 2. Entries whose repeated evaluations disagreed are only printed to
#    the terminal with an "unknown" status; review those manually.
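#
# Example invocation (a sketch; the model name below is only a
# placeholder for whatever Llama-Guard-style model your aichat
# configuration exposes):
#
#   ./verify-safety unsafe.csv llama-guard3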
###### MAIN CONFIG ######
export CSV_FILE="$1"    # unsafe.csv produced by the scan script
export LLM_MODEL="$2"   # model name as configured in aichat
export UNSAFE="unsafe-verified.csv"
#########################
#### LLM INFORMATION ####
export MAX_REQUESTS=4   # number of instance evaluations run in parallel
read -r -d '' DESCRIPTION_PROMPT <<'EOF'
Here is the description of a Mastodon instance.
Evaluate if it is safe or not:
EOF
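# The parsing below assumes the model replies in Llama Guard style,
# e.g.:
#   safe
# or:
#   unsafe
#   S3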
#########################
# Run inference multiple times to account for variance in LLM
# responses.
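# Prints "safe" on its own, "unsafe" with the hazard code on a second
# line, or (when the runs disagree) the raw responses joined onto one
# line so the caller reports the result as unknown.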
get_llm_response() {
    prompt="$1"
    llm_output1=$(echo "$prompt" | aichat -S -m "$LLM_MODEL" | head -n 2)
    llm_output2=$(echo "$prompt" | aichat -S -m "$LLM_MODEL" | head -n 2)
    llm_output3=$(echo "$prompt" | aichat -S -m "$LLM_MODEL" | head -n 2)
    safety_code1=$(echo "$llm_output1" | tail -n 1)
    safety_status1=$(echo "$llm_output1" | head -n 1)
    safety_code2=$(echo "$llm_output2" | tail -n 1)
    safety_status2=$(echo "$llm_output2" | head -n 1)
    safety_code3=$(echo "$llm_output3" | tail -n 1)
    safety_status3=$(echo "$llm_output3" | head -n 1)
    # If any response is safe, it's likely the description is actually
    # safe.
    if [[ "$safety_status1" == "safe" ||
          "$safety_status2" == "safe" ||
          "$safety_status3" == "safe" ]]; then
        echo "safe"
    else
        # if all responses unsafe, then definitely unsafe. otherwise,
        # unclear response (return all responses).
        if [[ "$safety_status1" == "unsafe" &&
              "$safety_status2" == "unsafe" &&
              "$safety_status3" == "unsafe" ]]; then
            echo -e "unsafe\n${safety_code1}"
        else
            echo "${llm_output1}. ${llm_output2}. ${llm_output3}." | tr --delete '\n'
        fi
    fi
}
export -f get_llm_response
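# Each record of the input CSV is expected to look like (made-up
# values for illustration):
#   9gyjh3kabc,example.social,S12,"Instance description text here"
# i.e. id, host, safety code from the first scan, description.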
evaluate_instance_safety() {
    local id="$1"
    local host="$2"
    local original_code="$3"
    local description="$4"
    # For some reason, we can wind up with empty ids and hosts. Do
    # nothing with those.
    if [[ -z "$id" || -z "$host" ]]; then
        return 0
    fi
    # if host does not respond, ignore it.
    curl --max-time 3 -L -s -o /dev/null -XOPTIONS "https://$host"
    if [[ $? != 0 ]]; then
        return 0
    fi
    # Call aichat with the description and capture the output
    prompt="$DESCRIPTION_PROMPT $description"
    llm_output=$(get_llm_response "$prompt")
    safety_code=$(echo "$llm_output" | tail -n 1)
    safety_status=$(echo "$llm_output" | head -n 1)
    display_desc=$(echo "${description:0:80}" | tr --delete '\n')
    # we can ignore certain unsafe codes, because they're not related
    # to inappropriate content. S6 = specialized advice, S7 = privacy,
    # S8 = intellectual property
    if [[ "$safety_status" == "unsafe" ]]; then
        if [[ "$safety_code" == "S6" ||
              "$safety_code" == "S7" ||
              "$safety_code" == "S8" ]]; then
            safety_status="safe"
            safety_code=""
        fi
    fi
    # Output readable info
    echo "[$id] $host"
    if [[ $safety_status == "safe" ]]; then
        echo " ├Description: $display_desc"
        echo " └safe"
    elif [[ $safety_status == "unsafe" ]]; then
        write_desc=$(echo "${description}" | tr --delete '\n')
        echo " ├Description: $display_desc"
        echo " ├Code: $safety_code"
        echo " └unsafe"
        echo "$id,$host,$safety_code,\"$write_desc\"" >> "$UNSAFE"
    else
        echo " ├Description: $display_desc"
        echo " └unknown: $safety_status"
    fi
}
export -f evaluate_instance_safety
# Signal handler to kill child processes and exit the script
trap 'echo EXITING...; killall -HUP parallel; kill $(jobs -p); exit' INT TERM
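# Read the whole CSV into memory, drop blank lines, and hand each
# record to GNU parallel; --csv is what maps a record's columns onto
# the function's positional arguments.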
inputs=""
echo "" > $UNSAFE
while IFS= read -r line; do
printf -v inputs '%s\n' "$inputs" "$line"
done < "$CSV_FILE"
inputs=$(echo "$inputs" | sed '/^[[:blank:]]*$/ d')
parallel -P $MAX_REQUESTS --csv evaluate_instance_safety ::: "${inputs[@]}"