misskey-safety-scan/verify-scan

#!/bin/bash

# Verify Scan
#
# Double checks the output file one more time with the guard LLM.

###### MAIN CONFIG ######
# Positional arguments:
#   $1 - CSV file whose rows are re-checked
#   $2 - model name handed to `aichat -m`
#
# Everything in this section is exported: the worker functions are
# run by GNU parallel in child shells, and those only see what is in
# the environment.
export CSV_FILE="$1"
export LLM_MODEL="$2"
# Rows that are confirmed unsafe are appended to this file.
export UNSAFE="unsafe-verified.csv"
#########################

#### LLM INFORMATION ####
# Number of jobs parallel may run at once (passed to `parallel -P`).
export MAX_REQUESTS=4

# Prompt prefix placed in front of every description. `read -d ''`
# slurps the whole heredoc; it returns non-zero on reaching the end
# of input, which is expected here.
read -r -d '' DESCRIPTION_PROMPT <<'EOF'
Here is the description of a Mastodon instance.
Evaluate if it is safe or not:
EOF
# Without this export the prompt prefix is empty inside the child
# shells that run evaluate_instance_safety.
export DESCRIPTION_PROMPT
#########################

# Run inference multiple times to account for variance in LLM
# responses.
#
# Globals:
#   LLM_MODEL (read) - model name passed to `aichat -m`
# Arguments:
#   $1 - complete prompt, sent to aichat on stdin
# Outputs (stdout):
#   "safe"      if any run answers "safe"
#   "unsafe" followed, on the next line, by the code line of the
#               first run, if all three runs answer "unsafe"
#   otherwise   the three raw responses as "<r1>. <r2>. <r3>." with
#               every newline removed (no trailing newline)
get_llm_response() {
    local prompt="$1"
    local -a responses=()
    local run response verdict
    local unsafe_votes=0

    for run in 1 2 3; do
        # Only the first two lines of a response are used: the
        # status line and the code line.
        response=$(printf '%s\n' "$prompt" \
            | aichat -S -m "$LLM_MODEL" \
            | head -n 2)
        responses+=("$response")
        verdict=$(head -n 1 <<<"$response")

        # If any response is safe, it's likely the description is
        # actually safe, so the remaining runs cannot change the
        # result and are skipped.
        if [[ "$verdict" == "safe" ]]; then
            printf 'safe\n'
            return 0
        fi

        if [[ "$verdict" == "unsafe" ]]; then
            unsafe_votes=$((unsafe_votes + 1))
        fi
    done

    if ((unsafe_votes == 3)); then
        # All responses unsafe: definitely unsafe. printf with %s
        # keeps backslashes in the model output literal.
        printf 'unsafe\n%s\n' "$(tail -n 1 <<<"${responses[0]}")"
    else
        # Unclear: hand back everything the model said on one line.
        printf '%s. %s. %s.\n' "${responses[@]}" | tr --delete '\n'
    fi
}

export -f get_llm_response

# Re-check one CSV row with the guard LLM and report the verdict.
#
# Globals:
#   DESCRIPTION_PROMPT (read) - prompt prefix put before the description
#   UNSAFE             (read) - file that unsafe rows are appended to
# Arguments:
#   $1 - instance id
#   $2 - instance host
#   $3 - original code (accepted but not used; presumably the code
#        assigned by an earlier scan)
#   $4 - instance description
# Outputs:
#   A short readable report on stdout. For an "unsafe" verdict a row
#   `id,host,code,"description"` is also appended to $UNSAFE.
# Returns:
#   0 without output when the row is skipped (empty id or host, or
#   the host does not answer).
evaluate_instance_safety() {
    local id="$1"
    local host="$2"
    # shellcheck disable=SC2034 -- unused, kept to document the column layout
    local original_code="$3"
    local description="$4"
    local prompt llm_output safety_status safety_code
    local display_desc write_desc
    local dq='"'

    # For some reason, we can wind up with empty ids and hosts. Do
    # nothing with those.
    if [[ -z "$id" || -z "$host" ]]; then
        return 0
    fi

    # if host does not respond, ignore it.
    if ! curl --max-time 3 -L -s -o /dev/null -XOPTIONS "https://$host"; then
        return 0
    fi

    # Ask the guard model; first line of the answer is the status,
    # last line is the code.
    prompt="$DESCRIPTION_PROMPT $description"
    llm_output=$(get_llm_response "$prompt")
    safety_status=$(head -n 1 <<<"$llm_output")
    safety_code=$(tail -n 1 <<<"$llm_output")

    # First 80 characters of the description, flattened to one line.
    display_desc=${description:0:80}
    display_desc=${display_desc//$'\n'/}

    # we can ignore certain unsafe codes, because they're not related
    # to inappropriate content. S6 = specialized advice, S7 = privacy,
    # S8 = intellectual property
    if [[ "$safety_status" == "unsafe" ]]; then
        case "$safety_code" in
            S6 | S7 | S8)
                safety_status="safe"
                safety_code=""
                ;;
        esac
    fi

    # Output readable info
    printf '[%s] %s\n' "$id" "$host"
    printf ' ├Description: %s\n' "$display_desc"
    case "$safety_status" in
        safe)
            printf ' └safe\n'
            ;;
        unsafe)
            printf ' ├Code: %s\n' "$safety_code"
            printf ' └unsafe\n'
            # Flatten to one line and double embedded quotes so the
            # quoted CSV field stays well-formed.
            write_desc=${description//$'\n'/}
            write_desc=${write_desc//$dq/$dq$dq}
            printf '%s,%s,%s,"%s"\n' \
                "$id" "$host" "$safety_code" "$write_desc" >>"$UNSAFE"
            ;;
        *)
            printf ' └unknown: %s\n' "$safety_status"
            ;;
    esac
}

export -f evaluate_instance_safety

# Signal handler to kill child processes and exit the script
handle_interrupt() {
    local pids

    echo EXITING...
    # NOTE(review): killall matches by process name, so this also
    # signals `parallel` runs that were not started by this script.
    killall -HUP parallel
    # `kill` with no operands only prints a usage error, so skip it
    # when this shell has no jobs.
    pids=$(jobs -p)
    if [[ -n "$pids" ]]; then
        # shellcheck disable=SC2086 -- one PID per word is intended
        kill $pids
    fi
    exit 1
}
trap handle_interrupt INT TERM

# Refuse to run (and to clobber the previous results) without usable
# arguments.
if [[ -z "$CSV_FILE" || -z "$LLM_MODEL" ]]; then
    printf 'usage: %s CSV_FILE LLM_MODEL\n' "${0##*/}" >&2
    exit 2
fi
if [[ ! -r "$CSV_FILE" ]]; then
    printf '%s: cannot read %s\n' "${0##*/}" "$CSV_FILE" >&2
    exit 1
fi

# Start from an empty results file.
: >"$UNSAFE"

# Drop blank lines and stream the rows to parallel on stdin. Streaming
# keeps a final line without a trailing newline and avoids passing
# the whole file as a single command-line argument, which fails once
# the file outgrows the per-argument size limit. With --csv each
# column of a row becomes one argument of evaluate_instance_safety.
sed '/^[[:blank:]]*$/ d' <"$CSV_FILE" \
    | parallel -P "$MAX_REQUESTS" --csv evaluate_instance_safety