Check ZFS Cluster Health with UptimeKuma Alerts

Script to check the health of a ZFS cluster and send alerts to UptimeKuma.

#!/bin/bash

# Hosts to probe, one per line. Each entry is used as the SSH target
# (root@<entry>) and as the prefix of the message pushed to the monitor.
NODES=(
    node1
    node2
    node3
)

# Uptime Kuma push endpoint that every heartbeat below is sent to.
PUSH_URL="https://site/api/push/key"

#######################################
# Print the lines of a probe report that belong to one section.
# Arguments: $1 - full report text
#            $2 - section name (the text between the "===" pairs)
# Outputs:   lines after the "===<name>===" marker, up to the next line
#            containing "===" (or end of input)
#######################################
extract_section() {
    awk -v marker="===$2===" '
        index($0, marker) { inside = 1; next }
        /===/             { inside = 0 }
        inside
    ' <<<"$1"
}

#######################################
# Check a block of `systemctl is-active` output.
# Arguments: $1 - one unit state per line
# Returns:   0 only if every line is exactly "active"; non-zero otherwise
#            (including for empty input)
#######################################
services_all_active() {
    # -x matches whole lines: a plain substring match would accept
    # "inactive", because it contains "active".
    ! grep -qvxF -- 'active' <<<"$1"
}

#######################################
# Check a block of `zpool list -H -o name,health` output.
# Arguments: $1 - "<name><TAB><health>" per line
# Returns:   0 only if every line's health column is "ONLINE"; non-zero
#            otherwise (including for empty input)
#######################################
zfs_all_online() {
    # -H output is tab-separated; splitting on tabs only keeps pool names
    # that contain spaces from shifting the health column.
    awk -F'\t' '$2 != "ONLINE" { exit 1 }' <<<"$1"
}

#######################################
# Send one heartbeat to the push endpoint.
# Globals:   PUSH_URL (read)
# Arguments: $1 - status value ("up" or "down")
#            $2 - message text
# Outputs:   curl's response on stdout; a warning on stderr if the push fails
#######################################
push_status() {
    # --max-time keeps a hung endpoint from stalling the remaining nodes.
    curl -fsS --max-time 10 "${PUSH_URL}?status=$1&msg=$2" ||
        echo "Failed to push '$1' heartbeat for $2" >&2
}

# Probe every node over SSH, evaluate the three report sections, and push one
# heartbeat per node (down with the list of issues, or up).
for node in "${NODES[@]}"; do
    echo "Checking node: $node"
    issues=()

    # The quoted 'EOF' delimiter sends the probe script verbatim, so it is
    # expanded on the remote host rather than locally.
    output=$(ssh -o ConnectTimeout=5 root@"$node" bash <<'EOF'
echo "===SERVICE_STATUS==="
systemctl is-active pve-cluster corosync pvedaemon pveproxy pvestatd pve-ha-lrm pve-ha-crm

echo "===ZFS_STATUS==="
zpool list -H -o name,health

echo "===HA_MIGRATIONS==="
journalctl -u pve-ha-lrm --since "5 min ago" -o cat | grep -i migrate
EOF
)
    ssh_rc=$?

    # ssh exits with 255 for its own errors. Any other status comes from the
    # remote script, whose last command is grep and normally returns 1 when no
    # migration lines were found, so only 255 (or an empty report) is treated
    # as a failed probe.
    if [[ -z "$output" ]] || (( ssh_rc == 255 )); then
        echo "No usable output from $node (ssh exit $ssh_rc) — likely SSH failed"
        push_status down "$node:ssh-unreachable"
        echo "------"
        continue
    fi

    service_block=$(extract_section "$output" SERVICE_STATUS)
    zfs_block=$(extract_section "$output" ZFS_STATUS)
    migration_block=$(extract_section "$output" HA_MIGRATIONS)

    if ! services_all_active "$service_block"; then
        issues+=("service-issue")
    fi

    if ! zfs_all_online "$zfs_block"; then
        echo "ZFS issue(s) on $node:"
        echo "$zfs_block"
        issues+=("zfs-issue")
    fi

    # Any journal line mentioning a migration in the last 5 minutes counts.
    if [[ -n "$migration_block" ]]; then
        echo "Recent HA migration(s) on $node:"
        echo "$migration_block"
        issues+=("recent-migration")
    fi

    if (( ${#issues[@]} > 0 )); then
        # Join the issue tags with '+' inside a subshell so IFS is not
        # changed for the rest of the script.
        msg=$(IFS='+'; echo "${issues[*]}")
        echo "Issues on $node: ${issues[*]}"
        push_status down "$node:$msg"
    else
        echo "All checks passed on $node. Sending UP signal."
        push_status up "$node"
    fi

    echo "------"
done