#!/bin/bash
# Script to check a Ceph cluster's health and send alerts to Uptime Kuma.
#
# For every host in NODES the script logs in over SSH as root and collects:
#   * the `systemctl is-active` state of the pve-*/corosync services,
#   * OSDs listed under that host in `ceph osd tree` whose 5th column is not
#     "up" or whose 6th column is "0.00000" (STATUS / REWEIGHT in the usual
#     layout; assumes the CLASS column is populated -- TODO confirm),
#   * pve-ha-lrm journal lines from the last 5 minutes that mention "migrate".
# Exactly one push is sent per node: status=up when nothing was found,
# otherwise status=down with the issue names joined by '+'.

NODES=("node1" "node2" "node3")
PUSH_URL="https://site/api/push/key"

# Print the lines of text $1 that follow the marker line "===$2===", stopping
# at the next line that contains "===".
extract_section() {
  local text=$1 name=$2
  awk -v marker="===${name}===" '
    index($0, marker) { flag = 1; next }
    /===/             { flag = 0 }
    flag
  ' <<<"$text"
}

# Return 0 when the `systemctl is-active` output in $1 has at least one line
# that is not exactly "active". The match is whole-line (-x) because a
# substring match would accept "inactive". Empty input counts as an issue.
has_service_issue() {
  grep -qvx 'active' <<<"$1"
}

# Send one push to the monitor. $1 = status (up|down), $2 = message.
# A failed or slow push is reported on stderr but never aborts the run.
push_status() {
  local status=$1 msg=$2
  curl -fsS --max-time 10 "${PUSH_URL}?status=${status}&msg=${msg}" \
    || echo "warning: push (status=${status}, msg=${msg}) failed" >&2
}

# Run the remote checks on node $1 and print the raw, marker-delimited report.
# The exit status is that of ssh (255 when ssh itself failed).
collect_node() {
  local node=$1
  ssh -o ConnectTimeout=5 root@"$node" bash <<'EOF'
echo "===SERVICE_STATUS==="
systemctl is-active pve-cluster corosync pvedaemon pveproxy pvestatd pve-ha-lrm pve-ha-crm
echo "===OSD_STATUS==="
# Scan only the OSD rows under this host: start at the bucket row whose
# TYPE/NAME fields are exactly "host <hostname>", and stop at the next bucket
# row (buckets have negative IDs) so other hosts are not included.
ceph osd tree | awk -v host="$(hostname)" '
NF >= 2 && $(NF-1) == "host" && $NF == host { in_block = 1; next }
in_block && $1 ~ /^-[0-9]+$/ { in_block = 0; next }
in_block && NF == 0 { exit }
in_block && $1 ~ /^[0-9]+$/ && ($5 != "up" || $6 == "0.00000") { print }
'
echo "===HA_MIGRATIONS==="
journalctl -u pve-ha-lrm --since "5 min ago" -o cat | grep -i migrate
EOF
}

# Check a single node ($1) and push its up/down state.
check_node() {
  local node=$1
  local output ssh_rc service_block osd_block migration_block msg
  local issues=()

  echo "Checking node: $node"

  output=$(collect_node "$node")
  ssh_rc=$?

  # ssh reserves exit status 255 for its own errors; the remote script ends
  # with grep, so it cannot produce that value itself. Empty output is kept
  # as a second indicator (e.g. ssh binary missing).
  if ((ssh_rc == 255)) || [[ -z "$output" ]]; then
    echo "No output from $node — likely SSH failed or output was suppressed"
    push_status down "$node:ssh-unreachable"
    echo "------"
    return
  fi

  # Parse the output
  service_block=$(extract_section "$output" SERVICE_STATUS)
  osd_block=$(extract_section "$output" OSD_STATUS)
  migration_block=$(extract_section "$output" HA_MIGRATIONS)

  if has_service_issue "$service_block"; then
    issues+=("service-issue")
  fi

  if [[ -n "$osd_block" ]]; then
    echo "OSD issue(s) on $node:"
    printf '%s\n' "$osd_block"
    issues+=("osd-issue")
  fi

  if [[ -n "$migration_block" ]]; then
    echo "Recent HA migration(s) on $node:"
    printf '%s\n' "$migration_block"
    issues+=("recent-migration")
  fi

  if ((${#issues[@]} > 0)); then
    # Join the issue names with '+' for the push message.
    msg=$(IFS='+'; echo "${issues[*]}")
    echo "Issues on $node: ${issues[*]}"
    push_status down "$node:$msg"
  else
    echo "All checks passed on $node. Sending UP signal."
    push_status up "$node"
  fi
  echo "------"
}

for node in "${NODES[@]}"; do
  check_node "$node"
done