homelab/k3s/scripts/check-health.sh
2026-04-18 18:28:55 -04:00

72 lines
2.9 KiB
Bash
Executable file

#!/usr/bin/env bash
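# K3s cluster health check — run from a workstation.
# Requires bash 4+ (associative arrays) and non-interactive (key-based) SSH
# access to every node alias listed in NODES.
# Checks:
# - Cluster (via pve-control): nodes Ready, no failing pods, coredns healthy
# - Per node: wg0 up, flannel.1 up, /etc/resolv.conf non-empty, host DNS resolves
# - Mesh: each node can ping every other node's WG IP
# Exit status: 0 if every check passed, 1 if any check failed.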
# Unset variables are errors and a pipeline fails if any stage fails.
# errexit (-e) is not enabled: a failing check is recorded by fail() and the
# script moves on to the next check.
set -u -o pipefail

# SSH alias of the node the cluster-level kubectl checks are run on.
CONTROL="pve-control"

# SSH aliases of every node that gets the per-node and mesh checks.
NODES=(
  pve-control
  adder-control
  game-control
  pve-worker
  adder-worker
  game-worker-hdd
  game-worker-ssd
  fat_mama
)

# SSH alias → WG IP (see k3s/README.md). SSH alias `fat_mama`; k8s node name `fatmama`.
declare -A WG_IP
WG_IP[pve-control]=10.0.0.6
WG_IP[pve-worker]=10.0.0.7
WG_IP[adder-control]=10.0.0.8
WG_IP[adder-worker]=10.0.0.9
WG_IP[game-control]=10.0.0.10
WG_IP[game-worker-hdd]=10.0.0.11
WG_IP[game-worker-ssd]=10.0.0.12
WG_IP[fat_mama]=10.0.0.13

# Overall exit status of the script; fail() sets it to 1.
RC=0
# Print a green "[OK]" marker followed by all arguments joined into one message.
pass() {
  local msg="$*"
  printf ' \033[32m[OK]\033[0m %s\n' "$msg"
}
# Print a red "[FAIL]" marker followed by all arguments joined into one
# message, and mark the whole run as failed by setting the global RC to 1.
fail() {
  local msg="$*"
  printf ' \033[31m[FAIL]\033[0m %s\n' "$msg"
  RC=1
}
# ssh command prefix used by every check. It is a plain string that callers
# expand unquoted (`$SSH host 'remote command'`) so it word-splits into the
# command plus its options.
#   ConnectTimeout=5  give up on a connection attempt after 5 seconds
#   BatchMode=yes     never prompt for a password/passphrase; fail instead
SSH='ssh -o ConnectTimeout=5 -o BatchMode=yes'
echo "=== Cluster-level checks (via $CONTROL) ==="

# kubectl runs on the control node, but its output is filtered locally. That
# way the ssh exit status is kubectl's own status (or 255 for an ssh error),
# so an unreachable control node or a failing kubectl call is reported as a
# failure instead of producing empty output that looks like "nothing wrong".

# Read `kubectl get nodes --no-headers` rows on stdin and print "name(status)"
# for every row whose STATUS column (field 2) is not exactly "Ready".
_not_ready_nodes() {
  awk 'NF && $2 != "Ready" { print $1 "(" $2 ")" }'
}

# Read `kubectl get pods -A --no-headers` rows on stdin and print
# "namespace/name(status)" for every row whose STATUS column (field 4) is
# neither "Running" nor "Completed".
_bad_pods() {
  awk 'NF && $4 != "Running" && $4 != "Completed" { print $1 "/" $2 "(" $4 ")" }'
}

# Succeed when $1 looks like "N/N" with N >= 1, i.e. the number of ready
# replicas equals the desired number and is non-zero.
_coredns_healthy() {
  local re='^([1-9][0-9]*)/([1-9][0-9]*)$'
  [[ "$1" =~ $re ]] || return 1
  [[ "${BASH_REMATCH[1]}" == "${BASH_REMATCH[2]}" ]]
}

# Nodes: every node must report STATUS "Ready".
if nodes_raw=$($SSH "$CONTROL" "sudo kubectl get nodes --no-headers" 2>/dev/null); then
  not_ready=$(_not_ready_nodes <<<"$nodes_raw")
  if [[ -z "$not_ready" ]]; then
    pass "all nodes Ready"
  else
    fail "not Ready: $not_ready"
  fi
else
  fail "could not list nodes via $CONTROL (ssh or kubectl failed)"
fi

# Pods: anything that is not Running or Completed counts as failing.
if pods_raw=$($SSH "$CONTROL" "sudo kubectl get pods -A --no-headers" 2>/dev/null); then
  bad_pods=$(_bad_pods <<<"$pods_raw")
  if [[ -z "$bad_pods" ]]; then
    pass "no failing pods"
  else
    fail "bad pods: $bad_pods"
  fi
else
  fail "could not list pods via $CONTROL (ssh or kubectl failed)"
fi

# CoreDNS: compare ready vs. desired replicas of the kube-system deployment.
# An empty result (ssh/kubectl failure) or a missing readyReplicas field
# ("/1") does not match N/N and is reported as a failure.
coredns=$($SSH "$CONTROL" "sudo kubectl get deploy coredns -n kube-system -o=jsonpath='{.status.readyReplicas}/{.status.replicas}'" 2>/dev/null)
if _coredns_healthy "$coredns"; then
  pass "coredns $coredns"
else
  fail "coredns ${coredns:-status unavailable}"
fi
echo
echo "=== Per-node checks ==="

# Run remote command $2 on node $1 over ssh and record the outcome:
# pass "$3" when it exits 0, fail "$4" otherwise (ssh errors included).
# ssh's own stderr is discarded so only the [OK]/[FAIL] line is shown.
# An explicit if/else is used so that a non-zero return from pass can never
# fall through to fail as well.
_node_check() {
  local node=$1 remote_cmd=$2 ok_msg=$3 fail_msg=$4
  # $SSH is intentionally unquoted: it holds the ssh command plus its options.
  if $SSH "$node" "$remote_cmd" 2>/dev/null; then
    pass "$ok_msg"
  else
    fail "$fail_msg"
  fi
}

for node in "${NODES[@]}"; do
  echo "-- $node --"

  # Probe reachability once; without ssh none of the checks below can run.
  if ! $SSH "$node" true 2>/dev/null; then
    fail "ssh unreachable"
    continue
  fi

  # `grep -q ..` succeeds only if `ip` printed at least two characters,
  # i.e. the interface is present on the node.
  _node_check "$node" "ip -br a show wg0 2>/dev/null | grep -q .." \
    "wg0 up" "wg0 missing"
  _node_check "$node" "ip -br a show flannel.1 2>/dev/null | grep -q .." \
    "flannel.1 up" "flannel.1 missing"

  # -s: file exists and has a size greater than zero.
  _node_check "$node" "[ -s /etc/resolv.conf ]" \
    "resolv.conf non-empty" "resolv.conf empty"

  # Resolve a well-known external name through the node's own resolver.
  _node_check "$node" "getent hosts registry-1.docker.io >/dev/null 2>&1" \
    "host DNS resolves" "host DNS broken"
done
echo
echo "=== Mesh reachability (each node → every peer's WG IP) ==="

# From node $1, ping the WG IP of every other node in NODES (one ping each,
# via ssh). Peers that did not answer are collected in the global
# `unreachable` as " name(ip)" items.
# Returns 255 as soon as ssh itself fails (ssh uses exit status 255 for its
# own errors), so an unreachable source node is not mistaken for a mesh
# problem and does not cost one connect timeout per peer. Returns 0 otherwise.
_probe_peers() {
  local src=$1 dst rc
  unreachable=""
  for dst in "${NODES[@]}"; do
    [[ "$src" == "$dst" ]] && continue
    # $SSH is intentionally unquoted: it holds the ssh command plus its options.
    $SSH "$src" "ping -c1 -W2 ${WG_IP[$dst]} >/dev/null 2>&1" 2>/dev/null
    rc=$?
    if ((rc == 255)); then
      return 255
    elif ((rc != 0)); then
      unreachable="$unreachable ${dst}(${WG_IP[$dst]})"
    fi
  done
  return 0
}

for src in "${NODES[@]}"; do
  if _probe_peers "$src"; then
    if [[ -z "$unreachable" ]]; then
      pass "$src → all peers"
    else
      fail "$src unreachable:$unreachable"
    fi
  else
    fail "$src ssh unreachable (mesh not tested)"
  fi
done
echo
# Final verdict: RC is still 0 only if no check called fail().
if ((RC == 0)); then
  echo "ALL CHECKS PASSED"
else
  echo "FAILURES DETECTED"
fi
exit "$RC"