#!/usr/bin/env bash
# K3s cluster health check — run from workstation.
# Checks:
#   - Cluster (via pve-control): nodes Ready, no failing pods, coredns healthy
#   - Per node: wg0 up, flannel.1 up, /etc/resolv.conf non-empty, host DNS resolves
#   - Mesh: each node can ping every other node's WG IP
#
# Exit status: 0 when every check passes, 1 when at least one check fails.

# No `-e` on purpose: individual checks are expected to fail and are reported
# through fail() instead of aborting the whole run.
set -uo pipefail

# SSH alias of the node on which kubectl is run for the cluster-level checks.
readonly CONTROL="pve-control"

# SSH aliases of every node that gets the per-node and mesh checks.
readonly -a NODES=(
  pve-control
  adder-control
  game-control
  pve-worker
  adder-worker
  game-worker-hdd
  game-worker-ssd
  fat_mama
)

# SSH alias → WG IP (see k3s/README.md). SSH alias `fat_mama`; k8s node name `fatmama`.
declare -rA WG_IP=(
  [pve-control]=10.0.0.6
  [pve-worker]=10.0.0.7
  [adder-control]=10.0.0.8
  [adder-worker]=10.0.0.9
  [game-control]=10.0.0.10
  [game-worker-hdd]=10.0.0.11
  [game-worker-ssd]=10.0.0.12
  [fat_mama]=10.0.0.13
)

# Overall exit status of the script; fail() sets it to 1.
RC=0

# Print a green "[OK]" status line; all arguments are joined into one message.
pass() {
  local msg="$*"
  printf ' \033[32m[OK]\033[0m %s\n' "$msg"
}
# Print a red "[FAIL]" status line (all arguments joined into one message) and
# record the failure in the global RC so the script exits non-zero.
fail() {
  local msg="$*"
  printf ' \033[31m[FAIL]\033[0m %s\n' "$msg"
  RC=1
}
# ssh invocation shared by every check. It is a plain string that call sites
# expand unquoted, so it word-splits into the command and its options.
# ConnectTimeout=5 bounds the wait on a dead host; BatchMode=yes makes ssh fail
# instead of prompting for a password or passphrase.
SSH="ssh -o ConnectTimeout=5 -o BatchMode=yes"

# --- Cluster-level checks -----------------------------------------------------
# kubectl runs on $CONTROL over ssh. Its raw output is filtered locally, so the
# exit status seen here is that of ssh/kubectl: a failed call is reported as a
# failure rather than being read as an empty (healthy-looking) result.

# Reads `kubectl get nodes --no-headers` output on stdin and prints
# name(STATUS) for every node whose STATUS column is not exactly "Ready".
_not_ready_nodes() {
  awk 'NF && $2 != "Ready" {print $1 "(" $2 ")"}'
}

# Reads `kubectl get pods -A --no-headers` output on stdin and prints
# namespace/name(STATUS) for every pod that is neither Running nor Completed.
_failing_pods() {
  awk 'NF && $4 != "Running" && $4 != "Completed" {print $1 "/" $2 "(" $4 ")"}'
}

# Succeeds when $1 has the form "<ready>/<desired>" with equal, non-zero counts.
# An empty ready count (e.g. "/1") or an empty string is rejected.
_replicas_ok() {
  local status=${1-}
  local ready=${status%%/*} desired=${status#*/}
  [[ "$status" == */* && "$ready" =~ ^[1-9][0-9]*$ && "$ready" == "$desired" ]]
}

echo "=== Cluster-level checks (via $CONTROL) ==="

# shellcheck disable=SC2086  # $SSH is split into command + options on purpose
if nodes_out=$($SSH "$CONTROL" "sudo kubectl get nodes --no-headers" 2>/dev/null); then
  not_ready=$(printf '%s\n' "$nodes_out" | _not_ready_nodes)
  if [[ -z "$not_ready" ]]; then
    pass "all nodes Ready"
  else
    fail "not Ready: $not_ready"
  fi
else
  fail "could not list nodes via $CONTROL"
fi

# shellcheck disable=SC2086  # $SSH is split into command + options on purpose
if pods_out=$($SSH "$CONTROL" "sudo kubectl get pods -A --no-headers" 2>/dev/null); then
  bad_pods=$(printf '%s\n' "$pods_out" | _failing_pods)
  if [[ -z "$bad_pods" ]]; then
    pass "no failing pods"
  else
    fail "bad pods: $bad_pods"
  fi
else
  fail "could not list pods via $CONTROL"
fi

# readyReplicas/replicas of the coredns deployment; stays empty when the call fails.
# shellcheck disable=SC2086  # $SSH is split into command + options on purpose
coredns=$($SSH "$CONTROL" "sudo kubectl get deploy coredns -n kube-system -o=jsonpath='{.status.readyReplicas}/{.status.replicas}'" 2>/dev/null)
if _replicas_ok "$coredns"; then
  pass "coredns $coredns"
else
  fail "coredns ${coredns:-unavailable}"
fi

# --- Per-node checks ----------------------------------------------------------

# Runs remote command $2 on host $1 over ssh; reports $3 through pass() when it
# exits 0 and $4 through fail() otherwise.
_remote_check() {
  local host=$1 remote_cmd=$2 ok_msg=$3 fail_msg=$4
  # shellcheck disable=SC2086  # $SSH is split into command + options on purpose
  if $SSH "$host" "$remote_cmd" 2>/dev/null; then
    pass "$ok_msg"
  else
    fail "$fail_msg"
  fi
}

# Runs every per-node check against SSH alias $1. A node that cannot be reached
# over ssh gets a single failure and its remaining checks are skipped.
_check_node() {
  local node=$1
  echo "-- $node --"
  # shellcheck disable=SC2086  # $SSH is split into command + options on purpose
  if ! $SSH "$node" true 2>/dev/null; then
    fail "ssh unreachable"
    return
  fi
  # `grep -q ..` succeeds only when `ip` printed something for the interface;
  # the link state itself is not inspected.
  _remote_check "$node" "ip -br a show wg0 2>/dev/null | grep -q .." \
    "wg0 up" "wg0 missing"
  _remote_check "$node" "ip -br a show flannel.1 2>/dev/null | grep -q .." \
    "flannel.1 up" "flannel.1 missing"
  # -s: file exists and has a size greater than zero.
  _remote_check "$node" "[ -s /etc/resolv.conf ]" \
    "resolv.conf non-empty" "resolv.conf empty"
  _remote_check "$node" "getent hosts registry-1.docker.io >/dev/null 2>&1" \
    "host DNS resolves" "host DNS broken"
}

echo
echo "=== Per-node checks ==="
for node in "${NODES[@]}"; do
  _check_node "$node"
done

# --- Mesh reachability --------------------------------------------------------

# From SSH alias $1, sends one ping (2 s wait) to the WG IP of every other entry
# in NODES and reports the result through pass()/fail().
# ssh exits with 255 when ssh itself fails; in that case the source node is
# reported as unreachable once, instead of blaming every peer.
_mesh_check() {
  local src=$1 dst ip rc unreachable=""
  for dst in "${NODES[@]}"; do
    [[ "$src" == "$dst" ]] && continue
    # `-` default keeps a missing map entry from aborting the script under `set -u`.
    ip=${WG_IP[$dst]-}
    if [[ -z "$ip" ]]; then
      unreachable+=" ${dst}(no WG IP configured)"
      continue
    fi
    rc=0
    # shellcheck disable=SC2086  # $SSH is split into command + options on purpose
    $SSH "$src" "ping -c1 -W2 $ip >/dev/null 2>&1" 2>/dev/null || rc=$?
    if ((rc == 255)); then
      fail "$src ssh unreachable, mesh not tested"
      return
    fi
    ((rc == 0)) || unreachable+=" ${dst}(${ip})"
  done
  if [[ -z "$unreachable" ]]; then
    pass "$src → all peers"
  else
    fail "$src unreachable:$unreachable"
  fi
}

echo
echo "=== Mesh reachability (each node → every peer's WG IP) ==="
for src in "${NODES[@]}"; do
  _mesh_check "$src"
done

# Final verdict: RC is 0 unless fail() was called at least once; the script's
# exit status mirrors it.
echo
if [[ "$RC" -eq 0 ]]; then
  echo "ALL CHECKS PASSED"
else
  echo "FAILURES DETECTED"
fi
exit "$RC"