#!/usr/bin/env bash
# K3s cluster health check — run from workstation.
# Checks:
#   - Cluster (via pve-control): nodes Ready, no failing pods, coredns healthy
#   - Per node: wg0 up, flannel.1 up, /etc/resolv.conf non-empty, host DNS resolves
#   - Mesh: each node can ping every other node's WG IP
#
# Exit status: 0 when every check passed, 1 when any check failed,
# 2 when the node tables in this script are inconsistent.

# No `-e` on purpose: a failing check must be recorded and the run must go on.
set -uo pipefail

readonly CONTROL="pve-control"
readonly -a NODES=(
  pve-control adder-control game-control
  pve-worker adder-worker
  game-worker-hdd game-worker-ssd
  fat_mama
)

# SSH alias → WG IP (see k3s/README.md). SSH alias `fat_mama`; k8s node name `fatmama`.
declare -rA WG_IP=(
  [pve-control]=10.0.0.6
  [pve-worker]=10.0.0.7
  [adder-control]=10.0.0.8
  [adder-worker]=10.0.0.9
  [game-control]=10.0.0.10
  [game-worker-hdd]=10.0.0.11
  [game-worker-ssd]=10.0.0.12
  [fat_mama]=10.0.0.13
)

# ssh invocation kept as an array so the options survive as separate words.
readonly -a SSH=(ssh -o ConnectTimeout=5 -o BatchMode=yes)

# Overall result; flipped to 1 by fail().
RC=0

# Nodes whose ssh probe failed in the per-node phase (key = SSH alias).
declare -A SSH_DOWN=()

# Print a green [OK] line.
pass() { printf ' \033[32m[OK]\033[0m %s\n' "$*"; }

# Print a red [FAIL] line and mark the whole run as failed.
fail() { printf ' \033[31m[FAIL]\033[0m %s\n' "$*"; RC=1; }

#######################################
# Run a command string on a host over ssh.
# Arguments: $1 - SSH alias, $2 - command string for the remote shell
# Outputs:   remote stdout; stderr (ssh's and the remote's) is discarded
# Returns:   exit status of ssh
#######################################
remote() {
  local host=$1 cmd=$2
  "${SSH[@]}" "$host" "$cmd" 2>/dev/null
}

#######################################
# Run one remote probe and report it through pass/fail.
# Arguments: $1 - SSH alias, $2 - remote command,
#            $3 - message on success, $4 - message on failure
#######################################
node_check() {
  local host=$1 cmd=$2 ok_msg=$3 fail_msg=$4
  if remote "$host" "$cmd"; then
    pass "$ok_msg"
  else
    fail "$fail_msg"
  fi
}

#######################################
# Cluster-level checks, executed through kubectl on $CONTROL.
# kubectl output is filtered locally so that an ssh or kubectl failure is
# seen as a non-zero status instead of as "no offending rows".
# Globals: CONTROL (read), RC (written via fail)
#######################################
check_cluster() {
  local nodes_out not_ready pods_out bad_pods coredns

  echo "=== Cluster-level checks (via $CONTROL) ==="

  if ! nodes_out=$(remote "$CONTROL" "sudo kubectl get nodes --no-headers"); then
    fail "cannot list nodes (ssh or kubectl failed)"
  elif [[ -z "$nodes_out" ]]; then
    fail "kubectl reported no nodes"
  else
    not_ready=$(awk 'NF && $2 != "Ready" {print $1"("$2")"}' <<<"$nodes_out")
    if [[ -z "$not_ready" ]]; then
      pass "all nodes Ready"
    else
      fail "not Ready: $not_ready"
    fi
  fi

  if ! pods_out=$(remote "$CONTROL" "sudo kubectl get pods -A --no-headers"); then
    fail "cannot list pods (ssh or kubectl failed)"
  else
    # NF guard: the here-string yields one empty line when there are no pods.
    bad_pods=$(awk 'NF && $4 != "Running" && $4 != "Completed" {print $1"/"$2"("$4")"}' \
      <<<"$pods_out")
    if [[ -z "$bad_pods" ]]; then
      pass "no failing pods"
    else
      fail "bad pods: $bad_pods"
    fi
  fi

  coredns=$(remote "$CONTROL" \
    "sudo kubectl get deploy coredns -n kube-system -o=jsonpath='{.status.readyReplicas}/{.status.replicas}'")
  # Healthy means "ready == desired, and at least one"; an empty or partial
  # value such as "/1" (no ready replicas) does not match the pattern.
  if [[ "$coredns" =~ ^([0-9]+)/([0-9]+)$ \
        && "${BASH_REMATCH[1]}" == "${BASH_REMATCH[2]}" \
        && "${BASH_REMATCH[1]}" != 0 ]]; then
    pass "coredns $coredns"
  else
    fail "coredns ${coredns:-status unavailable}"
  fi
}

#######################################
# Per-node checks over ssh. Nodes that cannot be reached are remembered in
# SSH_DOWN so the mesh phase does not try them again.
# Globals: NODES (read), SSH_DOWN (written), RC (written via fail)
#######################################
check_nodes() {
  local node

  echo "=== Per-node checks ==="
  for node in "${NODES[@]}"; do
    echo "-- $node --"
    if ! remote "$node" true; then
      fail "ssh unreachable"
      SSH_DOWN[$node]=1
      continue
    fi
    # `grep -q ..` succeeds when `ip` printed at least two characters.
    # NOTE(review): this looks like an existence test only; the link state
    # is not inspected despite the "up" wording — confirm that is intended.
    node_check "$node" "ip -br a show wg0 2>/dev/null | grep -q .." \
      "wg0 up" "wg0 missing"
    node_check "$node" "ip -br a show flannel.1 2>/dev/null | grep -q .." \
      "flannel.1 up" "flannel.1 missing"
    node_check "$node" "[ -s /etc/resolv.conf ]" \
      "resolv.conf non-empty" "resolv.conf empty"
    node_check "$node" "getent hosts registry-1.docker.io >/dev/null 2>&1" \
      "host DNS resolves" "host DNS broken"
  done
}

#######################################
# Mesh reachability: from every node, ping each other node's WG IP once.
# Sources already known to be unreachable over ssh are reported as such
# instead of listing every peer as unreachable.
# Globals: NODES, WG_IP, SSH_DOWN (read), RC (written via fail)
#######################################
check_mesh() {
  local src dst unreachable

  echo "=== Mesh reachability (each node → every peer's WG IP) ==="
  for src in "${NODES[@]}"; do
    if [[ -n "${SSH_DOWN[$src]:-}" ]]; then
      fail "$src skipped: ssh unreachable"
      continue
    fi
    unreachable=""
    for dst in "${NODES[@]}"; do
      [[ "$src" == "$dst" ]] && continue
      if ! remote "$src" "ping -c1 -W2 ${WG_IP[$dst]} >/dev/null 2>&1"; then
        unreachable+=" ${dst}(${WG_IP[$dst]})"
      fi
    done
    if [[ -z "$unreachable" ]]; then
      pass "$src → all peers"
    else
      fail "$src unreachable:$unreachable"
    fi
  done
}

main() {
  local node

  # Catch a node added to NODES without a WG_IP entry before any check runs;
  # under `set -u` the lookup would otherwise abort the script mid-run.
  for node in "${NODES[@]}"; do
    if [[ -z "${WG_IP[$node]:-}" ]]; then
      printf 'error: no WG IP configured for node %s\n' "$node" >&2
      exit 2
    fi
  done

  check_cluster
  echo
  check_nodes
  echo
  check_mesh
  echo

  if (( RC == 0 )); then
    echo "ALL CHECKS PASSED"
  else
    echo "FAILURES DETECTED"
  fi
  exit "$RC"
}

main "$@"