Add homelab internal health checker
Python checker runs on pve-control via systemd timer every 10 min, publishes issues to NATS subject homelab_health_issue. Checks NATS, Postgres, MariaDB, Ghost blogs, DB dependents, standalone services, and every NodePort. Silent when healthy. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
6364f70799
commit
58bfd422d4
7 changed files with 704 additions and 0 deletions
5
.opencode/opencode.json
Normal file
5
.opencode/opencode.json
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
{
|
||||||
|
"plugin": [
|
||||||
|
"web"
|
||||||
|
]
|
||||||
|
}
|
||||||
308
k3s/health/checker.py
Normal file
308
k3s/health/checker.py
Normal file
|
|
@ -0,0 +1,308 @@
|
||||||
|
"""Homelab internal health checker.
|
||||||
|
|
||||||
|
Runs on pve-control every 10 minutes via systemd timer. Publishes issue
|
||||||
|
events to NATS subject `homelab_health_issue`. Silent when healthy.
|
||||||
|
See home_lab_health.md for design.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import socket
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional, Sequence
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
log = logging.getLogger("homelab-health")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- payload ----------
|
||||||
|
|
||||||
|
@dataclass
class Issue:
    """A single detected health problem.

    The field names are the keys of the JSON document produced from
    ``to_dict()``; ``root_cause`` is optional and defaults to ``None``.
    """

    component_name: str
    issue_detail: str
    detected_at: str
    root_cause: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the fields as a dict, omitting ``root_cause`` when it is None."""
        return {
            key: value
            for key, value in asdict(self).items()
            if not (key == "root_cause" and value is None)
        }
|
||||||
|
|
||||||
|
|
||||||
|
def now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string with whole seconds."""
    stamp = datetime.now(timezone.utc)
    # Drop sub-second precision so the string carries no fractional part.
    return stamp.replace(microsecond=0).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- kubectl helpers ----------
|
||||||
|
|
||||||
|
def kubectl_json(args: Sequence[str], timeout: int = 15) -> dict:
    """Run ``kubectl <args> -o json`` and return the parsed JSON output.

    Raises:
        RuntimeError: kubectl exited non-zero; the message carries its stderr.

    ``subprocess.TimeoutExpired`` (from ``timeout``) and JSON decoding errors
    are not caught here and propagate to the caller.
    """
    proc = subprocess.run(
        ["kubectl", *args, "-o", "json"],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if proc.returncode == 0:
        return json.loads(proc.stdout)
    raise RuntimeError(f"kubectl {' '.join(args)}: {proc.stderr.strip()}")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExecResult:
    """Outcome of a ``kubectl_exec`` call: success flag plus captured output."""

    # True only when the exec'd command exited with status 0.
    ok: bool
    stdout: str
    # Command stderr, or a short explanation when lookup/exec itself failed.
    stderr: str
|
||||||
|
|
||||||
|
|
||||||
|
def kubectl_exec(namespace: str, pod_label: str, command: Sequence[str],
                 timeout: int = 10) -> ExecResult:
    """Run a command inside the first pod matching pod_label.

    The pod name is looked up first (fixed 5 s timeout), then ``command`` is
    run in it via ``kubectl exec`` with the given ``timeout``. A lookup
    timeout, a failed or empty lookup, and an exec timeout are all reported
    as ``ExecResult(ok=False, ...)`` with the reason in ``stderr``.
    """
    lookup_cmd = [
        "kubectl", "get", "pod", "-n", namespace, "-l", pod_label,
        "-o", "jsonpath={.items[0].metadata.name}",
    ]
    try:
        lookup = subprocess.run(lookup_cmd, capture_output=True, text=True, timeout=5)
    except subprocess.TimeoutExpired:
        return ExecResult(False, "", "pod lookup timed out")

    pod = lookup.stdout.strip()
    if lookup.returncode != 0 or not pod:
        return ExecResult(False, "", f"no pod matched {pod_label} in {namespace}")

    exec_cmd = ["kubectl", "exec", "-n", namespace, pod, "--", *command]
    try:
        done = subprocess.run(exec_cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return ExecResult(False, "", f"kubectl exec timed out after {timeout}s")
    return ExecResult(done.returncode == 0, done.stdout, done.stderr.strip())
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_nodeport(namespace: str, service: str) -> Optional[int]:
    """Return the first NodePort on the given service, or None.

    Errors from ``kubectl_json`` are not caught here.
    """
    manifest = kubectl_json(["get", "svc", "-n", namespace, service])
    port_specs = manifest.get("spec", {}).get("ports", [])
    # First port entry that carries a nodePort wins; None when there is none.
    return next(
        (int(spec["nodePort"]) for spec in port_specs if "nodePort" in spec),
        None,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- probes ----------
|
||||||
|
|
||||||
|
def probe_service(svc: dict) -> list[Issue]:
    """HTTP-probe one configured service through its NodePort on localhost.

    Reads ``name``, ``namespace``, ``probe_path`` and ``expected`` from
    ``svc``; an entry with a truthy ``disabled`` is skipped. Returns an empty
    list when the response status is in ``expected``, otherwise a one-element
    list describing the failure (kubectl error, no NodePort, request error,
    or unexpected status). Redirects are not followed.
    """
    if svc.get("disabled"):
        return []

    name = svc["name"]

    def failure(detail: str) -> list[Issue]:
        # Every failure path yields exactly one Issue stamped with "now".
        return [Issue(name, detail, now_iso())]

    try:
        nodeport = resolve_nodeport(svc["namespace"], name)
    except Exception as e:
        return failure(f"kubectl failed: {e}")
    if nodeport is None:
        return failure("no NodePort exposed")

    url = f"http://localhost:{nodeport}{svc['probe_path']}"
    try:
        resp = requests.get(url, timeout=10, allow_redirects=False)
    except requests.RequestException as e:
        return failure(f"probe error at {url}: {e}")

    if resp.status_code not in svc["expected"]:
        return failure(
            f"HTTP {resp.status_code} at {url} (expected {svc['expected']})"
        )
    return []
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- check functions ----------
|
||||||
|
|
||||||
|
def check_nats(cfg: dict) -> list[Issue]:
    """GET ``/healthz`` on the NATS monitoring NodePort via localhost.

    The port comes from ``cfg["nats"]["monitoring_nodeport"]``. Returns one
    ``nats`` issue when the request fails or the status is not 200, and an
    empty list otherwise.
    """
    monitoring_port = cfg["nats"]["monitoring_nodeport"]
    url = f"http://localhost:{monitoring_port}/healthz"
    try:
        resp = requests.get(url, timeout=5)
    except requests.RequestException as e:
        return [Issue("nats", f"monitoring unreachable: {e}", now_iso())]
    if resp.status_code == 200:
        return []
    return [Issue("nats", f"/healthz returned {resp.status_code}", now_iso())]
|
||||||
|
|
||||||
|
|
||||||
|
def check_databases(cfg: dict) -> list[Issue]:
    """Run each configured database's probe command inside its pod.

    Iterates ``cfg["databases"]`` (missing key means no databases) and runs
    ``probe_cmd`` through ``kubectl_exec``. One issue, named after the
    database entry, is produced for each probe that does not succeed.
    """
    found = []
    for entry in cfg.get("databases", []):
        outcome = kubectl_exec(entry["namespace"], entry["pod_label"], entry["probe_cmd"])
        if outcome.ok:
            continue
        found.append(Issue(
            entry["name"],
            f"liveness probe failed: {outcome.stderr or '(no stderr)'}",
            now_iso(),
        ))
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_probe(services, pred) -> list[Issue]:
|
||||||
|
out = []
|
||||||
|
for s in services:
|
||||||
|
if pred(s):
|
||||||
|
out.extend(probe_service(s))
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def check_ghost_blogs(cfg):
    """Probe every configured service whose name starts with ``ghost``."""
    def is_ghost(svc):
        return svc["name"].startswith("ghost")

    return _filter_probe(cfg.get("services", []), is_ghost)
|
||||||
|
|
||||||
|
|
||||||
|
def check_mariadb_dependents(cfg):
    """Probe services with ``db == "mariadb"`` whose name does not start with ``ghost``."""
    def is_non_ghost_mariadb(svc):
        if svc["db"] != "mariadb":
            return False
        return not svc["name"].startswith("ghost")

    return _filter_probe(cfg.get("services", []), is_non_ghost_mariadb)
|
||||||
|
|
||||||
|
|
||||||
|
def check_postgres_dependents(cfg):
    """Probe every configured service with ``db == "postgres"``."""
    def uses_postgres(svc):
        return svc["db"] == "postgres"

    return _filter_probe(cfg.get("services", []), uses_postgres)
|
||||||
|
|
||||||
|
|
||||||
|
def check_standalone_services(cfg):
    """Probe every configured service whose ``db`` is None (JSON ``null``)."""
    def has_no_db(svc):
        return svc["db"] is None

    return _filter_probe(cfg.get("services", []), has_no_db)
|
||||||
|
|
||||||
|
|
||||||
|
def check_all_nodeports(_cfg) -> list[Issue]:
    """TCP connect to every TCP NodePort in the cluster.

    Lists all services with ``kubectl get svc -A`` and, for each port entry
    that has a ``nodePort``, attempts a TCP connection to 127.0.0.1 on that
    port with a 3 s timeout. Each port that does not accept the connection
    yields one issue named ``<namespace>/<service>``.

    Port entries whose ``protocol`` is not TCP (UDP, SCTP) are skipped: a TCP
    connect cannot tell whether such a listener is up, so probing them would
    raise a false alarm on every run.

    ``_cfg`` is unused; it is accepted so the function matches the other
    checks' signature. Errors from ``kubectl_json`` are not caught here.
    """
    svcs = kubectl_json(["get", "svc", "-A"])
    issues = []
    for item in svcs.get("items", []):
        meta = item.get("metadata", {})
        name = f"{meta.get('namespace')}/{meta.get('name')}"
        for port in item.get("spec", {}).get("ports", []):
            np = port.get("nodePort")
            if np is None:
                continue
            # Kubernetes defaults a port's protocol to TCP when it is omitted.
            if port.get("protocol", "TCP") != "TCP":
                continue
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(3)
                # connect_ex returns an errno value instead of raising.
                rc = s.connect_ex(("127.0.0.1", int(np)))
            if rc != 0:
                issues.append(Issue(
                    name,
                    f"NodePort {np} not accepting TCP (errno {rc})",
                    now_iso(),
                ))
    return issues
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- orchestration ----------
|
||||||
|
|
||||||
|
# Check functions in execution order; run_all_checks calls each with the
# loaded config and emits the collected issues in this same order.
CHECKS = [
    check_nats,
    check_databases,
    check_ghost_blogs,
    check_mariadb_dependents,
    check_postgres_dependents,
    check_standalone_services,
    check_all_nodeports,
]
|
||||||
|
|
||||||
|
|
||||||
|
def run_all_checks(cfg: dict) -> list[Issue]:
    """Run every function in ``CHECKS`` and return all issues in ``CHECKS`` order.

    An exception raised by a check is converted into a single
    ``healthcheck.<function name>`` issue so the remaining checks still run.
    When ``check_databases`` reported ``mariadb`` or ``postgres``, issues from
    the matching dependent checks that have no ``root_cause`` yet are tagged
    with "<db> unreachable".
    """
    buckets = {}
    for check in CHECKS:
        try:
            found = check(cfg)
        except Exception as e:
            found = [Issue(
                f"healthcheck.{check.__name__}",
                f"check raised: {type(e).__name__}: {e}",
                now_iso(),
                root_cause="healthcheck bug",
            )]
        buckets[check.__name__] = found

    def tag(bucket_names, cause):
        # Fill in root_cause only where a check has not already set one.
        for bucket_name in bucket_names:
            for issue in buckets.get(bucket_name, []):
                if issue.root_cause is None:
                    issue.root_cause = cause

    failed_components = [i.component_name for i in buckets.get("check_databases", [])]
    if "mariadb" in failed_components:
        tag(("check_mariadb_dependents", "check_ghost_blogs"), "mariadb unreachable")
    if "postgres" in failed_components:
        tag(("check_postgres_dependents",), "postgres unreachable")

    return [
        issue
        for check in CHECKS
        for issue in buckets.get(check.__name__, [])
    ]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- NATS publish ----------
|
||||||
|
|
||||||
|
async def _publish(url: str, subject: str, payloads: list[bytes]) -> None:
    """Connect to NATS at ``url``, publish each payload on ``subject``, flush, close.

    The connection attempt is bounded by an 8 s ``asyncio.wait_for`` on top
    of the client's own 3 s connect timeout, with reconnects disabled. The
    connection is closed even when publishing or flushing raises.
    """
    import nats  # type: ignore[import-not-found]

    connecting = nats.connect(url, connect_timeout=3, allow_reconnect=False)  # type: ignore[attr-defined]
    client = await asyncio.wait_for(connecting, timeout=8)
    try:
        for body in payloads:
            await client.publish(subject, body)
        await client.flush()
    finally:
        await client.close()
|
||||||
|
|
||||||
|
|
||||||
|
def publish_issues(issues: list[Issue], cfg: dict) -> None:
    """Publish each issue as a UTF-8 JSON message to the configured NATS subject.

    Does nothing when ``issues`` is empty. The server URL and subject are read
    from ``cfg["nats"]``. Exceptions from the publish propagate to the caller.
    """
    if issues:
        encoded = [json.dumps(issue.to_dict()).encode("utf-8") for issue in issues]
        nats_cfg = cfg["nats"]
        asyncio.run(_publish(nats_cfg["url"], nats_cfg["subject"], encoded))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- entry ----------
|
||||||
|
|
||||||
|
def load_config(path: str) -> dict:
    """Load the JSON configuration file at ``path`` and return it parsed.

    The file is always decoded as UTF-8 rather than the locale's default
    encoding, so non-ASCII text in the config reads the same under any locale.

    Raises:
        OSError: the file cannot be opened.
        json.JSONDecodeError: the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv=None) -> int:
    """Command-line entry point: run all checks, then report.

    Options: ``--config`` (path to the JSON config) and ``--dry-run`` (print
    issues to stdout instead of publishing). Every issue is logged as a
    warning. Returns 0 when no issues were found and 1 otherwise, including
    when publishing to NATS fails.
    """
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%Y-%m-%dT%H:%M:%S%z")
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="/home/samantha/homelab-health/checks.json")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print issues to stdout; do not publish to NATS")
    opts = parser.parse_args(argv)

    cfg = load_config(opts.config)
    issues = run_all_checks(cfg)

    if not issues:
        log.info("all checks green")
        return 0

    for issue in issues:
        log.warning("issue: %s", json.dumps(issue.to_dict()))

    if opts.dry_run:
        for issue in issues:
            print(json.dumps(issue.to_dict()))
    else:
        try:
            publish_issues(issues, cfg)
        except Exception as e:
            log.error("NATS publish failed: %s (issues not delivered)", e)
    # Any run that found issues exits non-zero, published or not.
    return 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
45
k3s/health/checks.json
Normal file
45
k3s/health/checks.json
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
{
|
||||||
|
"nats": {
|
||||||
|
"url": "nats://10.0.0.6:32386",
|
||||||
|
"subject": "homelab_health_issue",
|
||||||
|
"monitoring_nodeport": 32388
|
||||||
|
},
|
||||||
|
"databases": [
|
||||||
|
{
|
||||||
|
"name": "postgres",
|
||||||
|
"namespace": "default",
|
||||||
|
"pod_label": "app=postgres",
|
||||||
|
"probe_cmd": ["pg_isready", "-U", "postgres"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "mariadb",
|
||||||
|
"namespace": "default",
|
||||||
|
"pod_label": "app=mariadb",
|
||||||
|
"probe_cmd": ["mariadb-admin", "ping", "--silent"]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"services": [
|
||||||
|
{"name": "ghost1", "namespace": "default", "db": "mariadb",
|
||||||
|
"probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
|
||||||
|
{"name": "ghost2", "namespace": "default", "db": "mariadb",
|
||||||
|
"probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
|
||||||
|
{"name": "ghost3", "namespace": "default", "db": "mariadb",
|
||||||
|
"probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
|
||||||
|
{"name": "mediawiki", "namespace": "default", "db": "mariadb",
|
||||||
|
"probe_path": "/", "expected": [200, 301, 302]},
|
||||||
|
{"name": "forgejo", "namespace": "default", "db": "postgres",
|
||||||
|
"probe_path": "/api/healthz", "expected": [200]},
|
||||||
|
{"name": "authentik", "namespace": "default", "db": "postgres",
|
||||||
|
"probe_path": "/-/health/live/", "expected": [200, 204]},
|
||||||
|
{"name": "listmonk", "namespace": "default", "db": "postgres",
|
||||||
|
"probe_path": "/", "expected": [200, 302]},
|
||||||
|
{"name": "n8n", "namespace": "default", "db": "postgres",
|
||||||
|
"probe_path": "/healthz", "expected": [200]},
|
||||||
|
{"name": "mattermost", "namespace": "default", "db": "postgres",
|
||||||
|
"probe_path": "/api/v4/system/ping", "expected": [200]},
|
||||||
|
{"name": "vaultwarden", "namespace": "default", "db": null,
|
||||||
|
"probe_path": "/alive", "expected": [200]},
|
||||||
|
{"name": "garage-webui", "namespace": "default", "db": null,
|
||||||
|
"probe_path": "/", "expected": [200, 302]}
|
||||||
|
]
|
||||||
|
}
|
||||||
319
k3s/health/home_lab_health.md
Normal file
319
k3s/health/home_lab_health.md
Normal file
|
|
@ -0,0 +1,319 @@
|
||||||
|
# Homelab Health — Internal Checks Design
|
||||||
|
|
||||||
|
**Status: design approved 2026-04-20. Ready to write implementation plan.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Resume Notes (for next session)
|
||||||
|
|
||||||
|
You and I brainstormed this design across one session. All design questions answered,
|
||||||
|
all three design sections approved. The next step per the brainstorming skill is:
|
||||||
|
|
||||||
|
1. **This file exists** — design committed (or ready to commit).
|
||||||
|
2. **Next action:** invoke `superpowers:writing-plans` to turn this design into a
|
||||||
|
step-by-step implementation plan.
|
||||||
|
3. After the implementation plan is written, execute it (writing-plans → executing-plans).
|
||||||
|
|
||||||
|
Do NOT re-open any design decisions in the new session unless something here is
|
||||||
|
obviously wrong; the decisions below are settled.
|
||||||
|
|
||||||
|
**Test canary:** when verifying the installed system end-to-end, break **mediawiki**
|
||||||
|
(e.g. scale to 0 replicas), not the Ghost blogs. Ghosts are production, MediaWiki is
|
||||||
|
expendable for a "does the alert fire" test.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
Add a second layer of cluster health monitoring that runs **inside** the K3s cluster
|
||||||
|
and reports structural / semantic problems to NATS. The existing
|
||||||
|
`k3s/scripts/check-health.sh` (workstation-driven canary) stays in place unchanged.
|
||||||
|
|
||||||
|
Requirements, as given:
|
||||||
|
|
||||||
|
0. NATS itself up
|
||||||
|
1. MariaDB up
|
||||||
|
2. PostgreSQL up
|
||||||
|
3. Internal Ghost blog ports respond to HTTP correctly
|
||||||
|
4. All other services depending on MariaDB respond correctly
|
||||||
|
5. All services depending on PostgreSQL respond correctly
|
||||||
|
6. Something is listening at every NodePort
|
||||||
|
|
||||||
|
Plus implicit: standalone services (Vaultwarden, Garage, etc.) also get probed.
|
||||||
|
|
||||||
|
**Output contract:** publish NATS messages on subject `homelab_health_issue` with
|
||||||
|
JSON body:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"component_name": "<str>",
|
||||||
|
"issue_detail": "<str>",
|
||||||
|
"detected_at": "<ISO8601 timestamp>",
|
||||||
|
"root_cause": "<optional str>"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decisions (settled)
|
||||||
|
|
||||||
|
| Decision | Choice | Why |
|
||||||
|
|---|---|---|
|
||||||
|
| Where it runs | systemd timer on **pve-control** | Master K3s control node; kubectl locally; always on. |
|
||||||
|
| Language | **Python 3** | User expertise; structured JSON; clean error handling. |
|
||||||
|
| HTTP probes | `requests` library | No subprocess per probe; in-process. |
|
||||||
|
| NATS publish | `nats-py` library | In-process; one cohesive Python process. |
|
||||||
|
| kubectl use | **subprocess** (kept for now) | Only two call sites; revisit later with `kubernetes` client. |
|
||||||
|
| DB auth for probes | **sidestepped** | Use `kubectl exec <pod> -- pg_isready` / `mariadb-admin ping`; no creds on pve-control. |
|
||||||
|
| Orchestration | Single script, one function per check category | Simple; matches "one function per check" ask. |
|
||||||
|
| Schedule | Every **10 minutes** | User said no more frequent than that. |
|
||||||
|
| Deduplication | **Stateless** | Re-fires every tick while failing; consumer handles aggregation. |
|
||||||
|
| Healthy publishes | **None** | Silent when OK. Only problems on the wire. |
|
||||||
|
| Recovery events | **None** | Reports stop when fixed; absence = healthy. |
|
||||||
|
| Service config | **JSON file** (`checks.json`) | Pythonic; easy to edit/commit; lives alongside `checker.py`. |
|
||||||
|
| NodePort discovery | **Live from `kubectl get svc -A -o json`** | Source of truth is the cluster; no drift. |
|
||||||
|
| NATS-down fallback | **stdout + non-zero exit** | Workstation canary + `systemctl status` surface failures. Future leaf/LAN NATS fallback via env var hook (deferred). |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
**Deployment layout on pve-control:**
|
||||||
|
|
||||||
|
```
|
||||||
|
/opt/homelab-health/
|
||||||
|
├── checker.py # Python entrypoint, one function per check
|
||||||
|
├── checks.json # service catalog + NATS/DB config
|
||||||
|
├── venv/ # virtualenv with nats-py, requests
|
||||||
|
/etc/systemd/system/
|
||||||
|
├── homelab-health.service
|
||||||
|
└── homelab-health.timer
|
||||||
|
```
|
||||||
|
|
||||||
|
**Source of truth in repo:**
|
||||||
|
|
||||||
|
```
|
||||||
|
k3s/health/
|
||||||
|
├── home_lab_health.md # this file
|
||||||
|
├── checker.py
|
||||||
|
├── checks.json
|
||||||
|
├── requirements.txt # nats-py, requests
|
||||||
|
├── install.sh # runs on pve-control, sets up venv + units
|
||||||
|
├── homelab-health.service
|
||||||
|
├── homelab-health.timer
|
||||||
|
└── tests/
|
||||||
|
└── test_checks.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Runtime flow each tick:**
|
||||||
|
|
||||||
|
1. Load `checks.json`.
|
||||||
|
2. Connect to NATS with a 3s timeout. On failure: log loud, still run checks, publish nothing, exit 1.
|
||||||
|
3. Run each check function in sequence, each wrapped in `try/except`; exceptions in one check never stop the others (they become a `healthcheck.<fn>` meta-issue).
|
||||||
|
4. Each check returns `list[Issue]`. Main loop aggregates.
|
||||||
|
5. Log every issue to stdout (journal).
|
||||||
|
6. For each issue, publish one NATS message to `homelab_health_issue`.
|
||||||
|
7. Exit 0 if zero issues, 1 otherwise. `systemctl status` + journalctl give humans visibility.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Config schema (`checks.json`)
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"nats": {
|
||||||
|
"url": "nats://nats.default.svc.cluster.local:4222",
|
||||||
|
"subject": "homelab_health_issue",
|
||||||
|
"monitoring_nodeport": 32388
|
||||||
|
},
|
||||||
|
"databases": [
|
||||||
|
{
|
||||||
|
"name": "postgres",
|
||||||
|
"namespace": "default",
|
||||||
|
"pod_label": "app=postgres",
|
||||||
|
"probe_cmd": ["pg_isready", "-U", "postgres"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "mariadb",
|
||||||
|
"namespace": "default",
|
||||||
|
"pod_label": "app=mariadb",
|
||||||
|
"probe_cmd": ["mariadb-admin", "ping", "--silent"]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"services": [
|
||||||
|
{"name": "ghost1", "namespace": "fulfillment", "db": "mariadb",
|
||||||
|
"probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
|
||||||
|
{"name": "ghost2", "namespace": "fulfillment", "db": "mariadb",
|
||||||
|
"probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
|
||||||
|
{"name": "ghost3", "namespace": "fulfillment", "db": "mariadb",
|
||||||
|
"probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
|
||||||
|
{"name": "mediawiki", "namespace": "default", "db": "mariadb",
|
||||||
|
"probe_path": "/", "expected": [200, 302]},
|
||||||
|
{"name": "forgejo", "namespace": "sjasoft", "db": "postgres",
|
||||||
|
"probe_path": "/api/healthz", "expected": [200]},
|
||||||
|
{"name": "authentik-server", "namespace": "default", "db": "postgres",
|
||||||
|
"probe_path": "/-/health/live/", "expected": [200, 204]},
|
||||||
|
{"name": "listmonk", "namespace": "default", "db": "postgres",
|
||||||
|
"probe_path": "/api/health", "expected": [200]},
|
||||||
|
{"name": "n8n", "namespace": "default", "db": "postgres",
|
||||||
|
"probe_path": "/healthz", "expected": [200]},
|
||||||
|
{"name": "mattermost", "namespace": "default", "db": "postgres",
|
||||||
|
"probe_path": "/api/v4/system/ping", "expected": [200]},
|
||||||
|
{"name": "vaultwarden", "namespace": "default", "db": null,
|
||||||
|
"probe_path": "/alive", "expected": [200]},
|
||||||
|
{"name": "garage", "namespace": "default", "db": null,
|
||||||
|
"probe_path": "/health", "expected": [200]},
|
||||||
|
{"name": "garage-webui", "namespace": "default", "db": null,
|
||||||
|
"probe_path": "/", "expected": [200, 302]}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Probe URL resolution:** at runtime, `kubectl get svc -n <ns> <name> -o json` →
|
||||||
|
extract `.spec.ports[].nodePort` → probe `http://localhost:<nodeport><probe_path>`.
|
||||||
|
|
||||||
|
**Per-service silence:** add `"disabled": true` to a service entry to skip it without
|
||||||
|
deleting it.
|
||||||
|
|
||||||
|
**Verify actual probe paths during implementation** — the paths above are reasonable
|
||||||
|
defaults but each needs a quick curl sanity check. Specifically double-check:
|
||||||
|
Authentik (`/-/health/live/` vs `/-/health/ready/`), Garage (root `/health` endpoint),
|
||||||
|
Vaultwarden (`/alive` returns 200 plain-text timestamp — confirmed), n8n (`/healthz`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Check catalog
|
||||||
|
|
||||||
|
One function per requirement, sharing an internal `probe_service(svc_cfg)` helper.
|
||||||
|
|
||||||
|
| Function | Covers | Mechanism |
|
||||||
|
|---|---|---|
|
||||||
|
| `check_nats()` | #0 | `kubectl exec` NATS pod to run `nats server check connection`; fallback HTTP GET `localhost:<monitoring_nodeport>/healthz` |
|
||||||
|
| `check_postgres()` | #2 | `kubectl exec` postgres pod to run `pg_isready -U postgres` |
|
||||||
|
| `check_mariadb()` | #1 | `kubectl exec` mariadb pod to run `mariadb-admin ping --silent` |
|
||||||
|
| `check_ghost_blogs()` | #3 | `probe_service` for every service whose name starts with `ghost` |
|
||||||
|
| `check_mariadb_dependents()` | #4 | `probe_service` for every non-ghost service where `db == "mariadb"` |
|
||||||
|
| `check_postgres_dependents()` | #5 | `probe_service` for every service where `db == "postgres"` |
|
||||||
|
| `check_standalone_services()` | implicit | `probe_service` for every service where `db == null` |
|
||||||
|
| `check_all_nodeports()` | #6 | `kubectl get svc -A -o json`; for every `nodePort`, TCP connect `localhost:<nodeport>`; failure = nothing listening |
|
||||||
|
|
||||||
|
**`probe_service(svc)`:** resolves NodePort via kubectl, calls
|
||||||
|
`requests.get(f"http://localhost:{nodeport}{svc['probe_path']}", timeout=10)`,
|
||||||
|
compares status to `expected`, returns an `Issue` on mismatch or on exception.
|
||||||
|
|
||||||
|
**Root-cause hints in payload:** if `check_mariadb()` produced an issue this run,
|
||||||
|
any `check_mariadb_dependents()` failure gets `"root_cause": "mariadb unreachable"`.
|
||||||
|
Same pattern for postgres. Decorative — consumers decide what to do with it.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Error handling
|
||||||
|
|
||||||
|
```python
|
||||||
|
def run_all_checks(cfg) -> list[Issue]:
|
||||||
|
issues = []
|
||||||
|
for fn in [check_nats, check_postgres, check_mariadb,
|
||||||
|
check_ghost_blogs, check_mariadb_dependents,
|
||||||
|
check_postgres_dependents, check_standalone_services,
|
||||||
|
check_all_nodeports]:
|
||||||
|
try:
|
||||||
|
issues.extend(fn(cfg))
|
||||||
|
except Exception as e:
|
||||||
|
issues.append(Issue(
|
||||||
|
component_name=f"healthcheck.{fn.__name__}",
|
||||||
|
issue_detail=f"check function raised: {type(e).__name__}: {e}",
|
||||||
|
detected_at=now_iso(),
|
||||||
|
root_cause="healthcheck bug or missing dependency"))
|
||||||
|
return issues
|
||||||
|
```
|
||||||
|
|
||||||
|
- No single check can halt the pipeline.
|
||||||
|
- NATS connect failure is loud-logged; checks still run; individual publish failures
|
||||||
|
are logged but don't stop the rest.
|
||||||
|
- `Issue` is a small dataclass; `to_dict()` serialises to the exact NATS payload schema.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Deployment
|
||||||
|
|
||||||
|
**`install.sh` (run once on pve-control as samantha, with sudo where needed):**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
set -euo pipefail
|
||||||
|
sudo mkdir -p /opt/homelab-health
|
||||||
|
sudo rsync -a --delete ./ /opt/homelab-health/ --exclude=install.sh --exclude=tests
|
||||||
|
sudo chown -R samantha:samantha /opt/homelab-health
|
||||||
|
python3 -m venv /opt/homelab-health/venv
|
||||||
|
/opt/homelab-health/venv/bin/pip install -r /opt/homelab-health/requirements.txt
|
||||||
|
sudo cp homelab-health.service homelab-health.timer /etc/systemd/system/
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl enable --now homelab-health.timer
|
||||||
|
```
|
||||||
|
|
||||||
|
**`homelab-health.service`:**
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=Homelab internal health checks
|
||||||
|
After=network-online.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
User=samantha
|
||||||
|
ExecStart=/opt/homelab-health/venv/bin/python /opt/homelab-health/checker.py
|
||||||
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
|
```
|
||||||
|
|
||||||
|
**`homelab-health.timer`:**
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=Run homelab health checks every 10 minutes
|
||||||
|
|
||||||
|
[Timer]
|
||||||
|
OnCalendar=*:0/10
|
||||||
|
Persistent=true
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=timers.target
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
**Unit tests** (`tests/test_checks.py`, pytest):
|
||||||
|
- Each check function takes a config object — easily stubbed.
|
||||||
|
- `probe_service` accepts an injected HTTP client so tests don't hit real services.
|
||||||
|
- Mock `subprocess.run` for kubectl calls.
|
||||||
|
- Assert the exact `Issue` list returned for each failure shape.
|
||||||
|
|
||||||
|
**Manual smoke test** — `checker.py --dry-run` logs all issues to stdout but skips
|
||||||
|
NATS publish. Run ad-hoc on pve-control during development.
|
||||||
|
|
||||||
|
**End-to-end verification after install:**
|
||||||
|
1. `systemctl list-timers homelab-health.timer` shows next fire time.
|
||||||
|
2. Manually fire once: `sudo systemctl start homelab-health.service`.
|
||||||
|
3. `journalctl -u homelab-health -n 200` shows outcome.
|
||||||
|
4. On workstation: `nats sub homelab_health_issue` (against the cluster NATS).
|
||||||
|
5. Break **mediawiki** (`kubectl scale deploy mediawiki -n default --replicas=0`) and
|
||||||
|
wait ≤10 min — expect a message on the subject, with `component_name:"mediawiki"`.
|
||||||
|
6. Restore (`--replicas=1`) and confirm alerts stop on the next tick.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open items / future
|
||||||
|
|
||||||
|
- **Leaf/LAN NATS fallback:** add `FALLBACK_NATS_URL` env-var hook in `checker.py`
|
||||||
|
(unused for now). When the leaf NATS comes online, publish there too on connect
|
||||||
|
failure to primary.
|
||||||
|
- **NATS auth:** current assumption is local anonymous publish is allowed. If auth is
|
||||||
|
added, introduce a `nats.creds_path` field in `checks.json` pointing at a creds
|
||||||
|
file on pve-control.
|
||||||
|
- **k8s Python client migration:** replace the two remaining `kubectl` subprocess
|
||||||
|
calls with the `kubernetes` library for a fully in-process script.
|
||||||
|
- **Recovery events:** if downstream consumers want a "resolved" signal, add a small
|
||||||
|
local state file (JSON on disk) to detect transitions and publish recovery events.
|
||||||
|
- **Per-namespace grouping:** not needed now; if service list grows beyond ~25,
|
||||||
|
reconsider organizing `checks.json` by namespace for readability.
|
||||||
14
k3s/health/homelab-health.service
Normal file
14
k3s/health/homelab-health.service
Normal file
|
|
@ -0,0 +1,14 @@
|
||||||
|
[Unit]
|
||||||
|
Description=Homelab internal health checks
|
||||||
|
After=network-online.target
|
||||||
|
Wants=network-online.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
User=samantha
|
||||||
|
WorkingDirectory=/home/samantha/homelab-health
|
||||||
|
Environment=KUBECONFIG=/home/samantha/.kube/config
|
||||||
|
Environment=PATH=/usr/local/bin:/usr/bin:/bin
|
||||||
|
ExecStart=/home/samantha/homelab-health/venv/bin/python /home/samantha/homelab-health/checker.py --config /home/samantha/homelab-health/checks.json
|
||||||
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
11
k3s/health/homelab-health.timer
Normal file
11
k3s/health/homelab-health.timer
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
[Unit]
|
||||||
|
Description=Run homelab health checks every 10 minutes
|
||||||
|
|
||||||
|
[Timer]
|
||||||
|
OnBootSec=2min
|
||||||
|
OnUnitActiveSec=10min
|
||||||
|
Persistent=true
|
||||||
|
AccuracySec=30s
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=timers.target
|
||||||
2
k3s/health/requirements.txt
Normal file
2
k3s/health/requirements.txt
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
nats-py==2.9.0
|
||||||
|
requests==2.32.3
|
||||||
Loading…
Reference in a new issue