Add homelab internal health checker
Python checker runs on pve-control via systemd timer every 10 min, publishes issues to NATS subject homelab_health_issue. Checks NATS, Postgres, MariaDB, Ghost blogs, DB dependents, standalone services, and every NodePort. Silent when healthy. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
6364f70799
commit
58bfd422d4
7 changed files with 704 additions and 0 deletions
5
.opencode/opencode.json
Normal file
5
.opencode/opencode.json
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
{
|
||||
"plugin": [
|
||||
"web"
|
||||
]
|
||||
}
|
||||
308
k3s/health/checker.py
Normal file
308
k3s/health/checker.py
Normal file
|
|
@ -0,0 +1,308 @@
|
|||
"""Homelab internal health checker.
|
||||
|
||||
Runs on pve-control every 10 minutes via systemd timer. Publishes issue
|
||||
events to NATS subject `homelab_health_issue`. Silent when healthy.
|
||||
See home_lab_health.md for design.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional, Sequence
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
log = logging.getLogger("homelab-health")
|
||||
|
||||
|
||||
# ---------- payload ----------
|
||||
|
||||
@dataclass
class Issue:
    """One detected health problem, in the shape that gets serialised to JSON.

    ``root_cause`` is optional; it is left out of the serialised form
    entirely while it is still None.
    """

    component_name: str
    issue_detail: str
    detected_at: str
    root_cause: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, omitting an unset root cause."""
        payload = asdict(self)
        if self.root_cause is None:
            payload.pop("root_cause")
        return payload
|
||||
|
||||
|
||||
def now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string, truncated to seconds."""
    return datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
|
||||
|
||||
|
||||
# ---------- kubectl helpers ----------
|
||||
|
||||
def kubectl_json(args: Sequence[str], timeout: int = 15) -> dict:
    """Run ``kubectl <args> -o json`` and return the decoded JSON output.

    Raises RuntimeError carrying kubectl's stderr when the command exits
    non-zero. A timeout (subprocess.TimeoutExpired) is not caught here.
    """
    proc = subprocess.run(
        ["kubectl"] + list(args) + ["-o", "json"],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if proc.returncode == 0:
        return json.loads(proc.stdout)
    raise RuntimeError(f"kubectl {' '.join(args)}: {proc.stderr.strip()}")
|
||||
|
||||
|
||||
@dataclass
class ExecResult:
    """Outcome of running a command in a pod via kubectl_exec."""

    # True only when the command ran and exited with status 0.
    ok: bool
    # Captured standard output; empty when the command never ran.
    stdout: str
    # Stripped standard error, or a short explanation (lookup failure,
    # timeout) when the command never ran.
    stderr: str
|
||||
|
||||
|
||||
def kubectl_exec(namespace: str, pod_label: str, command: Sequence[str],
                 timeout: int = 10) -> ExecResult:
    """Run a command inside the first pod matching pod_label.

    The pod name is resolved with ``kubectl get pod -l <pod_label>`` (fixed
    5-second timeout); ``timeout`` bounds only the ``kubectl exec`` call.

    Returns an ExecResult whose ``ok`` is True only when the command exited
    with status 0. Lookup failures and timeouts are reported as a failed
    ExecResult with the explanation in ``stderr`` rather than raised.
    """
    lookup_cmd = [
        "kubectl", "get", "pod", "-n", namespace, "-l", pod_label,
        "-o", "jsonpath={.items[0].metadata.name}",
    ]
    try:
        lookup = subprocess.run(lookup_cmd, capture_output=True, text=True, timeout=5)
    except subprocess.TimeoutExpired:
        return ExecResult(False, "", "pod lookup timed out")

    pod = lookup.stdout.strip()
    if lookup.returncode != 0 or not pod:
        detail = f"no pod matched {pod_label} in {namespace}"
        # Append kubectl's own error text when there is any, so that an
        # unreachable API server or a permissions problem is not mistaken
        # for a genuinely missing pod.
        kubectl_error = lookup.stderr.strip()
        if kubectl_error:
            detail = f"{detail}: {kubectl_error}"
        return ExecResult(False, "", detail)

    try:
        run = subprocess.run(
            ["kubectl", "exec", "-n", namespace, pod, "--", *command],
            capture_output=True, text=True, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return ExecResult(False, "", f"kubectl exec timed out after {timeout}s")
    return ExecResult(run.returncode == 0, run.stdout, run.stderr.strip())
|
||||
|
||||
|
||||
def resolve_nodeport(namespace: str, service: str) -> Optional[int]:
    """Fetch a Service via kubectl and return its first NodePort, or None.

    Errors raised by kubectl_json propagate to the caller.
    """
    manifest = kubectl_json(["get", "svc", "-n", namespace, service])
    port_entries = manifest.get("spec", {}).get("ports", [])
    for entry in port_entries:
        if "nodePort" not in entry:
            continue
        return int(entry["nodePort"])
    return None
|
||||
|
||||
|
||||
# ---------- probes ----------
|
||||
|
||||
def probe_service(svc: dict) -> list[Issue]:
    """HTTP-probe one configured service through its NodePort on localhost.

    Returns an empty list when the entry is marked ``disabled`` or when the
    response status is listed in ``svc["expected"]``. Otherwise returns a
    one-element list describing the failure: kubectl error, no NodePort,
    request error, or unexpected status. Redirects are not followed, so a
    3xx status is compared against ``expected`` as-is.
    """
    if svc.get("disabled"):
        return []
    name = svc["name"]

    def problem(detail: str) -> list[Issue]:
        # Every failure path reports against the same component name.
        return [Issue(name, detail, now_iso())]

    try:
        nodeport = resolve_nodeport(svc["namespace"], name)
    except Exception as exc:
        return problem(f"kubectl failed: {exc}")
    if nodeport is None:
        return problem("no NodePort exposed")

    url = f"http://localhost:{nodeport}{svc['probe_path']}"
    try:
        resp = requests.get(url, timeout=10, allow_redirects=False)
    except requests.RequestException as exc:
        return problem(f"probe error at {url}: {exc}")

    if resp.status_code not in svc["expected"]:
        return problem(
            f"HTTP {resp.status_code} at {url} (expected {svc['expected']})"
        )
    return []
|
||||
|
||||
|
||||
# ---------- check functions ----------
|
||||
|
||||
def check_nats(cfg: dict) -> list[Issue]:
    """Check NATS by requesting ``/healthz`` on its monitoring NodePort.

    The port comes from ``cfg["nats"]["monitoring_nodeport"]`` and is
    reached on localhost. Returns one "nats" issue when the request fails
    or the status is not 200, otherwise an empty list.
    """
    endpoint = "http://localhost:{}/healthz".format(cfg["nats"]["monitoring_nodeport"])
    try:
        status = requests.get(endpoint, timeout=5).status_code
    except requests.RequestException as exc:
        return [Issue("nats", f"monitoring unreachable: {exc}", now_iso())]
    if status == 200:
        return []
    return [Issue("nats", f"/healthz returned {status}", now_iso())]
|
||||
|
||||
|
||||
def check_databases(cfg: dict) -> list[Issue]:
    """Run each configured database's probe command inside its pod.

    Iterates ``cfg["databases"]`` (missing key means no databases) and
    returns one issue, named after the database entry, per failed probe.
    """
    failures = []
    for db in cfg.get("databases", []):
        outcome = kubectl_exec(db["namespace"], db["pod_label"], db["probe_cmd"])
        if outcome.ok:
            continue
        failures.append(Issue(
            db["name"],
            f"liveness probe failed: {outcome.stderr or '(no stderr)'}",
            now_iso(),
        ))
    return failures
|
||||
|
||||
|
||||
def _filter_probe(services, pred) -> list[Issue]:
    """Probe every service accepted by ``pred`` and return all issues found.

    Services are handled in order; ``pred`` is evaluated before each probe.
    """
    return [
        issue
        for entry in services
        if pred(entry)
        for issue in probe_service(entry)
    ]
|
||||
|
||||
|
||||
def check_ghost_blogs(cfg):
    """Probe every configured service whose name starts with ``ghost``."""
    def is_ghost(entry):
        return entry["name"].startswith("ghost")

    return _filter_probe(cfg.get("services", []), is_ghost)
|
||||
|
||||
|
||||
def check_mariadb_dependents(cfg):
    """Probe the services whose ``db`` is ``mariadb``, excluding ghost services."""
    def uses_mariadb_and_not_ghost(entry):
        if entry["db"] != "mariadb":
            return False
        return not entry["name"].startswith("ghost")

    return _filter_probe(cfg.get("services", []), uses_mariadb_and_not_ghost)
|
||||
|
||||
|
||||
def check_postgres_dependents(cfg):
    """Probe every configured service whose ``db`` is ``postgres``."""
    def uses_postgres(entry):
        return entry["db"] == "postgres"

    return _filter_probe(cfg.get("services", []), uses_postgres)
|
||||
|
||||
|
||||
def check_standalone_services(cfg):
    """Probe every configured service whose ``db`` is None."""
    def has_no_database(entry):
        return entry["db"] is None

    return _filter_probe(cfg.get("services", []), has_no_database)
|
||||
|
||||
|
||||
def check_all_nodeports(_cfg) -> list[Issue]:
    """TCP connect to every TCP NodePort in the cluster.

    Lists all Services with kubectl and, for each port entry that carries a
    ``nodePort``, attempts a TCP connection to 127.0.0.1 with a 3-second
    timeout. Ports whose protocol is not TCP are skipped: a TCP connect
    cannot tell whether a UDP or SCTP listener is present, so probing them
    would raise a false alarm on every run.

    Returns one issue per NodePort that refused or failed the connection,
    named ``<namespace>/<service>``. Errors from kubectl_json propagate.
    """
    svcs = kubectl_json(["get", "svc", "-A"])
    issues = []
    for item in svcs.get("items") or []:
        meta = item.get("metadata") or {}
        name = f"{meta.get('namespace')}/{meta.get('name')}"
        # "spec"/"ports" may be absent or null; treat both as "no ports".
        for port in (item.get("spec") or {}).get("ports") or []:
            node_port = port.get("nodePort")
            if node_port is None:
                continue
            # The Kubernetes protocol field defaults to TCP when omitted.
            if port.get("protocol", "TCP") != "TCP":
                continue
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                sock.settimeout(3)
                rc = sock.connect_ex(("127.0.0.1", int(node_port)))
            if rc != 0:
                issues.append(Issue(
                    name,
                    f"NodePort {node_port} not accepting TCP (errno {rc})",
                    now_iso(),
                ))
    return issues
|
||||
|
||||
|
||||
# ---------- orchestration ----------
|
||||
|
||||
# Check functions in the order run_all_checks executes them; the combined
# issue list is emitted in this same order. run_all_checks also looks up
# check_databases, check_ghost_blogs, check_mariadb_dependents and
# check_postgres_dependents by function name when assigning root causes,
# so renaming any of those requires updating it too.
CHECKS = [
    check_nats,
    check_databases,
    check_ghost_blogs,
    check_mariadb_dependents,
    check_postgres_dependents,
    check_standalone_services,
    check_all_nodeports,
]
|
||||
|
||||
|
||||
def run_all_checks(cfg: dict) -> list[Issue]:
    """Run every function in CHECKS and return their issues in CHECKS order.

    A check that raises does not stop the others; the exception becomes a
    single ``healthcheck.<function name>`` issue. When the database check
    reported mariadb or postgres, issues from the dependent checks that
    have no root cause yet are tagged with "<db> unreachable".
    """
    results: dict[str, list[Issue]] = {}
    for check in CHECKS:
        try:
            results[check.__name__] = check(cfg)
        except Exception as exc:
            results[check.__name__] = [Issue(
                f"healthcheck.{check.__name__}",
                f"check raised: {type(exc).__name__}: {exc}",
                now_iso(),
                root_cause="healthcheck bug",
            )]

    failed_dbs = {
        issue.component_name for issue in results.get("check_databases", [])
    }
    # (database name, root-cause text, checks whose issues inherit it)
    blame_table = (
        ("mariadb", "mariadb unreachable",
         ("check_mariadb_dependents", "check_ghost_blogs")),
        ("postgres", "postgres unreachable",
         ("check_postgres_dependents",)),
    )
    for db_name, hint, dependents in blame_table:
        if db_name not in failed_dbs:
            continue
        for key in dependents:
            for issue in results.get(key, []):
                if issue.root_cause is None:
                    issue.root_cause = hint

    return [
        issue
        for check in CHECKS
        for issue in results.get(check.__name__, [])
    ]
|
||||
|
||||
|
||||
# ---------- NATS publish ----------
|
||||
|
||||
async def _publish(url: str, subject: str, payloads: list[bytes]) -> None:
    """Publish each payload to ``subject`` on the NATS server at ``url``.

    Connects with a 3-second connect timeout and reconnects disabled, the
    whole connect attempt bounded by an 8-second wait. The connection is
    flushed after the last publish and closed even if publishing fails.
    """
    import nats  # type: ignore[import-not-found]

    connecting = nats.connect(url, connect_timeout=3, allow_reconnect=False)  # type: ignore[attr-defined]
    client = await asyncio.wait_for(connecting, timeout=8)
    try:
        for message in payloads:
            await client.publish(subject, message)
        await client.flush()
    finally:
        await client.close()
|
||||
|
||||
|
||||
def publish_issues(issues: list[Issue], cfg: dict) -> None:
    """Publish each issue as one UTF-8 JSON message; no-op for an empty list.

    The server URL and subject are read from ``cfg["nats"]``. Exceptions
    from connecting or publishing propagate to the caller.
    """
    if issues:
        encoded = [json.dumps(issue.to_dict()).encode("utf-8") for issue in issues]
        target = cfg["nats"]
        asyncio.run(_publish(target["url"], target["subject"], encoded))
|
||||
|
||||
|
||||
# ---------- entry ----------
|
||||
|
||||
def load_config(path: str) -> dict:
    """Read the JSON config file at ``path`` and return the parsed object.

    The file is always decoded as UTF-8 rather than with the locale's
    default encoding, so parsing does not depend on the environment the
    checker happens to run under.

    Raises OSError if the file cannot be opened and json.JSONDecodeError
    (a ValueError) if it is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
|
||||
def main(argv=None) -> int:
    """Command-line entry point: run all checks, then report what was found.

    Returns 0 when no issues were found and 1 otherwise. With ``--dry-run``
    the issues are printed to stdout instead of being published to NATS.
    A failed publish is logged and still returns 1.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S%z",
    )
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="/home/samantha/homelab-health/checks.json")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print issues to stdout; do not publish to NATS",
    )
    opts = parser.parse_args(argv)

    cfg = load_config(opts.config)
    issues = run_all_checks(cfg)
    if not issues:
        log.info("all checks green")
        return 0

    for issue in issues:
        log.warning("issue: %s", json.dumps(issue.to_dict()))

    if opts.dry_run:
        for issue in issues:
            print(json.dumps(issue.to_dict()))
    else:
        try:
            publish_issues(issues, cfg)
        except Exception as exc:
            log.error("NATS publish failed: %s (issues not delivered)", exc)
    # Issues were found, so the exit status is non-zero whether or not
    # they could be delivered.
    return 1
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
45
k3s/health/checks.json
Normal file
45
k3s/health/checks.json
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
{
|
||||
"nats": {
|
||||
"url": "nats://10.0.0.6:32386",
|
||||
"subject": "homelab_health_issue",
|
||||
"monitoring_nodeport": 32388
|
||||
},
|
||||
"databases": [
|
||||
{
|
||||
"name": "postgres",
|
||||
"namespace": "default",
|
||||
"pod_label": "app=postgres",
|
||||
"probe_cmd": ["pg_isready", "-U", "postgres"]
|
||||
},
|
||||
{
|
||||
"name": "mariadb",
|
||||
"namespace": "default",
|
||||
"pod_label": "app=mariadb",
|
||||
"probe_cmd": ["mariadb-admin", "ping", "--silent"]
|
||||
}
|
||||
],
|
||||
"services": [
|
||||
{"name": "ghost1", "namespace": "default", "db": "mariadb",
|
||||
"probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
|
||||
{"name": "ghost2", "namespace": "default", "db": "mariadb",
|
||||
"probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
|
||||
{"name": "ghost3", "namespace": "default", "db": "mariadb",
|
||||
"probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
|
||||
{"name": "mediawiki", "namespace": "default", "db": "mariadb",
|
||||
"probe_path": "/", "expected": [200, 301, 302]},
|
||||
{"name": "forgejo", "namespace": "default", "db": "postgres",
|
||||
"probe_path": "/api/healthz", "expected": [200]},
|
||||
{"name": "authentik", "namespace": "default", "db": "postgres",
|
||||
"probe_path": "/-/health/live/", "expected": [200, 204]},
|
||||
{"name": "listmonk", "namespace": "default", "db": "postgres",
|
||||
"probe_path": "/", "expected": [200, 302]},
|
||||
{"name": "n8n", "namespace": "default", "db": "postgres",
|
||||
"probe_path": "/healthz", "expected": [200]},
|
||||
{"name": "mattermost", "namespace": "default", "db": "postgres",
|
||||
"probe_path": "/api/v4/system/ping", "expected": [200]},
|
||||
{"name": "vaultwarden", "namespace": "default", "db": null,
|
||||
"probe_path": "/alive", "expected": [200]},
|
||||
{"name": "garage-webui", "namespace": "default", "db": null,
|
||||
"probe_path": "/", "expected": [200, 302]}
|
||||
]
|
||||
}
|
||||
319
k3s/health/home_lab_health.md
Normal file
319
k3s/health/home_lab_health.md
Normal file
|
|
@ -0,0 +1,319 @@
|
|||
# Homelab Health — Internal Checks Design
|
||||
|
||||
**Status: design approved 2026-04-20. Ready to write implementation plan.**
|
||||
|
||||
---
|
||||
|
||||
## Resume Notes (for next session)
|
||||
|
||||
You and I brainstormed this design across one session. All design questions answered,
|
||||
all three design sections approved. The next step per the brainstorming skill is:
|
||||
|
||||
1. **This file exists** — design committed (or ready to commit).
|
||||
2. **Next action:** invoke `superpowers:writing-plans` to turn this design into a
|
||||
step-by-step implementation plan.
|
||||
3. After the implementation plan is written, execute it (writing-plans → executing-plans).
|
||||
|
||||
Do NOT re-open any design decisions in the new session unless something here is
|
||||
obviously wrong; the decisions below are settled.
|
||||
|
||||
**Test canary:** when verifying the installed system end-to-end, break **mediawiki**
|
||||
(e.g. scale to 0 replicas), not the Ghost blogs. Ghosts are production, MediaWiki is
|
||||
expendable for a "does the alert fire" test.
|
||||
|
||||
---
|
||||
|
||||
## Goals
|
||||
|
||||
Add a second layer of cluster health monitoring that runs **inside** the K3s cluster
|
||||
and reports structural / semantic problems to NATS. The existing
|
||||
`k3s/scripts/check-health.sh` (workstation-driven canary) stays in place unchanged.
|
||||
|
||||
Requirements, as given:
|
||||
|
||||
0. NATS itself up
|
||||
1. MariaDB up
|
||||
2. PostgreSQL up
|
||||
3. Internal Ghost blog ports respond to HTTP correctly
|
||||
4. All other services depending on MariaDB respond correctly
|
||||
5. All services depending on PostgreSQL respond correctly
|
||||
6. Something is listening at every NodePort
|
||||
|
||||
Plus implicit: standalone services (Vaultwarden, Garage, etc.) also get probed.
|
||||
|
||||
**Output contract:** publish NATS messages on subject `homelab_health_issue` with
|
||||
JSON body:
|
||||
|
||||
```json
|
||||
{
|
||||
"component_name": "<str>",
|
||||
"issue_detail": "<str>",
|
||||
"detected_at": "<ISO8601 timestamp>",
|
||||
"root_cause": "<optional str>"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Decisions (settled)
|
||||
|
||||
| Decision | Choice | Why |
|
||||
|---|---|---|
|
||||
| Where it runs | systemd timer on **pve-control** | Master K3s control node; kubectl locally; always on. |
|
||||
| Language | **Python 3** | User expertise; structured JSON; clean error handling. |
|
||||
| HTTP probes | `requests` library | No subprocess per probe; in-process. |
|
||||
| NATS publish | `nats-py` library | In-process; one cohesive Python process. |
|
||||
| kubectl use | **subprocess** (kept for now) | Only two call sites; revisit later with `kubernetes` client. |
|
||||
| DB auth for probes | **sidestepped** | Use `kubectl exec <pod> -- pg_isready` / `mariadb-admin ping`; no creds on pve-control. |
|
||||
| Orchestration | Single script, one function per check category | Simple; matches "one function per check" ask. |
|
||||
| Schedule | Every **10 minutes** | User said no more frequent than that. |
|
||||
| Deduplication | **Stateless** | Re-fires every tick while failing; consumer handles aggregation. |
|
||||
| Healthy publishes | **None** | Silent when OK. Only problems on the wire. |
|
||||
| Recovery events | **None** | Reports stop when fixed; absence = healthy. |
|
||||
| Service config | **JSON file** (`checks.json`) | Pythonic; easy to edit/commit; lives alongside `checker.py`. |
|
||||
| NodePort discovery | **Live from `kubectl get svc -A -o json`** | Source of truth is the cluster; no drift. |
|
||||
| NATS-down fallback | **stdout + non-zero exit** | Workstation canary + `systemctl status` surface failures. Future leaf/LAN NATS fallback via env var hook (deferred). |
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
**Deployment layout on pve-control:**
|
||||
|
||||
```
|
||||
/opt/homelab-health/
|
||||
├── checker.py # Python entrypoint, one function per check
|
||||
├── checks.json # service catalog + NATS/DB config
|
||||
├── venv/ # virtualenv with nats-py, requests
|
||||
/etc/systemd/system/
|
||||
├── homelab-health.service
|
||||
└── homelab-health.timer
|
||||
```
|
||||
|
||||
**Source of truth in repo:**
|
||||
|
||||
```
|
||||
k3s/health/
|
||||
├── home_lab_health.md # this file
|
||||
├── checker.py
|
||||
├── checks.json
|
||||
├── requirements.txt # nats-py, requests
|
||||
├── install.sh # runs on pve-control, sets up venv + units
|
||||
├── homelab-health.service
|
||||
├── homelab-health.timer
|
||||
└── tests/
|
||||
└── test_checks.py
|
||||
```
|
||||
|
||||
**Runtime flow each tick:**
|
||||
|
||||
1. Load `checks.json`.
|
||||
2. Connect to NATS with a 3s timeout. On failure: log loudly, still run checks, publish nothing, exit 1.
|
||||
3. Run each check function in sequence, each wrapped in `try/except`; exceptions in one check never stop the others (they become a `healthcheck.<fn>` meta-issue).
|
||||
4. Each check returns `list[Issue]`. Main loop aggregates.
|
||||
5. Log every issue to stdout (journal).
|
||||
6. For each issue, publish one NATS message to `homelab_health_issue`.
|
||||
7. Exit 0 if zero issues, 1 otherwise. `systemctl status` + journalctl give humans visibility.
|
||||
|
||||
---
|
||||
|
||||
## Config schema (`checks.json`)
|
||||
|
||||
```json
|
||||
{
|
||||
"nats": {
|
||||
"url": "nats://nats.default.svc.cluster.local:4222",
|
||||
"subject": "homelab_health_issue",
|
||||
"monitoring_nodeport": 32388
|
||||
},
|
||||
"databases": [
|
||||
{
|
||||
"name": "postgres",
|
||||
"namespace": "default",
|
||||
"pod_label": "app=postgres",
|
||||
"probe_cmd": ["pg_isready", "-U", "postgres"]
|
||||
},
|
||||
{
|
||||
"name": "mariadb",
|
||||
"namespace": "default",
|
||||
"pod_label": "app=mariadb",
|
||||
"probe_cmd": ["mariadb-admin", "ping", "--silent"]
|
||||
}
|
||||
],
|
||||
"services": [
|
||||
{"name": "ghost1", "namespace": "fulfillment", "db": "mariadb",
|
||||
"probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
|
||||
{"name": "ghost2", "namespace": "fulfillment", "db": "mariadb",
|
||||
"probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
|
||||
{"name": "ghost3", "namespace": "fulfillment", "db": "mariadb",
|
||||
"probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
|
||||
{"name": "mediawiki", "namespace": "default", "db": "mariadb",
|
||||
"probe_path": "/", "expected": [200, 302]},
|
||||
{"name": "forgejo", "namespace": "sjasoft", "db": "postgres",
|
||||
"probe_path": "/api/healthz", "expected": [200]},
|
||||
{"name": "authentik-server", "namespace": "default", "db": "postgres",
|
||||
"probe_path": "/-/health/live/", "expected": [200, 204]},
|
||||
{"name": "listmonk", "namespace": "default", "db": "postgres",
|
||||
"probe_path": "/api/health", "expected": [200]},
|
||||
{"name": "n8n", "namespace": "default", "db": "postgres",
|
||||
"probe_path": "/healthz", "expected": [200]},
|
||||
{"name": "mattermost", "namespace": "default", "db": "postgres",
|
||||
"probe_path": "/api/v4/system/ping", "expected": [200]},
|
||||
{"name": "vaultwarden", "namespace": "default", "db": null,
|
||||
"probe_path": "/alive", "expected": [200]},
|
||||
{"name": "garage", "namespace": "default", "db": null,
|
||||
"probe_path": "/health", "expected": [200]},
|
||||
{"name": "garage-webui", "namespace": "default", "db": null,
|
||||
"probe_path": "/", "expected": [200, 302]}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
**Probe URL resolution:** at runtime, `kubectl get svc -n <ns> <name> -o json` →
|
||||
extract `.spec.ports[].nodePort` → probe `http://localhost:<nodeport><probe_path>`.
|
||||
|
||||
**Per-service silence:** add `"disabled": true` to a service entry to skip it without
|
||||
deleting it.
|
||||
|
||||
**Verify actual probe paths during implementation** — the paths above are reasonable
|
||||
defaults but each needs a quick curl sanity check. Specifically double-check:
|
||||
Authentik (`/-/health/live/` vs `/-/health/ready/`), Garage (root `/health` endpoint),
|
||||
Vaultwarden (`/alive` returns 200 plain-text timestamp — confirmed), n8n (`/healthz`).
|
||||
|
||||
---
|
||||
|
||||
## Check catalog
|
||||
|
||||
One function per requirement, sharing an internal `probe_service(svc_cfg)` helper.
|
||||
|
||||
| Function | Covers | Mechanism |
|
||||
|---|---|---|
|
||||
| `check_nats()` | #0 | `kubectl exec` NATS pod to run `nats server check connection`; fallback HTTP GET `localhost:<monitoring_nodeport>/healthz` |
|
||||
| `check_postgres()` | #2 | `kubectl exec` postgres pod to run `pg_isready -U postgres` |
|
||||
| `check_mariadb()` | #1 | `kubectl exec` mariadb pod to run `mariadb-admin ping --silent` |
|
||||
| `check_ghost_blogs()` | #3 | `probe_service` for every service whose name starts with `ghost` |
|
||||
| `check_mariadb_dependents()` | #4 | `probe_service` for every non-ghost service where `db == "mariadb"` |
|
||||
| `check_postgres_dependents()` | #5 | `probe_service` for every service where `db == "postgres"` |
|
||||
| `check_standalone_services()` | implicit | `probe_service` for every service where `db == null` |
|
||||
| `check_all_nodeports()` | #6 | `kubectl get svc -A -o json`; for every `nodePort`, TCP connect `localhost:<nodeport>`; failure = nothing listening |
|
||||
|
||||
**`probe_service(svc)`:** resolves NodePort via kubectl, calls
|
||||
`requests.get(f"http://localhost:{nodeport}{svc['probe_path']}", timeout=10)`,
|
||||
compares status to `expected`, returns an `Issue` on mismatch or on exception.
|
||||
|
||||
**Root-cause hints in payload:** if `check_mariadb()` produced an issue this run,
|
||||
any `check_mariadb_dependents()` failure gets `"root_cause": "mariadb unreachable"`.
|
||||
Same pattern for postgres. Decorative — consumers decide what to do with it.
|
||||
|
||||
---
|
||||
|
||||
## Error handling
|
||||
|
||||
```python
|
||||
def run_all_checks(cfg) -> list[Issue]:
|
||||
issues = []
|
||||
for fn in [check_nats, check_postgres, check_mariadb,
|
||||
check_ghost_blogs, check_mariadb_dependents,
|
||||
check_postgres_dependents, check_standalone_services,
|
||||
check_all_nodeports]:
|
||||
try:
|
||||
issues.extend(fn(cfg))
|
||||
except Exception as e:
|
||||
issues.append(Issue(
|
||||
component_name=f"healthcheck.{fn.__name__}",
|
||||
issue_detail=f"check function raised: {type(e).__name__}: {e}",
|
||||
detected_at=now_iso(),
|
||||
root_cause="healthcheck bug or missing dependency"))
|
||||
return issues
|
||||
```
|
||||
|
||||
- No single check can halt the pipeline.
|
||||
- NATS connect failure is logged loudly; checks still run; individual publish failures
|
||||
are logged but don't stop the rest.
|
||||
- `Issue` is a small dataclass; `to_dict()` serialises to the exact NATS payload schema.
|
||||
|
||||
---
|
||||
|
||||
## Deployment
|
||||
|
||||
**`install.sh` (run once on pve-control as samantha, with sudo where needed):**
|
||||
|
||||
```bash
|
||||
set -euo pipefail
|
||||
sudo mkdir -p /opt/homelab-health
|
||||
sudo rsync -a --delete ./ /opt/homelab-health/ --exclude=install.sh --exclude=tests
|
||||
sudo chown -R samantha:samantha /opt/homelab-health
|
||||
python3 -m venv /opt/homelab-health/venv
|
||||
/opt/homelab-health/venv/bin/pip install -r /opt/homelab-health/requirements.txt
|
||||
sudo cp homelab-health.service homelab-health.timer /etc/systemd/system/
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now homelab-health.timer
|
||||
```
|
||||
|
||||
**`homelab-health.service`:**
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Homelab internal health checks
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
User=samantha
|
||||
ExecStart=/opt/homelab-health/venv/bin/python /opt/homelab-health/checker.py
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
```
|
||||
|
||||
**`homelab-health.timer`:**
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Run homelab health checks every 10 minutes
|
||||
|
||||
[Timer]
|
||||
OnCalendar=*:0/10
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Testing
|
||||
|
||||
**Unit tests** (`tests/test_checks.py`, pytest):
|
||||
- Each check function takes a config object — easily stubbed.
|
||||
- `probe_service` accepts an injected HTTP client so tests don't hit real services.
|
||||
- Mock `subprocess.run` for kubectl calls.
|
||||
- Assert the exact `Issue` list returned for each failure shape.
|
||||
|
||||
**Manual smoke test** — `checker.py --dry-run` logs all issues to stdout but skips
|
||||
NATS publish. Run ad-hoc on pve-control during development.
|
||||
|
||||
**End-to-end verification after install:**
|
||||
1. `systemctl list-timers homelab-health.timer` shows next fire time.
|
||||
2. Manually fire once: `sudo systemctl start homelab-health.service`.
|
||||
3. `journalctl -u homelab-health -n 200` shows outcome.
|
||||
4. On workstation: `nats sub homelab_health_issue` (against the cluster NATS).
|
||||
5. Break **mediawiki** (`kubectl scale deploy mediawiki -n default --replicas=0`) and
|
||||
wait ≤10 min — expect a message on the subject, with `component_name:"mediawiki"`.
|
||||
6. Restore (`--replicas=1`) and confirm alerts stop on the next tick.
|
||||
|
||||
---
|
||||
|
||||
## Open items / future
|
||||
|
||||
- **Leaf/LAN NATS fallback:** add `FALLBACK_NATS_URL` env-var hook in `checker.py`
|
||||
(unused for now). When the leaf NATS comes online, publish there too on connect
|
||||
failure to primary.
|
||||
- **NATS auth:** current assumption is local anonymous publish is allowed. If auth is
|
||||
added, introduce a `nats.creds_path` field in `checks.json` pointing at a creds
|
||||
file on pve-control.
|
||||
- **k8s Python client migration:** replace the two remaining `kubectl` subprocess
|
||||
calls with the `kubernetes` library for a fully in-process script.
|
||||
- **Recovery events:** if downstream consumers want a "resolved" signal, add a small
|
||||
local state file (JSON on disk) to detect transitions and publish recovery events.
|
||||
- **Per-namespace grouping:** not needed now; if service list grows beyond ~25,
|
||||
reconsider organizing `checks.json` by namespace for readability.
|
||||
14
k3s/health/homelab-health.service
Normal file
14
k3s/health/homelab-health.service
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
[Unit]
|
||||
Description=Homelab internal health checks
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
User=samantha
|
||||
WorkingDirectory=/home/samantha/homelab-health
|
||||
Environment=KUBECONFIG=/home/samantha/.kube/config
|
||||
Environment=PATH=/usr/local/bin:/usr/bin:/bin
|
||||
ExecStart=/home/samantha/homelab-health/venv/bin/python /home/samantha/homelab-health/checker.py --config /home/samantha/homelab-health/checks.json
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
11
k3s/health/homelab-health.timer
Normal file
11
k3s/health/homelab-health.timer
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
[Unit]
|
||||
Description=Run homelab health checks every 10 minutes
|
||||
|
||||
[Timer]
|
||||
OnBootSec=2min
|
||||
OnUnitActiveSec=10min
|
||||
Persistent=true
|
||||
AccuracySec=30s
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
2
k3s/health/requirements.txt
Normal file
2
k3s/health/requirements.txt
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
nats-py==2.9.0
|
||||
requests==2.32.3
|
||||
Loading…
Reference in a new issue