Add homelab internal health checker

Python checker runs on pve-control via systemd timer every 10 min,
publishes issues to NATS subject homelab_health_issue. Checks NATS,
Postgres, MariaDB, Ghost blogs, DB dependents, standalone services,
and every NodePort. Silent when healthy.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Samantha Atkins 2026-04-20 15:48:07 -04:00
parent 6364f70799
commit 58bfd422d4
7 changed files with 704 additions and 0 deletions

5
.opencode/opencode.json Normal file
View file

@ -0,0 +1,5 @@
{
"plugin": [
"web"
]
}

308
k3s/health/checker.py Normal file
View file

@ -0,0 +1,308 @@
"""Homelab internal health checker.
Runs on pve-control every 10 minutes via systemd timer. Publishes issue
events to NATS subject `homelab_health_issue`. Silent when healthy.
See home_lab_health.md for design.
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import socket
import subprocess
import sys
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from typing import Optional, Sequence
import requests
log = logging.getLogger("homelab-health")
# ---------- payload ----------
@dataclass
class Issue:
    """One health problem, shaped like the NATS issue payload.

    ``root_cause`` is optional; when it is ``None`` the key is omitted from
    the serialised form entirely rather than emitted as ``null``.
    """

    component_name: str
    issue_detail: str
    detected_at: str
    root_cause: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the fields as a dict, dropping ``root_cause`` when unset."""
        return {
            key: value
            for key, value in asdict(self).items()
            if not (key == "root_cause" and value is None)
        }
def now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string with whole seconds."""
    current = datetime.now(tz=timezone.utc)
    # timespec="seconds" truncates (does not round) the sub-second part.
    return current.isoformat(timespec="seconds")
# ---------- kubectl helpers ----------
def kubectl_json(args: Sequence[str], timeout: int = 15) -> dict:
    """Run ``kubectl <args> -o json`` and return the parsed JSON output.

    Raises:
        RuntimeError: kubectl exited with a non-zero status; the message
            carries the arguments and kubectl's stderr.
        subprocess.TimeoutExpired: kubectl ran longer than ``timeout`` seconds.
    """
    completed = subprocess.run(
        ["kubectl", *args, "-o", "json"],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if completed.returncode == 0:
        return json.loads(completed.stdout)
    raise RuntimeError(f"kubectl {' '.join(args)}: {completed.stderr.strip()}")
@dataclass
class ExecResult:
    """Outcome of a command run inside a pod by ``kubectl_exec``."""

    # True only when the command ran and exited with status 0.
    ok: bool
    # Captured standard output of the command ("" when it never ran).
    stdout: str
    # Captured stderr, or a short explanation when the command never ran.
    stderr: str
def kubectl_exec(namespace: str, pod_label: str, command: Sequence[str],
                 timeout: int = 10) -> ExecResult:
    """Execute ``command`` in the first pod selected by ``pod_label``.

    The pod name is resolved with a 5-second ``kubectl get pod`` lookup; the
    command itself is bounded by ``timeout`` seconds. Lookup failures and
    timeouts are reported as a failed ``ExecResult`` whose ``stderr`` holds
    the reason, not raised.
    """
    lookup_cmd = [
        "kubectl", "get", "pod", "-n", namespace, "-l", pod_label,
        "-o", "jsonpath={.items[0].metadata.name}",
    ]
    try:
        lookup = subprocess.run(lookup_cmd, capture_output=True, text=True,
                                timeout=5)
    except subprocess.TimeoutExpired:
        return ExecResult(False, "", "pod lookup timed out")

    pod_name = lookup.stdout.strip()
    if lookup.returncode != 0 or not pod_name:
        return ExecResult(False, "", f"no pod matched {pod_label} in {namespace}")

    exec_cmd = ["kubectl", "exec", "-n", namespace, pod_name, "--", *command]
    try:
        done = subprocess.run(exec_cmd, capture_output=True, text=True,
                              timeout=timeout)
    except subprocess.TimeoutExpired:
        return ExecResult(False, "", f"kubectl exec timed out after {timeout}s")

    succeeded = done.returncode == 0
    return ExecResult(succeeded, done.stdout, done.stderr.strip())
def resolve_nodeport(namespace: str, service: str) -> Optional[int]:
    """Look up ``service`` via kubectl and return its first NodePort.

    Returns ``None`` when no port entry of the service carries a
    ``nodePort``. Errors from ``kubectl_json`` propagate to the caller.
    """
    manifest = kubectl_json(["get", "svc", "-n", namespace, service])
    port_entries = manifest.get("spec", {}).get("ports", [])
    return next(
        (int(entry["nodePort"]) for entry in port_entries if "nodePort" in entry),
        None,
    )
# ---------- probes ----------
def probe_service(svc: dict) -> list[Issue]:
    """HTTP-probe one configured service through its NodePort on localhost.

    Returns an empty list when the service is marked ``disabled`` or answers
    with one of its ``expected`` status codes; otherwise a single-element
    list describing what went wrong (kubectl failure, no NodePort, request
    error, or unexpected status). Redirects are not followed, so a 3xx is
    compared against ``expected`` as-is.
    """
    if svc.get("disabled"):
        return []
    service_name = svc["name"]

    def problem(detail):
        # Every failure path reports exactly one issue for this service.
        return [Issue(service_name, detail, now_iso())]

    try:
        port = resolve_nodeport(svc["namespace"], service_name)
    except Exception as exc:
        return problem(f"kubectl failed: {exc}")
    if port is None:
        return problem("no NodePort exposed")

    target = f"http://localhost:{port}{svc['probe_path']}"
    try:
        response = requests.get(target, timeout=10, allow_redirects=False)
    except requests.RequestException as exc:
        return problem(f"probe error at {target}: {exc}")

    if response.status_code not in svc["expected"]:
        return problem(
            f"HTTP {response.status_code} at {target} (expected {svc['expected']})"
        )
    return []
# ---------- check functions ----------
def check_nats(cfg: dict) -> list[Issue]:
    """Check NATS by GETting ``/healthz`` on its monitoring NodePort.

    The port comes from ``cfg["nats"]["monitoring_nodeport"]``. Any request
    error or non-200 response yields a single ``nats`` issue.
    """
    monitoring_port = cfg["nats"]["monitoring_nodeport"]
    endpoint = f"http://localhost:{monitoring_port}/healthz"
    try:
        response = requests.get(endpoint, timeout=5)
    except requests.RequestException as exc:
        return [Issue("nats", f"monitoring unreachable: {exc}", now_iso())]
    if response.status_code == 200:
        return []
    return [Issue("nats", f"/healthz returned {response.status_code}", now_iso())]
def check_databases(cfg: dict) -> list[Issue]:
    """Run each configured database's liveness command inside its pod.

    For every entry in ``cfg["databases"]`` the ``probe_cmd`` is executed via
    ``kubectl_exec``; a failed result becomes one issue named after the
    database. Returns an empty list when all probes succeed or no databases
    are configured.
    """
    issues = []
    for db in cfg.get("databases", []):
        result = kubectl_exec(db["namespace"], db["pod_label"], db["probe_cmd"])
        if result.ok:
            continue
        # Some probes (pg_isready, for one) report their status on stdout and
        # leave stderr empty, so fall back to stdout before giving up on a
        # diagnostic message.
        detail = result.stderr or result.stdout.strip() or "(no output)"
        issues.append(Issue(
            db["name"],
            f"liveness probe failed: {detail}",
            now_iso(),
        ))
    return issues
def _filter_probe(services, pred) -> list[Issue]:
out = []
for s in services:
if pred(s):
out.extend(probe_service(s))
return out
def check_ghost_blogs(cfg):
    """Probe every configured service whose name starts with ``ghost``."""
    def is_ghost(service):
        return service["name"].startswith("ghost")

    return _filter_probe(cfg.get("services", []), is_ghost)
def check_mariadb_dependents(cfg):
    """Probe MariaDB-backed services, excluding those named ``ghost*``."""
    def is_non_ghost_mariadb_user(service):
        if service["db"] != "mariadb":
            return False
        return not service["name"].startswith("ghost")

    return _filter_probe(cfg.get("services", []), is_non_ghost_mariadb_user)
def check_postgres_dependents(cfg):
    """Probe every configured service whose ``db`` is ``"postgres"``."""
    def uses_postgres(service):
        return service["db"] == "postgres"

    return _filter_probe(cfg.get("services", []), uses_postgres)
def check_standalone_services(cfg):
    """Probe every configured service whose ``db`` is ``None`` (JSON null)."""
    def has_no_database(service):
        return service["db"] is None

    return _filter_probe(cfg.get("services", []), has_no_database)
def check_all_nodeports(_cfg) -> list[Issue]:
    """TCP connect to every TCP NodePort in the cluster.

    Lists all services with ``kubectl get svc -A`` and attempts a 3-second
    TCP connect to ``127.0.0.1:<nodePort>`` for each exposed port. A failed
    connect becomes one issue named ``<namespace>/<service>``. The config
    argument is unused; it exists so all checks share one signature.

    Ports whose protocol is not TCP (UDP, SCTP) are skipped: a TCP connect
    cannot tell whether anything is listening on them, so probing them would
    report a false failure on every run.
    """
    svcs = kubectl_json(["get", "svc", "-A"])
    issues = []
    for item in svcs.get("items", []):
        meta = item.get("metadata", {})
        name = f"{meta.get('namespace')}/{meta.get('name')}"
        for port in item.get("spec", {}).get("ports", []):
            np = port.get("nodePort")
            if np is None:
                continue
            # Kubernetes defaults an omitted protocol to TCP.
            if port.get("protocol", "TCP") != "TCP":
                continue
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(3)
                # connect_ex returns an errno instead of raising on failure.
                rc = s.connect_ex(("127.0.0.1", int(np)))
            if rc != 0:
                issues.append(Issue(
                    name,
                    f"NodePort {np} not accepting TCP (errno {rc})",
                    now_iso(),
                ))
    return issues
# ---------- orchestration ----------
# Ordered registry of check functions. run_all_checks() calls them in this
# order, stores each result under the function's __name__, and emits issues in
# this same order. It also looks up some results by name (e.g.
# "check_databases"), so renaming a function changes those keys too.
CHECKS = [
    check_nats,
    check_databases,
    check_ghost_blogs,
    check_mariadb_dependents,
    check_postgres_dependents,
    check_standalone_services,
    check_all_nodeports,
]
def run_all_checks(cfg: dict) -> list[Issue]:
    """Run every check in ``CHECKS`` and return all issues in check order.

    A check that raises never stops the others; the exception is turned into
    a ``healthcheck.<function name>`` issue instead. After all checks ran,
    issues of database-dependent checks get a ``root_cause`` hint when the
    matching database itself was reported down, unless they already carry one.
    """
    results: dict[str, list[Issue]] = {}
    for check in CHECKS:
        label = check.__name__
        try:
            results[label] = check(cfg)
        except Exception as exc:
            results[label] = [Issue(
                f"healthcheck.{label}",
                f"check raised: {type(exc).__name__}: {exc}",
                now_iso(),
                root_cause="healthcheck bug",
            )]

    failed_components = {
        issue.component_name for issue in results.get("check_databases", [])
    }
    # (database component, hint text, checks whose issues receive the hint)
    attributions = (
        ("mariadb", "mariadb unreachable",
         ("check_mariadb_dependents", "check_ghost_blogs")),
        ("postgres", "postgres unreachable",
         ("check_postgres_dependents",)),
    )
    for component, hint, dependent_checks in attributions:
        if component not in failed_components:
            continue
        for label in dependent_checks:
            for issue in results.get(label, []):
                if issue.root_cause is None:
                    issue.root_cause = hint

    return [
        issue
        for check in CHECKS
        for issue in results.get(check.__name__, [])
    ]
# ---------- NATS publish ----------
async def _publish(url: str, subject: str, payloads: list[bytes]) -> None:
    """Connect to NATS, publish every payload to ``subject``, then disconnect.

    The connection attempt is bounded twice: a 3-second client-side connect
    timeout with reconnects disabled, and an 8-second overall ``wait_for``.
    The connection is closed even when publishing or flushing fails.
    """
    # Imported lazily so nats-py is only required when something is published.
    import nats  # type: ignore[import-not-found]

    pending_connection = nats.connect(url, connect_timeout=3, allow_reconnect=False)  # type: ignore[attr-defined]
    client = await asyncio.wait_for(pending_connection, timeout=8)
    try:
        for message in payloads:
            await client.publish(subject, message)
        await client.flush()
    finally:
        await client.close()
def publish_issues(issues: list[Issue], cfg: dict) -> None:
    """Publish each issue as a UTF-8 JSON message; do nothing when empty.

    The NATS URL and subject are read from ``cfg["nats"]``. Errors raised by
    the publish coroutine propagate to the caller.
    """
    if not issues:
        return
    encoded = []
    for issue in issues:
        encoded.append(json.dumps(issue.to_dict()).encode("utf-8"))
    nats_cfg = cfg["nats"]
    asyncio.run(_publish(nats_cfg["url"], nats_cfg["subject"], encoded))
# ---------- entry ----------
def load_config(path: str) -> dict:
    """Read the JSON config file at ``path`` and return the parsed content.

    The file is always decoded as UTF-8 so the result does not depend on the
    locale of the environment the checker runs in.

    Raises:
        OSError: the file cannot be opened.
        json.JSONDecodeError: the file is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
def main(argv=None) -> int:
    """Command-line entry point: run all checks and report the issues.

    Every issue is logged as a warning. With ``--dry-run`` the issues are
    also printed as JSON lines and nothing is published; otherwise they are
    published to NATS, and a publish failure is logged rather than raised.

    Returns:
        0 when no issues were found, 1 otherwise.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S%z",
    )
    parser = argparse.ArgumentParser()
    parser.add_argument("--config",
                        default="/home/samantha/homelab-health/checks.json")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print issues to stdout; do not publish to NATS")
    options = parser.parse_args(argv)

    config = load_config(options.config)
    found = run_all_checks(config)

    serialized = []
    for issue in found:
        line = json.dumps(issue.to_dict())
        log.warning("issue: %s", line)
        serialized.append(line)

    if not found:
        log.info("all checks green")
        return 0

    if options.dry_run:
        for line in serialized:
            print(line)
    else:
        try:
            publish_issues(found, config)
        except Exception as exc:
            log.error("NATS publish failed: %s (issues not delivered)", exc)
    # Any detected issue makes the run exit non-zero, delivered or not.
    return 1
# Script entry point: main()'s return value becomes the process exit status
# (0 when no issues were found, 1 otherwise).
if __name__ == "__main__":
    sys.exit(main())

45
k3s/health/checks.json Normal file
View file

@ -0,0 +1,45 @@
{
"nats": {
"url": "nats://10.0.0.6:32386",
"subject": "homelab_health_issue",
"monitoring_nodeport": 32388
},
"databases": [
{
"name": "postgres",
"namespace": "default",
"pod_label": "app=postgres",
"probe_cmd": ["pg_isready", "-U", "postgres"]
},
{
"name": "mariadb",
"namespace": "default",
"pod_label": "app=mariadb",
"probe_cmd": ["mariadb-admin", "ping", "--silent"]
}
],
"services": [
{"name": "ghost1", "namespace": "default", "db": "mariadb",
"probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
{"name": "ghost2", "namespace": "default", "db": "mariadb",
"probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
{"name": "ghost3", "namespace": "default", "db": "mariadb",
"probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
{"name": "mediawiki", "namespace": "default", "db": "mariadb",
"probe_path": "/", "expected": [200, 301, 302]},
{"name": "forgejo", "namespace": "default", "db": "postgres",
"probe_path": "/api/healthz", "expected": [200]},
{"name": "authentik", "namespace": "default", "db": "postgres",
"probe_path": "/-/health/live/", "expected": [200, 204]},
{"name": "listmonk", "namespace": "default", "db": "postgres",
"probe_path": "/", "expected": [200, 302]},
{"name": "n8n", "namespace": "default", "db": "postgres",
"probe_path": "/healthz", "expected": [200]},
{"name": "mattermost", "namespace": "default", "db": "postgres",
"probe_path": "/api/v4/system/ping", "expected": [200]},
{"name": "vaultwarden", "namespace": "default", "db": null,
"probe_path": "/alive", "expected": [200]},
{"name": "garage-webui", "namespace": "default", "db": null,
"probe_path": "/", "expected": [200, 302]}
]
}

View file

@ -0,0 +1,319 @@
# Homelab Health — Internal Checks Design
**Status: design approved 2026-04-20. Ready to write implementation plan.**
---
## Resume Notes (for next session)
You and I brainstormed this design across one session. All design questions answered,
all three design sections approved. The next step per the brainstorming skill is:
1. **This file exists** — design committed (or ready to commit).
2. **Next action:** invoke `superpowers:writing-plans` to turn this design into a
step-by-step implementation plan.
3. After the implementation plan is written, execute it (writing-plans → executing-plans).
Do NOT re-open any design decisions in the new session unless something here is
obviously wrong; the decisions below are settled.
**Test canary:** when verifying the installed system end-to-end, break **mediawiki**
(e.g. scale to 0 replicas), not the Ghost blogs. Ghosts are production, MediaWiki is
expendable for a "does the alert fire" test.
---
## Goals
Add a second layer of cluster health monitoring that runs **inside** the K3s cluster
and reports structural / semantic problems to NATS. The existing
`k3s/scripts/check-health.sh` (workstation-driven canary) stays in place unchanged.
Requirements, as given:
0. NATS itself up
1. MariaDB up
2. PostgreSQL up
3. Internal Ghost blog ports respond to HTTP correctly
4. All other services depending on MariaDB respond correctly
5. All services depending on PostgreSQL respond correctly
6. Something is listening at every NodePort
Plus implicit: standalone services (Vaultwarden, Garage, etc.) also get probed.
**Output contract:** publish NATS messages on subject `homelab_health_issue` with
JSON body:
```json
{
"component_name": "<str>",
"issue_detail": "<str>",
"detected_at": "<ISO8601 timestamp>",
"root_cause": "<optional str>"
}
```
---
## Decisions (settled)
| Decision | Choice | Why |
|---|---|---|
| Where it runs | systemd timer on **pve-control** | Master K3s control node; kubectl locally; always on. |
| Language | **Python 3** | User expertise; structured JSON; clean error handling. |
| HTTP probes | `requests` library | No subprocess per probe; in-process. |
| NATS publish | `nats-py` library | In-process; one cohesive Python process. |
| kubectl use | **subprocess** (kept for now) | Only two call sites; revisit later with `kubernetes` client. |
| DB auth for probes | **sidestepped** | Use `kubectl exec <pod> -- pg_isready` / `mariadb-admin ping`; no creds on pve-control. |
| Orchestration | Single script, one function per check category | Simple; matches "one function per check" ask. |
| Schedule | Every **10 minutes** | User said no more frequent than that. |
| Deduplication | **Stateless** | Re-fires every tick while failing; consumer handles aggregation. |
| Healthy publishes | **None** | Silent when OK. Only problems on the wire. |
| Recovery events | **None** | Reports stop when fixed; absence = healthy. |
| Service config | **JSON file** (`checks.json`) | Pythonic; easy to edit/commit; lives alongside `checker.py`. |
| NodePort discovery | **Live from `kubectl get svc -A -o json`** | Source of truth is the cluster; no drift. |
| NATS-down fallback | **stdout + non-zero exit** | Workstation canary + `systemctl status` surface failures. Future leaf/LAN NATS fallback via env var hook (deferred). |
---
## Architecture
**Deployment layout on pve-control:**
```
/opt/homelab-health/
├── checker.py # Python entrypoint, one function per check
├── checks.json # service catalog + NATS/DB config
├── venv/ # virtualenv with nats-py, requests
/etc/systemd/system/
├── homelab-health.service
└── homelab-health.timer
```
**Source of truth in repo:**
```
k3s/health/
├── home_lab_health.md # this file
├── checker.py
├── checks.json
├── requirements.txt # nats-py, requests
├── install.sh # runs on pve-control, sets up venv + units
├── homelab-health.service
├── homelab-health.timer
└── tests/
└── test_checks.py
```
**Runtime flow each tick:**
1. Load `checks.json`.
2. Connect to NATS with a 3s timeout. On failure: log loud, still run checks, publish nothing, exit 1.
3. Run each check function in sequence, each wrapped in `try/except`; exceptions in one check never stop the others (they become a `healthcheck.<fn>` meta-issue).
4. Each check returns `list[Issue]`. Main loop aggregates.
5. Log every issue to stdout (journal).
6. For each issue, publish one NATS message to `homelab_health_issue`.
7. Exit 0 if zero issues, 1 otherwise. `systemctl status` + journalctl give humans visibility.
---
## Config schema (`checks.json`)
```json
{
"nats": {
"url": "nats://nats.default.svc.cluster.local:4222",
"subject": "homelab_health_issue",
"monitoring_nodeport": 32388
},
"databases": [
{
"name": "postgres",
"namespace": "default",
"pod_label": "app=postgres",
"probe_cmd": ["pg_isready", "-U", "postgres"]
},
{
"name": "mariadb",
"namespace": "default",
"pod_label": "app=mariadb",
"probe_cmd": ["mariadb-admin", "ping", "--silent"]
}
],
"services": [
{"name": "ghost1", "namespace": "fulfillment", "db": "mariadb",
"probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
{"name": "ghost2", "namespace": "fulfillment", "db": "mariadb",
"probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
{"name": "ghost3", "namespace": "fulfillment", "db": "mariadb",
"probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
{"name": "mediawiki", "namespace": "default", "db": "mariadb",
"probe_path": "/", "expected": [200, 302]},
{"name": "forgejo", "namespace": "sjasoft", "db": "postgres",
"probe_path": "/api/healthz", "expected": [200]},
{"name": "authentik-server", "namespace": "default", "db": "postgres",
"probe_path": "/-/health/live/", "expected": [200, 204]},
{"name": "listmonk", "namespace": "default", "db": "postgres",
"probe_path": "/api/health", "expected": [200]},
{"name": "n8n", "namespace": "default", "db": "postgres",
"probe_path": "/healthz", "expected": [200]},
{"name": "mattermost", "namespace": "default", "db": "postgres",
"probe_path": "/api/v4/system/ping", "expected": [200]},
{"name": "vaultwarden", "namespace": "default", "db": null,
"probe_path": "/alive", "expected": [200]},
{"name": "garage", "namespace": "default", "db": null,
"probe_path": "/health", "expected": [200]},
{"name": "garage-webui", "namespace": "default", "db": null,
"probe_path": "/", "expected": [200, 302]}
]
}
```
**Probe URL resolution:** at runtime, run `kubectl get svc -n <ns> <name> -o json`,
extract the first `.spec.ports[].nodePort`, and probe `http://localhost:<nodeport><probe_path>`.
**Per-service silence:** add `"disabled": true` to a service entry to skip it without
deleting it.
**Verify actual probe paths during implementation** — the paths above are reasonable
defaults but each needs a quick curl sanity check. Specifically double-check:
Authentik (`/-/health/live/` vs `/-/health/ready/`), Garage (root `/health` endpoint),
Vaultwarden (`/alive` returns 200 plain-text timestamp — confirmed), n8n (`/healthz`).
---
## Check catalog
One function per requirement, sharing an internal `probe_service(svc_cfg)` helper.
| Function | Covers | Mechanism |
|---|---|---|
| `check_nats()` | #0 | `kubectl exec` NATS pod to run `nats server check connection`; fallback HTTP GET `localhost:<monitoring_nodeport>/healthz` |
| `check_postgres()` | #2 | `kubectl exec` postgres pod to run `pg_isready -U postgres` |
| `check_mariadb()` | #1 | `kubectl exec` mariadb pod to run `mariadb-admin ping --silent` |
| `check_ghost_blogs()` | #3 | `probe_service` for every service whose name starts with `ghost` |
| `check_mariadb_dependents()` | #4 | `probe_service` for every non-ghost service where `db == "mariadb"` |
| `check_postgres_dependents()` | #5 | `probe_service` for every service where `db == "postgres"` |
| `check_standalone_services()` | implicit | `probe_service` for every service where `db == null` |
| `check_all_nodeports()` | #6 | `kubectl get svc -A -o json`; for every `nodePort`, TCP connect `localhost:<nodeport>`; failure = nothing listening |
**`probe_service(svc)`:** resolves NodePort via kubectl, calls
`requests.get(f"http://localhost:{nodeport}{svc['probe_path']}", timeout=10)`,
compares status to `expected`, returns an `Issue` on mismatch or on exception.
**Root-cause hints in payload:** if `check_mariadb()` produced an issue this run,
any `check_mariadb_dependents()` failure gets `"root_cause": "mariadb unreachable"`.
Same pattern for postgres. Decorative — consumers decide what to do with it.
---
## Error handling
```python
def run_all_checks(cfg) -> list[Issue]:
issues = []
for fn in [check_nats, check_postgres, check_mariadb,
check_ghost_blogs, check_mariadb_dependents,
check_postgres_dependents, check_standalone_services,
check_all_nodeports]:
try:
issues.extend(fn(cfg))
except Exception as e:
issues.append(Issue(
component_name=f"healthcheck.{fn.__name__}",
issue_detail=f"check function raised: {type(e).__name__}: {e}",
detected_at=now_iso(),
root_cause="healthcheck bug or missing dependency"))
return issues
```
- No single check can halt the pipeline.
- A NATS connect failure is logged loudly; checks still run. Individual publish
failures are logged but don't stop the rest.
- `Issue` is a small dataclass; `to_dict()` serialises to the exact NATS payload schema.
---
## Deployment
**`install.sh` (run once on pve-control as samantha, with sudo where needed):**
```bash
set -euo pipefail
sudo mkdir -p /opt/homelab-health
sudo rsync -a --delete ./ /opt/homelab-health/ --exclude=install.sh --exclude=tests
sudo chown -R samantha:samantha /opt/homelab-health
python3 -m venv /opt/homelab-health/venv
/opt/homelab-health/venv/bin/pip install -r /opt/homelab-health/requirements.txt
sudo cp homelab-health.service homelab-health.timer /etc/systemd/system/
sudo systemctl daemon-reload
sudo systemctl enable --now homelab-health.timer
```
**`homelab-health.service`:**
```ini
[Unit]
Description=Homelab internal health checks
After=network-online.target
[Service]
Type=oneshot
User=samantha
ExecStart=/opt/homelab-health/venv/bin/python /opt/homelab-health/checker.py
StandardOutput=journal
StandardError=journal
```
**`homelab-health.timer`:**
```ini
[Unit]
Description=Run homelab health checks every 10 minutes
[Timer]
OnCalendar=*:0/10
Persistent=true
[Install]
WantedBy=timers.target
```
---
## Testing
**Unit tests** (`tests/test_checks.py`, pytest):
- Each check function takes a config object — easily stubbed.
- `probe_service` accepts an injected HTTP client so tests don't hit real services.
- Mock `subprocess.run` for kubectl calls.
- Assert the exact `Issue` list returned for each failure shape.
**Manual smoke test** — `checker.py --dry-run` logs all issues to stdout but skips
NATS publish. Run ad-hoc on pve-control during development.
**End-to-end verification after install:**
1. `systemctl list-timers homelab-health.timer` shows next fire time.
2. Manually fire once: `sudo systemctl start homelab-health.service`.
3. `journalctl -u homelab-health -n 200` shows outcome.
4. On workstation: `nats sub homelab_health_issue` (against the cluster NATS).
5. Break **mediawiki** (`kubectl scale deploy mediawiki -n default --replicas=0`) and
wait ≤10 min — expect a message on the subject, with `component_name:"mediawiki"`.
6. Restore (`--replicas=1`) and confirm alerts stop on the next tick.
---
## Open items / future
- **Leaf/LAN NATS fallback:** add `FALLBACK_NATS_URL` env-var hook in `checker.py`
(unused for now). When the leaf NATS comes online, publish there too on connect
failure to primary.
- **NATS auth:** current assumption is local anonymous publish is allowed. If auth is
added, introduce a `nats.creds_path` field in `checks.json` pointing at a creds
file on pve-control.
- **k8s Python client migration:** replace the two remaining `kubectl` subprocess
calls with the `kubernetes` library for a fully in-process script.
- **Recovery events:** if downstream consumers want a "resolved" signal, add a small
local state file (JSON on disk) to detect transitions and publish recovery events.
- **Per-namespace grouping:** not needed now; if service list grows beyond ~25,
reconsider organizing `checks.json` by namespace for readability.

View file

@ -0,0 +1,14 @@
[Unit]
Description=Homelab internal health checks
After=network-online.target
Wants=network-online.target
[Service]
Type=oneshot
User=samantha
WorkingDirectory=/home/samantha/homelab-health
Environment=KUBECONFIG=/home/samantha/.kube/config
Environment=PATH=/usr/local/bin:/usr/bin:/bin
ExecStart=/home/samantha/homelab-health/venv/bin/python /home/samantha/homelab-health/checker.py --config /home/samantha/homelab-health/checks.json
StandardOutput=journal
StandardError=journal

View file

@ -0,0 +1,11 @@
[Unit]
Description=Run homelab health checks every 10 minutes
[Timer]
OnBootSec=2min
OnUnitActiveSec=10min
Persistent=true
AccuracySec=30s
[Install]
WantedBy=timers.target

View file

@ -0,0 +1,2 @@
nats-py==2.9.0
requests==2.32.3