Add homelab internal health checker

Python checker runs on pve-control via systemd timer every 10 min, publishes issues to NATS subject homelab_health_issue. Checks NATS, Postgres, MariaDB, Ghost blogs, DB dependents, standalone services, and every NodePort. Silent when healthy. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-20 15:48:07 -04:00 · 2026-04-20 15:48:07 -04:00 · 58bfd422d4
commit 58bfd422d4
parent 6364f70799
7 changed files with 704 additions and 0 deletions
--- a/.opencode/opencode.json
+++ b/.opencode/opencode.json
@ -0,0 +1,5 @@
 {
  "plugin": [
    "web"
  ]
 }
--- a/k3s/health/checker.py
+++ b/k3s/health/checker.py
@ -0,0 +1,308 @@
 """Homelab internal health checker.
 Runs on pve-control every 10 minutes via systemd timer. Publishes issue
 events to NATS subject `homelab_health_issue`. Silent when healthy.
 See home_lab_health.md for design.
 """
 from __future__ import annotations
 import argparse
 import asyncio
 import json
 import logging
 import socket
 import subprocess
 import sys
 from dataclasses import asdict, dataclass
 from datetime import datetime, timezone
 from typing import Optional, Sequence
 import requests
 # Module-wide logger; main() installs the handler and format via logging.basicConfig.
 log = logging.getLogger("homelab-health")
 # ---------- payload ----------
@dataclass
 class Issue:
    component_name: str
    issue_detail: str
    detected_at: str
    root_cause: Optional[str] = None
    def to_dict(self) -> dict:
        d = asdict(self)
        if d["root_cause"] is None:
            del d["root_cause"]
        return d
 def now_iso() -> str:
     """Return the current UTC time as an ISO 8601 string truncated to whole seconds."""
     current = datetime.now(timezone.utc)
     return current.replace(microsecond=0).isoformat()
 # ---------- kubectl helpers ----------
 def kubectl_json(args: Sequence[str], timeout: int = 15) -> dict:
     """Run ``kubectl <args> -o json`` and return the decoded JSON document.
     Raises RuntimeError carrying kubectl's stderr when the exit status is
     non-zero. subprocess.TimeoutExpired is not caught here.
     """
     proc = subprocess.run(
         ["kubectl", *args, "-o", "json"],
         capture_output=True,
         text=True,
         timeout=timeout,
     )
     if proc.returncode == 0:
         return json.loads(proc.stdout)
     raise RuntimeError(f"kubectl {' '.join(args)}: {proc.stderr.strip()}")
@dataclass
 class ExecResult:
     """Outcome of a command run through kubectl_exec()."""
     ok: bool  # True only when the command exited with status 0
     stdout: str  # captured stdout; empty when pod lookup or exec did not complete
     stderr: str  # stripped stderr, or a synthesized message for lookup/timeout failures
 def kubectl_exec(namespace: str, pod_label: str, command: Sequence[str],
                  timeout: int = 10) -> ExecResult:
     """Run ``command`` inside the first pod matching ``pod_label``.
     The pod name is looked up first (5 s limit); the command is then run with
     ``kubectl exec`` under ``timeout`` seconds. Lookup failures and timeouts
     are reported as a failed ExecResult rather than raised.
     """
     find_pod = ["kubectl", "get", "pod", "-n", namespace, "-l", pod_label,
                 "-o", "jsonpath={.items[0].metadata.name}"]
     try:
         lookup = subprocess.run(find_pod, capture_output=True, text=True, timeout=5)
     except subprocess.TimeoutExpired:
         return ExecResult(False, "", "pod lookup timed out")
     pod = lookup.stdout.strip()
     if lookup.returncode != 0 or not pod:
         return ExecResult(False, "", f"no pod matched {pod_label} in {namespace}")
     run_in_pod = ["kubectl", "exec", "-n", namespace, pod, "--", *command]
     try:
         done = subprocess.run(run_in_pod, capture_output=True, text=True, timeout=timeout)
     except subprocess.TimeoutExpired:
         return ExecResult(False, "", f"kubectl exec timed out after {timeout}s")
     return ExecResult(done.returncode == 0, done.stdout, done.stderr.strip())
 def resolve_nodeport(namespace: str, service: str) -> Optional[int]:
     """Return the first NodePort declared on ``service``, or None if it has none.
     Errors raised by kubectl_json() propagate to the caller.
     """
     doc = kubectl_json(["get", "svc", "-n", namespace, service])
     ports = doc.get("spec", {}).get("ports", [])
     with_nodeport = [entry for entry in ports if "nodePort" in entry]
     if not with_nodeport:
         return None
     return int(with_nodeport[0]["nodePort"])
 # ---------- probes ----------
 def probe_service(svc: dict) -> list[Issue]:
     """HTTP-probe one service through its NodePort on localhost.
     Returns an empty list when the entry is disabled or the status code is in
     ``svc["expected"]``; otherwise a single-element list describing the
     failure (kubectl error, missing NodePort, request error, or bad status).
     """
     if svc.get("disabled"):
         return []
     name = svc["name"]
     def problem(detail):
         # Every failure path reports exactly one Issue for this service.
         return [Issue(name, detail, now_iso())]
     try:
         nodeport = resolve_nodeport(svc["namespace"], name)
     except Exception as exc:
         return problem(f"kubectl failed: {exc}")
     if nodeport is None:
         return problem("no NodePort exposed")
     url = f"http://localhost:{nodeport}{svc['probe_path']}"
     try:
         # Redirects are not followed, so a 3xx status is compared as-is.
         resp = requests.get(url, timeout=10, allow_redirects=False)
     except requests.RequestException as exc:
         return problem(f"probe error at {url}: {exc}")
     if resp.status_code not in svc["expected"]:
         return problem(f"HTTP {resp.status_code} at {url} (expected {svc['expected']})")
     return []
 # ---------- check functions ----------
 def check_nats(cfg: dict) -> list[Issue]:
     """GET ``/healthz`` on the NATS monitoring NodePort; any non-200 or request error is an issue."""
     endpoint = f"http://localhost:{cfg['nats']['monitoring_nodeport']}/healthz"
     try:
         status = requests.get(endpoint, timeout=5).status_code
     except requests.RequestException as exc:
         return [Issue("nats", f"monitoring unreachable: {exc}", now_iso())]
     if status == 200:
         return []
     return [Issue("nats", f"/healthz returned {status}", now_iso())]
 def check_databases(cfg: dict) -> list[Issue]:
     """Run each configured database's probe command in its pod; one Issue per failed probe."""
     failures: list[Issue] = []
     for entry in cfg.get("databases", []):
         outcome = kubectl_exec(entry["namespace"], entry["pod_label"], entry["probe_cmd"])
         if outcome.ok:
             continue
         reason = outcome.stderr or '(no stderr)'
         failures.append(Issue(entry["name"], f"liveness probe failed: {reason}", now_iso()))
     return failures
 def _filter_probe(services, pred) -> list[Issue]:
    out = []
    for s in services:
        if pred(s):
            out.extend(probe_service(s))
    return out
 def check_ghost_blogs(cfg):
     """Probe every configured service whose name starts with ``ghost``."""
     def is_ghost(service):
         return service["name"].startswith("ghost")
     return _filter_probe(cfg.get("services", []), is_ghost)
 def check_mariadb_dependents(cfg):
     """Probe every non-ghost service whose ``db`` is ``"mariadb"``.
     A service entry without a ``db`` key is simply not a MariaDB dependent;
     it no longer raises KeyError and aborts the probes for every other service.
     """
     return _filter_probe(cfg.get("services", []),
                          lambda s: s.get("db") == "mariadb" and not s["name"].startswith("ghost"))
 def check_postgres_dependents(cfg):
     """Probe every service whose ``db`` is ``"postgres"``.
     A service entry without a ``db`` key is simply not a PostgreSQL dependent;
     it no longer raises KeyError and aborts the probes for every other service.
     """
     return _filter_probe(cfg.get("services", []),
                          lambda s: s.get("db") == "postgres")
 def check_standalone_services(cfg):
     """Probe every service that has no database dependency.
     Both ``"db": null`` and a missing ``db`` key count as standalone, so an
     entry that omits the key is still probed instead of raising KeyError.
     """
     return _filter_probe(cfg.get("services", []),
                          lambda s: s.get("db") is None)
 def check_all_nodeports(_cfg) -> list[Issue]:
     """TCP connect to every TCP NodePort in the cluster.
     Services are listed live with ``kubectl get svc -A``. Ports whose
     ``protocol`` is not TCP (UDP, SCTP) are skipped: a TCP connect cannot
     tell whether anything is listening on them, so they would be reported as
     down even when healthy. A port without a ``protocol`` field is treated
     as TCP, which is the Kubernetes default.
     Returns one Issue per NodePort whose connect attempt fails. Errors from
     kubectl_json() propagate to the caller.
     """
     svcs = kubectl_json(["get", "svc", "-A"])
     issues = []
     for item in svcs.get("items", []):
         meta = item.get("metadata", {})
         name = f"{meta.get('namespace')}/{meta.get('name')}"
         # ``or []`` tolerates a service whose ports field is present but null.
         for port in item.get("spec", {}).get("ports") or []:
             np = port.get("nodePort")
             if np is None:
                 continue
             if port.get("protocol", "TCP") != "TCP":
                 continue
             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                 s.settimeout(3)
                 # connect_ex returns an errno value instead of raising on failure.
                 rc = s.connect_ex(("127.0.0.1", int(np)))
             if rc != 0:
                 issues.append(Issue(
                     name,
                     f"NodePort {np} not accepting TCP (errno {rc})",
                     now_iso(),
                 ))
     return issues
 # ---------- orchestration ----------
 # Check functions in execution order. run_all_checks() keys its per-check
 # result buckets by each function's __name__ and emits issues in this order.
 CHECKS = [
     check_nats,
     check_databases,
     check_ghost_blogs,
     check_mariadb_dependents,
     check_postgres_dependents,
     check_standalone_services,
     check_all_nodeports,
 ]
 def run_all_checks(cfg: dict) -> list[Issue]:
     """Run every function in CHECKS and return all issues in CHECKS order.
     A check that raises is replaced by a single ``healthcheck.<name>`` issue,
     so one broken check never stops the others. When the database check
     reported ``mariadb`` or ``postgres``, issues from the dependent checks
     that have no root cause yet are tagged with the matching hint.
     """
     results: dict[str, list[Issue]] = {}
     for check in CHECKS:
         key = check.__name__
         try:
             results[key] = check(cfg)
         except Exception as exc:
             results[key] = [Issue(
                 f"healthcheck.{key}",
                 f"check raised: {type(exc).__name__}: {exc}",
                 now_iso(),
                 root_cause="healthcheck bug",
             )]
     failed_dbs = {issue.component_name for issue in results.get("check_databases", [])}
     # (database name, hint to attach, buckets holding that database's dependents)
     hints = (
         ("mariadb", "mariadb unreachable",
          ("check_mariadb_dependents", "check_ghost_blogs")),
         ("postgres", "postgres unreachable",
          ("check_postgres_dependents",)),
     )
     for db_name, hint, bucket_names in hints:
         if db_name not in failed_dbs:
             continue
         for bucket in bucket_names:
             for issue in results.get(bucket, []):
                 if issue.root_cause is None:
                     issue.root_cause = hint
     return [issue for check in CHECKS for issue in results.get(check.__name__, [])]
 # ---------- NATS publish ----------
 async def _publish(url: str, subject: str, payloads: list[bytes]) -> None:
     """Connect to NATS at ``url``, publish each payload on ``subject``, flush, then close.
     The connect attempt is bounded by an 8 s overall timeout and reconnects
     are disabled. The connection is closed even if publishing fails.
     """
     import nats  # type: ignore[import-not-found]
     connecting = nats.connect(url, connect_timeout=3, allow_reconnect=False)  # type: ignore[attr-defined]
     client = await asyncio.wait_for(connecting, timeout=8)
     try:
         for message in payloads:
             await client.publish(subject, message)
         await client.flush()
     finally:
         await client.close()
 def publish_issues(issues: list[Issue], cfg: dict) -> None:
     """Publish each issue as a UTF-8 JSON message to the configured NATS subject.
     Does nothing when ``issues`` is empty. Connection and publish errors propagate.
     """
     if issues:
         encoded = [json.dumps(issue.to_dict()).encode("utf-8") for issue in issues]
         nats_cfg = cfg["nats"]
         asyncio.run(_publish(nats_cfg["url"], nats_cfg["subject"], encoded))
 # ---------- entry ----------
 def load_config(path: str) -> dict:
     """Read the JSON config file at ``path`` and return the decoded object.
     The file is decoded as UTF-8 explicitly so the result does not depend on
     the locale of the process running the checker.
     Raises OSError if the file cannot be opened and json.JSONDecodeError if
     its content is not valid JSON.
     """
     with open(path, encoding="utf-8") as f:
         return json.load(f)
 def main(argv=None) -> int:
     """Run all checks once and report the result.
     Returns 0 when no issues were found and 1 otherwise, including when the
     NATS publish fails. With ``--dry-run`` the issues are printed to stdout
     as JSON lines instead of being published.
     """
     logging.basicConfig(level=logging.INFO,
                         format="%(asctime)s %(levelname)s %(message)s",
                         datefmt="%Y-%m-%dT%H:%M:%S%z")
     parser = argparse.ArgumentParser()
     parser.add_argument("--config", default="/home/samantha/homelab-health/checks.json")
     parser.add_argument("--dry-run", action="store_true",
                         help="Print issues to stdout; do not publish to NATS")
     options = parser.parse_args(argv)
     cfg = load_config(options.config)
     found = run_all_checks(cfg)
     if not found:
         log.info("all checks green")
         return 0
     serialized = [json.dumps(issue.to_dict()) for issue in found]
     for line in serialized:
         log.warning("issue: %s", line)
     if options.dry_run:
         for line in serialized:
             print(line)
     else:
         try:
             publish_issues(found, cfg)
         except Exception as exc:
             log.error("NATS publish failed: %s (issues not delivered)", exc)
     # Any run that found issues exits non-zero, whether or not they were delivered.
     return 1
 # Script entry point: main()'s return value becomes the process exit status.
 if __name__ == "__main__":
     sys.exit(main())
--- a/k3s/health/checks.json
+++ b/k3s/health/checks.json
@ -0,0 +1,45 @@
 {
  "nats": {
    "url": "nats://10.0.0.6:32386",
    "subject": "homelab_health_issue",
    "monitoring_nodeport": 32388
  },
  "databases": [
    {
      "name": "postgres",
      "namespace": "default",
      "pod_label": "app=postgres",
      "probe_cmd": ["pg_isready", "-U", "postgres"]
    },
    {
      "name": "mariadb",
      "namespace": "default",
      "pod_label": "app=mariadb",
      "probe_cmd": ["mariadb-admin", "ping", "--silent"]
    }
  ],
  "services": [
    {"name": "ghost1", "namespace": "default", "db": "mariadb",
     "probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
    {"name": "ghost2", "namespace": "default", "db": "mariadb",
     "probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
    {"name": "ghost3", "namespace": "default", "db": "mariadb",
     "probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
    {"name": "mediawiki", "namespace": "default", "db": "mariadb",
     "probe_path": "/", "expected": [200, 301, 302]},
    {"name": "forgejo", "namespace": "default", "db": "postgres",
     "probe_path": "/api/healthz", "expected": [200]},
    {"name": "authentik", "namespace": "default", "db": "postgres",
     "probe_path": "/-/health/live/", "expected": [200, 204]},
    {"name": "listmonk", "namespace": "default", "db": "postgres",
     "probe_path": "/", "expected": [200, 302]},
    {"name": "n8n", "namespace": "default", "db": "postgres",
     "probe_path": "/healthz", "expected": [200]},
    {"name": "mattermost", "namespace": "default", "db": "postgres",
     "probe_path": "/api/v4/system/ping", "expected": [200]},
    {"name": "vaultwarden", "namespace": "default", "db": null,
     "probe_path": "/alive", "expected": [200]},
    {"name": "garage-webui", "namespace": "default", "db": null,
     "probe_path": "/", "expected": [200, 302]}
  ]
 }
--- a/k3s/health/home_lab_health.md
+++ b/k3s/health/home_lab_health.md
@ -0,0 +1,319 @@
 # Homelab Health — Internal Checks Design
 **Status: design approved 2026-04-20. Ready to write implementation plan.**
 ---
 ## Resume Notes (for next session)
 You and I brainstormed this design across one session. All design questions answered,
 all three design sections approved. The next step per the brainstorming skill is:
 1. **This file exists** — design committed (or ready to commit).
 2. **Next action:** invoke `superpowers:writing-plans` to turn this design into a
   step-by-step implementation plan.
 3. After the implementation plan is written, execute it (writing-plans → executing-plans).
 Do NOT re-open any design decisions in the new session unless something here is
 obviously wrong; the decisions below are settled.
 **Test canary:** when verifying the installed system end-to-end, break **mediawiki**
 (e.g. scale to 0 replicas), not the Ghost blogs. Ghosts are production, MediaWiki is
 expendable for a "does the alert fire" test.
 ---
 ## Goals
 Add a second layer of cluster health monitoring that runs **inside** the K3s cluster
 and reports structural / semantic problems to NATS. The existing
 `k3s/scripts/check-health.sh` (workstation-driven canary) stays in place unchanged.
 Requirements, as given:
 0. NATS itself up
 1. MariaDB up
 2. PostgreSQL up
 3. Internal Ghost blog ports respond to HTTP correctly
 4. All other services depending on MariaDB respond correctly
 5. All services depending on PostgreSQL respond correctly
 6. Something is listening at every NodePort
 Plus implicit: standalone services (Vaultwarden, Garage, etc.) also get probed.
 **Output contract:** publish NATS messages on subject `homelab_health_issue` with
 JSON body:
 ```json
 {
  "component_name": "<str>",
  "issue_detail": "<str>",
  "detected_at": "<ISO8601 timestamp>",
  "root_cause": "<optional str>"
 }
 ```
 ---
 ## Decisions (settled)
 | Decision | Choice | Why |
 |---|---|---|
 | Where it runs | systemd timer on **pve-control** | Master K3s control node; kubectl locally; always on. |
 | Language | **Python 3** | User expertise; structured JSON; clean error handling. |
 | HTTP probes | `requests` library | No subprocess per probe; in-process. |
 | NATS publish | `nats-py` library | In-process; one cohesive Python process. |
 | kubectl use | **subprocess** (kept for now) | Only two call sites; revisit later with `kubernetes` client. |
 | DB auth for probes | **sidestepped** | Use `kubectl exec <pod> -- pg_isready` / `mariadb-admin ping`; no creds on pve-control. |
 | Orchestration | Single script, one function per check category | Simple; matches "one function per check" ask. |
 | Schedule | Every **10 minutes** | User said no more frequent than that. |
 | Deduplication | **Stateless** | Re-fires every tick while failing; consumer handles aggregation. |
 | Healthy publishes | **None** | Silent when OK. Only problems on the wire. |
 | Recovery events | **None** | Reports stop when fixed; absence = healthy. |
 | Service config | **JSON file** (`checks.json`) | Pythonic; easy to edit/commit; lives alongside `checker.py`. |
 | NodePort discovery | **Live from `kubectl get svc -A -o json`** | Source of truth is the cluster; no drift. |
 | NATS-down fallback | **stdout + non-zero exit** | Workstation canary + `systemctl status` surface failures. Future leaf/LAN NATS fallback via env var hook (deferred). |
 ---
 ## Architecture
 **Deployment layout on pve-control:**
 ```
 /opt/homelab-health/
 ├── checker.py            # Python entrypoint, one function per check
 ├── checks.json           # service catalog + NATS/DB config
 └── venv/                 # virtualenv with nats-py, requests
 /etc/systemd/system/
 ├── homelab-health.service
 └── homelab-health.timer
 ```
 **Source of truth in repo:**
 ```
 k3s/health/
 ├── home_lab_health.md         # this file
 ├── checker.py
 ├── checks.json
 ├── requirements.txt           # nats-py, requests
 ├── install.sh                 # runs on pve-control, sets up venv + units
 ├── homelab-health.service
 ├── homelab-health.timer
 └── tests/
    └── test_checks.py
 ```
 **Runtime flow each tick:**
 1. Load `checks.json`.
 2. Connect to NATS with a 3s timeout. On failure: log loud, still run checks, publish nothing, exit 1.
 3. Run each check function in sequence, each wrapped in `try/except`; exceptions in one check never stop the others (they become a `healthcheck.<fn>` meta-issue).
 4. Each check returns `list[Issue]`. Main loop aggregates.
 5. Log every issue to stdout (journal).
 6. For each issue, publish one NATS message to `homelab_health_issue`.
 7. Exit 0 if zero issues, 1 otherwise. `systemctl status` + journalctl give humans visibility.
 ---
 ## Config schema (`checks.json`)
 ```json
 {
  "nats": {
    "url": "nats://nats.default.svc.cluster.local:4222",
    "subject": "homelab_health_issue",
    "monitoring_nodeport": 32388
  },
  "databases": [
    {
      "name": "postgres",
      "namespace": "default",
      "pod_label": "app=postgres",
      "probe_cmd": ["pg_isready", "-U", "postgres"]
    },
    {
      "name": "mariadb",
      "namespace": "default",
      "pod_label": "app=mariadb",
      "probe_cmd": ["mariadb-admin", "ping", "--silent"]
    }
  ],
  "services": [
    {"name": "ghost1", "namespace": "fulfillment", "db": "mariadb",
     "probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
    {"name": "ghost2", "namespace": "fulfillment", "db": "mariadb",
     "probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
    {"name": "ghost3", "namespace": "fulfillment", "db": "mariadb",
     "probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
    {"name": "mediawiki", "namespace": "default", "db": "mariadb",
     "probe_path": "/", "expected": [200, 302]},
    {"name": "forgejo", "namespace": "sjasoft", "db": "postgres",
     "probe_path": "/api/healthz", "expected": [200]},
    {"name": "authentik-server", "namespace": "default", "db": "postgres",
     "probe_path": "/-/health/live/", "expected": [200, 204]},
    {"name": "listmonk", "namespace": "default", "db": "postgres",
     "probe_path": "/api/health", "expected": [200]},
    {"name": "n8n", "namespace": "default", "db": "postgres",
     "probe_path": "/healthz", "expected": [200]},
    {"name": "mattermost", "namespace": "default", "db": "postgres",
     "probe_path": "/api/v4/system/ping", "expected": [200]},
    {"name": "vaultwarden", "namespace": "default", "db": null,
     "probe_path": "/alive", "expected": [200]},
    {"name": "garage", "namespace": "default", "db": null,
     "probe_path": "/health", "expected": [200]},
    {"name": "garage-webui", "namespace": "default", "db": null,
     "probe_path": "/", "expected": [200, 302]}
  ]
 }
 ```
 **Probe URL resolution:** at runtime, `kubectl get svc -n <ns> <name> -o json` →
 extract `.spec.ports[].nodePort` → probe `http://localhost:<nodeport><probe_path>`.
 **Per-service silence:** add `"disabled": true` to a service entry to skip it without
 deleting it.
 **Verify actual probe paths during implementation** — the paths above are reasonable
 defaults but each needs a quick curl sanity check. Specifically double-check:
 Authentik (`/-/health/live/` vs `/-/health/ready/`), Garage (root `/health` endpoint),
 Vaultwarden (`/alive` returns 200 plain-text timestamp — confirmed), n8n (`/healthz`).
 ---
 ## Check catalog
 One function per requirement, sharing an internal `probe_service(svc_cfg)` helper.
 | Function | Covers | Mechanism |
 |---|---|---|
 | `check_nats()` | #0 | `kubectl exec` NATS pod to run `nats server check connection`; fallback HTTP GET `localhost:<monitoring_nodeport>/healthz` |
 | `check_postgres()` | #2 | `kubectl exec` postgres pod to run `pg_isready -U postgres` |
 | `check_mariadb()` | #1 | `kubectl exec` mariadb pod to run `mariadb-admin ping --silent` |
 | `check_ghost_blogs()` | #3 | `probe_service` for every service whose name starts with `ghost` |
 | `check_mariadb_dependents()` | #4 | `probe_service` for every non-ghost service where `db == "mariadb"` |
 | `check_postgres_dependents()` | #5 | `probe_service` for every service where `db == "postgres"` |
 | `check_standalone_services()` | implicit | `probe_service` for every service where `db == null` |
 | `check_all_nodeports()` | #6 | `kubectl get svc -A -o json`; for every `nodePort`, TCP connect `localhost:<nodeport>`; failure = nothing listening |
 **`probe_service(svc)`:** resolves NodePort via kubectl, calls
 `requests.get(f"http://localhost:{nodeport}{svc['probe_path']}", timeout=10)`,
 compares status to `expected`, returns an `Issue` on mismatch or on exception.
 **Root-cause hints in payload:** if `check_mariadb()` produced an issue this run,
 any `check_mariadb_dependents()` failure gets `"root_cause": "mariadb unreachable"`.
 Same pattern for postgres. Decorative — consumers decide what to do with it.
 ---
 ## Error handling
 ```python
 def run_all_checks(cfg) -> list[Issue]:
    issues = []
    for fn in [check_nats, check_postgres, check_mariadb,
               check_ghost_blogs, check_mariadb_dependents,
               check_postgres_dependents, check_standalone_services,
               check_all_nodeports]:
        try:
            issues.extend(fn(cfg))
        except Exception as e:
            issues.append(Issue(
                component_name=f"healthcheck.{fn.__name__}",
                issue_detail=f"check function raised: {type(e).__name__}: {e}",
                detected_at=now_iso(),
                root_cause="healthcheck bug or missing dependency"))
    return issues
 ```
 - No single check can halt the pipeline.
 - NATS connect failure is loud-logged; checks still run; individual publish failures
  are logged but don't stop the rest.
 - `Issue` is a small dataclass; `to_dict()` serialises to the exact NATS payload schema.
 ---
 ## Deployment
 **`install.sh` (run once on pve-control as samantha, with sudo where needed):**
 ```bash
 set -euo pipefail
 sudo mkdir -p /opt/homelab-health
 sudo rsync -a --delete ./ /opt/homelab-health/ --exclude=install.sh --exclude=tests
 sudo chown -R samantha:samantha /opt/homelab-health
 python3 -m venv /opt/homelab-health/venv
 /opt/homelab-health/venv/bin/pip install -r /opt/homelab-health/requirements.txt
 sudo cp homelab-health.service homelab-health.timer /etc/systemd/system/
 sudo systemctl daemon-reload
 sudo systemctl enable --now homelab-health.timer
 ```
 **`homelab-health.service`:**
 ```ini
 [Unit]
 Description=Homelab internal health checks
 After=network-online.target
 [Service]
 Type=oneshot
 User=samantha
 ExecStart=/opt/homelab-health/venv/bin/python /opt/homelab-health/checker.py
 StandardOutput=journal
 StandardError=journal
 ```
 **`homelab-health.timer`:**
 ```ini
 [Unit]
 Description=Run homelab health checks every 10 minutes
 [Timer]
 OnCalendar=*:0/10
 Persistent=true
 [Install]
 WantedBy=timers.target
 ```
 ---
 ## Testing
 **Unit tests** (`tests/test_checks.py`, pytest):
 - Each check function takes a config object — easily stubbed.
 - `probe_service` accepts an injected HTTP client so tests don't hit real services.
 - Mock `subprocess.run` for kubectl calls.
 - Assert the exact `Issue` list returned for each failure shape.
 **Manual smoke test** — `checker.py --dry-run` logs all issues to stdout but skips
 NATS publish. Run ad-hoc on pve-control during development.
 **End-to-end verification after install:**
 1. `systemctl list-timers homelab-health.timer` shows next fire time.
 2. Manually fire once: `sudo systemctl start homelab-health.service`.
 3. `journalctl -u homelab-health -n 200` shows outcome.
 4. On workstation: `nats sub homelab_health_issue` (against the cluster NATS).
 5. Break **mediawiki** (`kubectl scale deploy mediawiki -n default --replicas=0`) and
   wait ≤10 min — expect a message on the subject, with `component_name:"mediawiki"`.
 6. Restore (`--replicas=1`) and confirm alerts stop on the next tick.
 ---
 ## Open items / future
 - **Leaf/LAN NATS fallback:** add `FALLBACK_NATS_URL` env-var hook in `checker.py`
  (unused for now). When the leaf NATS comes online, publish there too on connect
  failure to primary.
 - **NATS auth:** current assumption is local anonymous publish is allowed. If auth is
  added, introduce a `nats.creds_path` field in `checks.json` pointing at a creds
  file on pve-control.
 - **k8s Python client migration:** replace the two remaining `kubectl` subprocess
  calls with the `kubernetes` library for a fully in-process script.
 - **Recovery events:** if downstream consumers want a "resolved" signal, add a small
  local state file (JSON on disk) to detect transitions and publish recovery events.
 - **Per-namespace grouping:** not needed now; if service list grows beyond ~25,
  reconsider organizing `checks.json` by namespace for readability.
--- a/k3s/health/homelab-health.service
+++ b/k3s/health/homelab-health.service
@ -0,0 +1,14 @@
 [Unit]
 Description=Homelab internal health checks
 After=network-online.target
 Wants=network-online.target
 [Service]
 Type=oneshot
 User=samantha
 WorkingDirectory=/home/samantha/homelab-health
 Environment=KUBECONFIG=/home/samantha/.kube/config
 Environment=PATH=/usr/local/bin:/usr/bin:/bin
 ExecStart=/home/samantha/homelab-health/venv/bin/python /home/samantha/homelab-health/checker.py --config /home/samantha/homelab-health/checks.json
 StandardOutput=journal
 StandardError=journal
--- a/k3s/health/homelab-health.timer
+++ b/k3s/health/homelab-health.timer
@ -0,0 +1,11 @@
 [Unit]
 Description=Run homelab health checks every 10 minutes
 [Timer]
 OnBootSec=2min
 OnUnitActiveSec=10min
 Persistent=true
 AccuracySec=30s
 [Install]
 WantedBy=timers.target
--- a/k3s/health/requirements.txt
+++ b/k3s/health/requirements.txt
@ -0,0 +1,2 @@
 nats-py==2.9.0
 requests==2.32.3