Add homelab internal health checker

Python checker runs on pve-control via systemd timer every 10 min, publishes issues to NATS subject homelab_health_issue. Checks NATS, Postgres, MariaDB, Ghost blogs, DB dependents, standalone services, and every NodePort. Silent when healthy. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-20 15:48:07 -04:00 · 2026-04-20 15:48:07 -04:00 · 58bfd422d4
commit 58bfd422d4
parent 6364f70799
7 changed files with 704 additions and 0 deletions
--- a/.opencode/opencode.json
+++ b/.opencode/opencode.json
@ -0,0 +1,5 @@
+{
+  "plugin": [
+    "web"
+  ]
+}
--- a/k3s/health/checker.py
+++ b/k3s/health/checker.py
@ -0,0 +1,308 @@
+"""Homelab internal health checker.
+
+Runs on pve-control every 10 minutes via systemd timer. Publishes issue
+events to NATS subject `homelab_health_issue`. Silent when healthy.
+See home_lab_health.md for design.
+"""
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import socket
+import subprocess
+import sys
+from dataclasses import asdict, dataclass
+from datetime import datetime, timezone
+from typing import Optional, Sequence
+
+import requests
+
+
+log = logging.getLogger("homelab-health")
+
+
+# ---------- payload ----------
+
+@dataclass
+class Issue:
+    component_name: str
+    issue_detail: str
+    detected_at: str
+    root_cause: Optional[str] = None
+
+    def to_dict(self) -> dict:
+        d = asdict(self)
+        if d["root_cause"] is None:
+            del d["root_cause"]
+        return d
+
+
+def now_iso() -> str:
+    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()
+
+
+# ---------- kubectl helpers ----------
+
+def kubectl_json(args: Sequence[str], timeout: int = 15) -> dict:
+    cmd = ["kubectl", *args, "-o", "json"]
+    r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
+    if r.returncode != 0:
+        raise RuntimeError(f"kubectl {' '.join(args)}: {r.stderr.strip()}")
+    return json.loads(r.stdout)
+
+
+@dataclass
+class ExecResult:
+    """Outcome of a command run through ``kubectl_exec``."""
+
+    ok: bool  # True only when the command ran and exited with status 0
+    stdout: str  # captured standard output ("" when the command never ran)
+    stderr: str  # stripped standard error, or a synthetic failure message
+
+
+def kubectl_exec(namespace: str, pod_label: str, command: Sequence[str],
+                 timeout: int = 10) -> ExecResult:
+    """Run a command inside the first pod matching pod_label."""
+    try:
+        n = subprocess.run(
+            ["kubectl", "get", "pod", "-n", namespace, "-l", pod_label,
+             "-o", "jsonpath={.items[0].metadata.name}"],
+            capture_output=True, text=True, timeout=5,
+        )
+    except subprocess.TimeoutExpired:
+        return ExecResult(False, "", "pod lookup timed out")
+    if n.returncode != 0 or not n.stdout.strip():
+        return ExecResult(False, "", f"no pod matched {pod_label} in {namespace}")
+    pod = n.stdout.strip()
+    try:
+        r = subprocess.run(
+            ["kubectl", "exec", "-n", namespace, pod, "--", *command],
+            capture_output=True, text=True, timeout=timeout,
+        )
+    except subprocess.TimeoutExpired:
+        return ExecResult(False, "", f"kubectl exec timed out after {timeout}s")
+    return ExecResult(r.returncode == 0, r.stdout, r.stderr.strip())
+
+
+def resolve_nodeport(namespace: str, service: str) -> Optional[int]:
+    """Return the first NodePort on the given service, or None."""
+    svc = kubectl_json(["get", "svc", "-n", namespace, service])
+    for port in svc.get("spec", {}).get("ports", []):
+        if "nodePort" in port:
+            return int(port["nodePort"])
+    return None
+
+
+# ---------- probes ----------
+
+def probe_service(svc: dict) -> list[Issue]:
+    if svc.get("disabled"):
+        return []
+    name = svc["name"]
+    try:
+        nodeport = resolve_nodeport(svc["namespace"], name)
+    except Exception as e:
+        return [Issue(name, f"kubectl failed: {e}", now_iso())]
+    if nodeport is None:
+        return [Issue(name, "no NodePort exposed", now_iso())]
+
+    url = f"http://localhost:{nodeport}{svc['probe_path']}"
+    try:
+        resp = requests.get(url, timeout=10, allow_redirects=False)
+    except requests.RequestException as e:
+        return [Issue(name, f"probe error at {url}: {e}", now_iso())]
+    if resp.status_code in svc["expected"]:
+        return []
+    return [Issue(name,
+                  f"HTTP {resp.status_code} at {url} (expected {svc['expected']})",
+                  now_iso())]
+
+
+# ---------- check functions ----------
+
+def check_nats(cfg: dict) -> list[Issue]:
+    port = cfg["nats"]["monitoring_nodeport"]
+    url = f"http://localhost:{port}/healthz"
+    try:
+        r = requests.get(url, timeout=5)
+    except requests.RequestException as e:
+        return [Issue("nats", f"monitoring unreachable: {e}", now_iso())]
+    if r.status_code != 200:
+        return [Issue("nats", f"/healthz returned {r.status_code}", now_iso())]
+    return []
+
+
+def check_databases(cfg: dict) -> list[Issue]:
+    issues = []
+    for db in cfg.get("databases", []):
+        result = kubectl_exec(db["namespace"], db["pod_label"], db["probe_cmd"])
+        if not result.ok:
+            issues.append(Issue(
+                db["name"],
+                f"liveness probe failed: {result.stderr or '(no stderr)'}",
+                now_iso(),
+            ))
+    return issues
+
+
+def _filter_probe(services, pred) -> list[Issue]:
+    out = []
+    for s in services:
+        if pred(s):
+            out.extend(probe_service(s))
+    return out
+
+
+def check_ghost_blogs(cfg):
+    return _filter_probe(cfg.get("services", []),
+                         lambda s: s["name"].startswith("ghost"))
+
+
+def check_mariadb_dependents(cfg):
+    return _filter_probe(cfg.get("services", []),
+                         lambda s: s["db"] == "mariadb" and not s["name"].startswith("ghost"))
+
+
+def check_postgres_dependents(cfg):
+    return _filter_probe(cfg.get("services", []),
+                         lambda s: s["db"] == "postgres")
+
+
+def check_standalone_services(cfg):
+    return _filter_probe(cfg.get("services", []),
+                         lambda s: s["db"] is None)
+
+
+def check_all_nodeports(_cfg) -> list[Issue]:
+    """TCP connect to every NodePort in the cluster."""
+    svcs = kubectl_json(["get", "svc", "-A"])
+    issues = []
+    for item in svcs.get("items", []):
+        meta = item.get("metadata", {})
+        name = f"{meta.get('namespace')}/{meta.get('name')}"
+        for port in item.get("spec", {}).get("ports", []):
+            np = port.get("nodePort")
+            if np is None:
+                continue
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                s.settimeout(3)
+                rc = s.connect_ex(("127.0.0.1", int(np)))
+            if rc != 0:
+                issues.append(Issue(
+                    name,
+                    f"NodePort {np} not accepting TCP (errno {rc})",
+                    now_iso(),
+                ))
+    return issues
+
+
+# ---------- orchestration ----------
+
+# Check functions in execution order. run_all_checks calls each one with the
+# loaded config, keys its results by the function's __name__, and emits the
+# collected issues in this same order.
+CHECKS = [
+    check_nats,
+    check_databases,
+    check_ghost_blogs,
+    check_mariadb_dependents,
+    check_postgres_dependents,
+    check_standalone_services,
+    check_all_nodeports,
+]
+
+
+def run_all_checks(cfg: dict) -> list[Issue]:
+    buckets: dict[str, list[Issue]] = {}
+    for fn in CHECKS:
+        try:
+            buckets[fn.__name__] = fn(cfg)
+        except Exception as e:
+            buckets[fn.__name__] = [Issue(
+                f"healthcheck.{fn.__name__}",
+                f"check raised: {type(e).__name__}: {e}",
+                now_iso(),
+                root_cause="healthcheck bug",
+            )]
+
+    db_issues = buckets.get("check_databases", [])
+    mariadb_down = any(i.component_name == "mariadb" for i in db_issues)
+    postgres_down = any(i.component_name == "postgres" for i in db_issues)
+
+    if mariadb_down:
+        for i in buckets.get("check_mariadb_dependents", []) + buckets.get("check_ghost_blogs", []):
+            if i.root_cause is None:
+                i.root_cause = "mariadb unreachable"
+    if postgres_down:
+        for i in buckets.get("check_postgres_dependents", []):
+            if i.root_cause is None:
+                i.root_cause = "postgres unreachable"
+
+    out = []
+    for fn in CHECKS:
+        out.extend(buckets.get(fn.__name__, []))
+    return out
+
+
+# ---------- NATS publish ----------
+
+async def _publish(url: str, subject: str, payloads: list[bytes]) -> None:
+    import nats  # type: ignore[import-not-found]
+    nc = await asyncio.wait_for(
+        nats.connect(url, connect_timeout=3, allow_reconnect=False),  # type: ignore[attr-defined]
+        timeout=8,
+    )
+    try:
+        for p in payloads:
+            await nc.publish(subject, p)
+        await nc.flush()
+    finally:
+        await nc.close()
+
+
+def publish_issues(issues: list[Issue], cfg: dict) -> None:
+    if not issues:
+        return
+    payloads = [json.dumps(i.to_dict()).encode("utf-8") for i in issues]
+    asyncio.run(_publish(cfg["nats"]["url"], cfg["nats"]["subject"], payloads))
+
+
+# ---------- entry ----------
+
+def load_config(path: str) -> dict:
+    with open(path) as f:
+        return json.load(f)
+
+
+def main(argv=None) -> int:
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s %(levelname)s %(message)s",
+                        datefmt="%Y-%m-%dT%H:%M:%S%z")
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--config", default="/home/samantha/homelab-health/checks.json")
+    ap.add_argument("--dry-run", action="store_true",
+                    help="Print issues to stdout; do not publish to NATS")
+    args = ap.parse_args(argv)
+
+    cfg = load_config(args.config)
+    issues = run_all_checks(cfg)
+
+    for i in issues:
+        log.warning("issue: %s", json.dumps(i.to_dict()))
+
+    if not issues:
+        log.info("all checks green")
+        return 0
+
+    if args.dry_run:
+        for i in issues:
+            print(json.dumps(i.to_dict()))
+        return 1
+
+    try:
+        publish_issues(issues, cfg)
+    except Exception as e:
+        log.error("NATS publish failed: %s (issues not delivered)", e)
+        return 1
+    return 1
+
+
+# Script entry point: the process exit status is main()'s return value
+# (0 when no issues were found, 1 otherwise).
+if __name__ == "__main__":
+    sys.exit(main())
--- a/k3s/health/checks.json
+++ b/k3s/health/checks.json
@ -0,0 +1,45 @@
+{
+  "nats": {
+    "url": "nats://10.0.0.6:32386",
+    "subject": "homelab_health_issue",
+    "monitoring_nodeport": 32388
+  },
+  "databases": [
+    {
+      "name": "postgres",
+      "namespace": "default",
+      "pod_label": "app=postgres",
+      "probe_cmd": ["pg_isready", "-U", "postgres"]
+    },
+    {
+      "name": "mariadb",
+      "namespace": "default",
+      "pod_label": "app=mariadb",
+      "probe_cmd": ["mariadb-admin", "ping", "--silent"]
+    }
+  ],
+  "services": [
+    {"name": "ghost1", "namespace": "default", "db": "mariadb",
+     "probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
+    {"name": "ghost2", "namespace": "default", "db": "mariadb",
+     "probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
+    {"name": "ghost3", "namespace": "default", "db": "mariadb",
+     "probe_path": "/ghost/api/admin/site/", "expected": [200, 301, 401]},
+    {"name": "mediawiki", "namespace": "default", "db": "mariadb",
+     "probe_path": "/", "expected": [200, 301, 302]},
+    {"name": "forgejo", "namespace": "default", "db": "postgres",
+     "probe_path": "/api/healthz", "expected": [200]},
+    {"name": "authentik", "namespace": "default", "db": "postgres",
+     "probe_path": "/-/health/live/", "expected": [200, 204]},
+    {"name": "listmonk", "namespace": "default", "db": "postgres",
+     "probe_path": "/", "expected": [200, 302]},
+    {"name": "n8n", "namespace": "default", "db": "postgres",
+     "probe_path": "/healthz", "expected": [200]},
+    {"name": "mattermost", "namespace": "default", "db": "postgres",
+     "probe_path": "/api/v4/system/ping", "expected": [200]},
+    {"name": "vaultwarden", "namespace": "default", "db": null,
+     "probe_path": "/alive", "expected": [200]},
+    {"name": "garage-webui", "namespace": "default", "db": null,
+     "probe_path": "/", "expected": [200, 302]}
+  ]
+}
--- a/k3s/health/home_lab_health.md
+++ b/k3s/health/home_lab_health.md
@ -0,0 +1,319 @@
+# Homelab Health — Internal Checks Design
+
+**Status: design approved 2026-04-20. Ready to write implementation plan.**
+
+---
+
+## Resume Notes (for next session)
+
+You and I brainstormed this design across one session. All design questions answered,
+all three design sections approved. The next step per the brainstorming skill is:
+
+1. **This file exists** — design committed (or ready to commit).
+2. **Next action:** invoke `superpowers:writing-plans` to turn this design into a
+   step-by-step implementation plan.
+3. After the implementation plan is written, execute it (writing-plans → executing-plans).
+
+Do NOT re-open any design decisions in the new session unless something here is
+obviously wrong; the decisions below are settled.
+
+**Test canary:** when verifying the installed system end-to-end, break **mediawiki**
+(e.g. scale to 0 replicas), not the Ghost blogs. Ghosts are production, MediaWiki is
+expendable for a "does the alert fire" test.
+
+---
+
+## Goals
+
+Add a second layer of cluster health monitoring that runs **inside** the K3s cluster
+and reports structural / semantic problems to NATS. The existing
+`k3s/scripts/check-health.sh` (workstation-driven canary) stays in place unchanged.
+
+Requirements, as given:
+
+0. NATS itself up
+1. MariaDB up
+2. PostgreSQL up
+3. Internal Ghost blog ports respond to HTTP correctly
+4. All other services depending on MariaDB respond correctly
+5. All services depending on PostgreSQL respond correctly
+6. Something is listening at every NodePort
+
+Plus implicit: standalone services (Vaultwarden, Garage, etc.) also get probed.
+
+**Output contract:** publish NATS messages on subject `homelab_health_issue` with
+JSON body:
+
+```json
+{
+  "component_name": "<str>",
+  "issue_detail": "<str>",
+  "detected_at": "<ISO8601 timestamp>",
+  "root_cause": "<optional str>"
+}
+```
+
+---
+
+## Decisions (settled)
+
+| Decision | Choice | Why |
+|---|---|---|
+| Where it runs | systemd timer on **pve-control** | Master K3s control node; kubectl locally; always on. |
+| Language | **Python 3** | User expertise; structured JSON; clean error handling. |
+| HTTP probes | `requests` library | No subprocess per probe; in-process. |
+| NATS publish | `nats-py` library | In-process; one cohesive Python process. |
+| kubectl use | **subprocess** (kept for now) | Only two call sites; revisit later with `kubernetes` client. |
+| DB auth for probes | **sidestepped** | Use `kubectl exec <pod> -- pg_isready` / `mariadb-admin ping`; no creds on pve-control. |
+| Orchestration | Single script, one function per check category | Simple; matches "one function per check" ask. |
+| Schedule | Every **10 minutes** | User said no more frequent than that. |
+| Deduplication | **Stateless** | Re-fires every tick while failing; consumer handles aggregation. |
+| Healthy publishes | **None** | Silent when OK. Only problems on the wire. |
+| Recovery events | **None** | Reports stop when fixed; absence = healthy. |
+| Service config | **JSON file** (`checks.json`) | Pythonic; easy to edit/commit; lives alongside `checker.py`. |
+| NodePort discovery | **Live from `kubectl get svc -A -o json`** | Source of truth is the cluster; no drift. |
+| NATS-down fallback | **stdout + non-zero exit** | Workstation canary + `systemctl status` surface failures. Future leaf/LAN NATS fallback via env var hook (deferred). |
+
+---
+
+## Architecture
+
+**Deployment layout on pve-control:**
+
+```
+/opt/homelab-health/
+├── checker.py            # Python entrypoint, one function per check
+├── checks.json           # service catalog + NATS/DB config
+├── venv/                 # virtualenv with nats-py, requests
+/etc/systemd/system/
+├── homelab-health.service
+└── homelab-health.timer
+```
+
+**Source of truth in repo:**
+
+```
+k3s/health/
+├── home_lab_health.md         # this file
+├── checker.py
+├── checks.json
+├── requirements.txt           # nats-py, requests
+├── install.sh                 # runs on pve-control, sets up venv + units
+├── homelab-health.service
+├── homelab-health.timer
+└── tests/
+    └── test_checks.py
+```
+
+**Runtime flow each tick:**
+
+1. Load `checks.json`.
+2. Connect to NATS with a 3s timeout. On failure: log loud, still run checks, publish nothing, exit 1.
+3. Run each check function in sequence, each wrapped in `try/except`; exceptions in one check never stop the others (they become a `healthcheck.<fn>` meta-issue).
+4. Each check returns `list[Issue]`. Main loop aggregates.
+5. Log every issue to stdout (journal).
+6. For each issue, publish one NATS message to `homelab_health_issue`.
+7. Exit 0 if zero issues, 1 otherwise. `systemctl status` + journalctl give humans visibility.
+
+---
+
+## Config schema (`checks.json`)
+
+```json
+{
+  "nats": {
+    "url": "nats://nats.default.svc.cluster.local:4222",
+    "subject": "homelab_health_issue",
+    "monitoring_nodeport": 32388
+  },
+  "databases": [
+    {
+      "name": "postgres",
+      "namespace": "default",
+      "pod_label": "app=postgres",
+      "probe_cmd": ["pg_isready", "-U", "postgres"]
+    },
+    {
+      "name": "mariadb",
+      "namespace": "default",
+      "pod_label": "app=mariadb",
+      "probe_cmd": ["mariadb-admin", "ping", "--silent"]
+    }
+  ],
+  "services": [
+    {"name": "ghost1", "namespace": "fulfillment", "db": "mariadb",
+     "probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
+    {"name": "ghost2", "namespace": "fulfillment", "db": "mariadb",
+     "probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
+    {"name": "ghost3", "namespace": "fulfillment", "db": "mariadb",
+     "probe_path": "/ghost/api/admin/site/", "expected": [200, 401]},
+    {"name": "mediawiki", "namespace": "default", "db": "mariadb",
+     "probe_path": "/", "expected": [200, 302]},
+    {"name": "forgejo", "namespace": "sjasoft", "db": "postgres",
+     "probe_path": "/api/healthz", "expected": [200]},
+    {"name": "authentik-server", "namespace": "default", "db": "postgres",
+     "probe_path": "/-/health/live/", "expected": [200, 204]},
+    {"name": "listmonk", "namespace": "default", "db": "postgres",
+     "probe_path": "/api/health", "expected": [200]},
+    {"name": "n8n", "namespace": "default", "db": "postgres",
+     "probe_path": "/healthz", "expected": [200]},
+    {"name": "mattermost", "namespace": "default", "db": "postgres",
+     "probe_path": "/api/v4/system/ping", "expected": [200]},
+    {"name": "vaultwarden", "namespace": "default", "db": null,
+     "probe_path": "/alive", "expected": [200]},
+    {"name": "garage", "namespace": "default", "db": null,
+     "probe_path": "/health", "expected": [200]},
+    {"name": "garage-webui", "namespace": "default", "db": null,
+     "probe_path": "/", "expected": [200, 302]}
+  ]
+}
+```
+
+**Probe URL resolution:** at runtime, `kubectl get svc -n <ns> <name> -o json` →
+extract `.spec.ports[].nodePort` → probe `http://localhost:<nodeport><probe_path>`.
+
+**Per-service silence:** add `"disabled": true` to a service entry to skip it without
+deleting it.
+
+**Verify actual probe paths during implementation** — the paths above are reasonable
+defaults but each needs a quick curl sanity check. Specifically double-check:
+Authentik (`/-/health/live/` vs `/-/health/ready/`), Garage (root `/health` endpoint),
+Vaultwarden (`/alive` returns 200 plain-text timestamp — confirmed), n8n (`/healthz`).
+
+---
+
+## Check catalog
+
+One function per requirement, sharing an internal `probe_service(svc_cfg)` helper.
+
+| Function | Covers | Mechanism |
+|---|---|---|
+| `check_nats()` | #0 | `kubectl exec` NATS pod to run `nats server check connection`; fallback HTTP GET `localhost:<monitoring_nodeport>/healthz` |
+| `check_postgres()` | #2 | `kubectl exec` postgres pod to run `pg_isready -U postgres` |
+| `check_mariadb()` | #1 | `kubectl exec` mariadb pod to run `mariadb-admin ping --silent` |
+| `check_ghost_blogs()` | #3 | `probe_service` for every service whose name starts with `ghost` |
+| `check_mariadb_dependents()` | #4 | `probe_service` for every non-ghost service where `db == "mariadb"` |
+| `check_postgres_dependents()` | #5 | `probe_service` for every service where `db == "postgres"` |
+| `check_standalone_services()` | implicit | `probe_service` for every service where `db == null` |
+| `check_all_nodeports()` | #6 | `kubectl get svc -A -o json`; for every `nodePort`, TCP connect `localhost:<nodeport>`; failure = nothing listening |
+
+**`probe_service(svc)`:** resolves NodePort via kubectl, calls
+`requests.get(f"http://localhost:{nodeport}{svc['probe_path']}", timeout=10)`,
+compares status to `expected`, returns an `Issue` on mismatch or on exception.
+
+**Root-cause hints in payload:** if `check_mariadb()` produced an issue this run,
+any `check_mariadb_dependents()` failure gets `"root_cause": "mariadb unreachable"`.
+Same pattern for postgres. Decorative — consumers decide what to do with it.
+
+---
+
+## Error handling
+
+```python
+def run_all_checks(cfg) -> list[Issue]:
+    issues = []
+    for fn in [check_nats, check_postgres, check_mariadb,
+               check_ghost_blogs, check_mariadb_dependents,
+               check_postgres_dependents, check_standalone_services,
+               check_all_nodeports]:
+        try:
+            issues.extend(fn(cfg))
+        except Exception as e:
+            issues.append(Issue(
+                component_name=f"healthcheck.{fn.__name__}",
+                issue_detail=f"check function raised: {type(e).__name__}: {e}",
+                detected_at=now_iso(),
+                root_cause="healthcheck bug or missing dependency"))
+    return issues
+```
+
+- No single check can halt the pipeline.
+- NATS connect failure is loud-logged; checks still run; individual publish failures
+  are logged but don't stop the rest.
+- `Issue` is a small dataclass; `to_dict()` serialises to the exact NATS payload schema.
+
+---
+
+## Deployment
+
+**`install.sh` (run once on pve-control as samantha, with sudo where needed):**
+
+```bash
+set -euo pipefail
+sudo mkdir -p /opt/homelab-health
+sudo rsync -a --delete ./ /opt/homelab-health/ --exclude=install.sh --exclude=tests
+sudo chown -R samantha:samantha /opt/homelab-health
+python3 -m venv /opt/homelab-health/venv
+/opt/homelab-health/venv/bin/pip install -r /opt/homelab-health/requirements.txt
+sudo cp homelab-health.service homelab-health.timer /etc/systemd/system/
+sudo systemctl daemon-reload
+sudo systemctl enable --now homelab-health.timer
+```
+
+**`homelab-health.service`:**
+
+```ini
+[Unit]
+Description=Homelab internal health checks
+After=network-online.target
+
+[Service]
+Type=oneshot
+User=samantha
+ExecStart=/opt/homelab-health/venv/bin/python /opt/homelab-health/checker.py
+StandardOutput=journal
+StandardError=journal
+```
+
+**`homelab-health.timer`:**
+
+```ini
+[Unit]
+Description=Run homelab health checks every 10 minutes
+
+[Timer]
+OnCalendar=*:0/10
+Persistent=true
+
+[Install]
+WantedBy=timers.target
+```
+
+---
+
+## Testing
+
+**Unit tests** (`tests/test_checks.py`, pytest):
+- Each check function takes a config object — easily stubbed.
+- `probe_service` accepts an injected HTTP client so tests don't hit real services.
+- Mock `subprocess.run` for kubectl calls.
+- Assert the exact `Issue` list returned for each failure shape.
+
+**Manual smoke test** — `checker.py --dry-run` logs all issues to stdout but skips
+NATS publish. Run ad-hoc on pve-control during development.
+
+**End-to-end verification after install:**
+1. `systemctl list-timers homelab-health.timer` shows next fire time.
+2. Manually fire once: `sudo systemctl start homelab-health.service`.
+3. `journalctl -u homelab-health -n 200` shows outcome.
+4. On workstation: `nats sub homelab_health_issue` (against the cluster NATS).
+5. Break **mediawiki** (`kubectl scale deploy mediawiki -n default --replicas=0`) and
+   wait ≤10 min — expect a message on the subject, with `component_name:"mediawiki"`.
+6. Restore (`--replicas=1`) and confirm alerts stop on the next tick.
+
+---
+
+## Open items / future
+
+- **Leaf/LAN NATS fallback:** add `FALLBACK_NATS_URL` env-var hook in `checker.py`
+  (unused for now). When the leaf NATS comes online, publish there too on connect
+  failure to primary.
+- **NATS auth:** current assumption is local anonymous publish is allowed. If auth is
+  added, introduce a `nats.creds_path` field in `checks.json` pointing at a creds
+  file on pve-control.
+- **k8s Python client migration:** replace the two remaining `kubectl` subprocess
+  calls with the `kubernetes` library for a fully in-process script.
+- **Recovery events:** if downstream consumers want a "resolved" signal, add a small
+  local state file (JSON on disk) to detect transitions and publish recovery events.
+- **Per-namespace grouping:** not needed now; if service list grows beyond ~25,
+  reconsider organizing `checks.json` by namespace for readability.
--- a/k3s/health/homelab-health.service
+++ b/k3s/health/homelab-health.service
@ -0,0 +1,14 @@
+[Unit]
+Description=Homelab internal health checks
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=oneshot
+User=samantha
+WorkingDirectory=/home/samantha/homelab-health
+Environment=KUBECONFIG=/home/samantha/.kube/config
+Environment=PATH=/usr/local/bin:/usr/bin:/bin
+ExecStart=/home/samantha/homelab-health/venv/bin/python /home/samantha/homelab-health/checker.py --config /home/samantha/homelab-health/checks.json
+StandardOutput=journal
+StandardError=journal
--- a/k3s/health/homelab-health.timer
+++ b/k3s/health/homelab-health.timer
@ -0,0 +1,11 @@
+[Unit]
+Description=Run homelab health checks every 10 minutes
+
+[Timer]
+# First run 2 minutes after boot, then 10 minutes after each activation.
+OnBootSec=2min
+OnUnitActiveSec=10min
+# NOTE(review): systemd.timer(5) documents Persistent= as only affecting
+# OnCalendar= timers, so it is presumably a no-op with the monotonic
+# triggers above - confirm, then drop it or switch to OnCalendar=.
+Persistent=true
+AccuracySec=30s
+
+[Install]
+WantedBy=timers.target
--- a/k3s/health/requirements.txt
+++ b/k3s/health/requirements.txt
@ -0,0 +1,2 @@
+nats-py==2.9.0
+requests==2.32.3