log = logging.getLogger("homelab-health")


# ---------- payload ----------

@dataclass
class Issue:
    """One detected health problem; serialised as a single NATS message.

    ``root_cause`` is optional and is left out of the payload when unset.
    """

    component_name: str
    issue_detail: str
    detected_at: str
    root_cause: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the JSON-ready payload, dropping ``root_cause`` when None."""
        d = asdict(self)
        if d["root_cause"] is None:
            del d["root_cause"]
        return d


def now_iso() -> str:
    """Return the current UTC time as ISO 8601 with whole-second precision."""
    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()


# ---------- kubectl helpers ----------

def kubectl_json(args: Sequence[str], timeout: int = 15) -> dict:
    """Run ``kubectl <args> -o json`` and return the parsed output.

    Raises:
        RuntimeError: kubectl exited non-zero (message carries its stderr).
        subprocess.TimeoutExpired: kubectl ran longer than ``timeout`` seconds.
        json.JSONDecodeError: stdout was not valid JSON.
    """
    cmd = ["kubectl", *args, "-o", "json"]
    r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    if r.returncode != 0:
        raise RuntimeError(f"kubectl {' '.join(args)}: {r.stderr.strip()}")
    return json.loads(r.stdout)


@dataclass
class ExecResult:
    """Outcome of a ``kubectl exec``: success flag plus captured output."""

    ok: bool
    stdout: str
    stderr: str


def kubectl_exec(namespace: str, pod_label: str, command: Sequence[str],
                 timeout: int = 10) -> ExecResult:
    """Run a command inside the first pod matching pod_label.

    Never raises on timeouts: a timed-out pod lookup (fixed 5s budget) or a
    timed-out exec (``timeout`` seconds) is reported as a failed ExecResult
    whose ``stderr`` explains what happened.
    """
    try:
        n = subprocess.run(
            ["kubectl", "get", "pod", "-n", namespace, "-l", pod_label,
             "-o", "jsonpath={.items[0].metadata.name}"],
            capture_output=True, text=True, timeout=5,
        )
    except subprocess.TimeoutExpired:
        return ExecResult(False, "", "pod lookup timed out")
    if n.returncode != 0 or not n.stdout.strip():
        return ExecResult(False, "", f"no pod matched {pod_label} in {namespace}")
    pod = n.stdout.strip()
    try:
        r = subprocess.run(
            ["kubectl", "exec", "-n", namespace, pod, "--", *command],
            capture_output=True, text=True, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return ExecResult(False, "", f"kubectl exec timed out after {timeout}s")
    return ExecResult(r.returncode == 0, r.stdout, r.stderr.strip())


def resolve_nodeport(namespace: str, service: str) -> Optional[int]:
    """Return the first NodePort on the given service, or None.

    Propagates whatever ``kubectl_json`` raises when the lookup fails.
    """
    svc = kubectl_json(["get", "svc", "-n", namespace, service])
    for port in svc.get("spec", {}).get("ports", []):
        if "nodePort" in port:
            return int(port["nodePort"])
    return None


# ---------- probes ----------

def probe_service(svc: dict) -> list[Issue]:
    """HTTP-probe one catalog entry through its NodePort on localhost.

    Reads ``name``, ``namespace``, ``probe_path`` and ``expected`` from
    ``svc``; an entry with a truthy ``disabled`` is skipped. Returns an empty
    list when the status code is in ``expected``, otherwise a single Issue
    describing the kubectl failure, missing NodePort, request error or
    unexpected status.
    """
    if svc.get("disabled"):
        return []
    name = svc["name"]
    try:
        nodeport = resolve_nodeport(svc["namespace"], name)
    except Exception as e:
        return [Issue(name, f"kubectl failed: {e}", now_iso())]
    if nodeport is None:
        return [Issue(name, "no NodePort exposed", now_iso())]

    url = f"http://localhost:{nodeport}{svc['probe_path']}"
    try:
        # Redirects are not followed, so a 3xx is judged against `expected`
        # as-is instead of being replaced by the redirect target's status.
        resp = requests.get(url, timeout=10, allow_redirects=False)
    except requests.RequestException as e:
        return [Issue(name, f"probe error at {url}: {e}", now_iso())]
    if resp.status_code in svc["expected"]:
        return []
    return [Issue(name,
                  f"HTTP {resp.status_code} at {url} (expected {svc['expected']})",
                  now_iso())]


# ---------- check functions ----------

def check_nats(cfg: dict) -> list[Issue]:
    """GET ``/healthz`` on the NATS monitoring NodePort; anything but 200 is an issue."""
    port = cfg["nats"]["monitoring_nodeport"]
    url = f"http://localhost:{port}/healthz"
    try:
        r = requests.get(url, timeout=5)
    except requests.RequestException as e:
        return [Issue("nats", f"monitoring unreachable: {e}", now_iso())]
    if r.status_code != 200:
        return [Issue("nats", f"/healthz returned {r.status_code}", now_iso())]
    return []


def check_databases(cfg: dict) -> list[Issue]:
    """Run each configured database's ``probe_cmd`` inside its pod.

    Returns one Issue per database whose probe did not exit 0. The detail
    prefers stderr and falls back to stdout, because some probe tools
    (pg_isready, for one) print their status line on stdout.
    """
    issues = []
    for db in cfg.get("databases", []):
        result = kubectl_exec(db["namespace"], db["pod_label"], db["probe_cmd"])
        if result.ok:
            continue
        detail = result.stderr or result.stdout.strip() or "(no output)"
        issues.append(Issue(
            db["name"],
            f"liveness probe failed: {detail}",
            now_iso(),
        ))
    return issues


def _filter_probe(services, pred) -> list[Issue]:
    """Probe every service for which ``pred(service)`` is true; concatenate the issues."""
    out = []
    for s in services:
        if pred(s):
            out.extend(probe_service(s))
    return out


# The category predicates read "db" with .get(): an entry that omits the key
# is treated like "db": null instead of raising KeyError, which would abort
# the whole category (and skip every other service in it).

def check_ghost_blogs(cfg):
    """Probe every service whose name starts with ``ghost``."""
    return _filter_probe(cfg.get("services", []),
                         lambda s: s["name"].startswith("ghost"))


def check_mariadb_dependents(cfg):
    """Probe every non-ghost service whose ``db`` is ``mariadb``."""
    return _filter_probe(cfg.get("services", []),
                         lambda s: s.get("db") == "mariadb"
                         and not s["name"].startswith("ghost"))


def check_postgres_dependents(cfg):
    """Probe every service whose ``db`` is ``postgres``."""
    return _filter_probe(cfg.get("services", []),
                         lambda s: s.get("db") == "postgres")


def check_standalone_services(cfg):
    """Probe every service with no database (``db`` null or absent)."""
    return _filter_probe(cfg.get("services", []),
                         lambda s: s.get("db") is None)


def tcp_nodeports(svcs: dict) -> list[tuple[str, int]]:
    """Extract ``("namespace/name", nodePort)`` pairs for TCP ports.

    ``svcs`` is the parsed output of ``kubectl get svc -A -o json``. Ports
    without a ``nodePort`` are ignored, and so are ports whose ``protocol``
    is not TCP (a missing ``protocol`` counts as TCP): a TCP connect cannot
    tell whether anything is listening on a UDP or SCTP port.
    """
    found = []
    for item in svcs.get("items", []):
        meta = item.get("metadata", {})
        name = f"{meta.get('namespace')}/{meta.get('name')}"
        # `or []` also covers an explicit "ports": null.
        for port in item.get("spec", {}).get("ports") or []:
            np = port.get("nodePort")
            if np is None:
                continue
            if port.get("protocol", "TCP") != "TCP":
                continue
            found.append((name, int(np)))
    return found


def check_all_nodeports(_cfg) -> list[Issue]:
    """TCP connect to every TCP NodePort in the cluster.

    Returns one Issue per NodePort on 127.0.0.1 that does not accept a
    connection within 3 seconds. kubectl failures propagate to the caller.
    """
    svcs = kubectl_json(["get", "svc", "-A"])
    issues = []
    for name, np in tcp_nodeports(svcs):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(3)
            rc = s.connect_ex(("127.0.0.1", np))
        if rc != 0:
            issues.append(Issue(
                name,
                f"NodePort {np} not accepting TCP (errno {rc})",
                now_iso(),
            ))
    return issues


# ---------- orchestration ----------

CHECKS = [
    check_nats,
    check_databases,
    check_ghost_blogs,
    check_mariadb_dependents,
    check_postgres_dependents,
    check_standalone_services,
    check_all_nodeports,
]


def run_all_checks(cfg: dict) -> list[Issue]:
    """Run every function in CHECKS and return their issues in CHECKS order.

    A check that raises does not stop the others; it is replaced by a single
    ``healthcheck.<function name>`` Issue. Afterwards, when the database
    check reported ``mariadb`` (or ``postgres``), issues from the dependent
    checks that have no ``root_cause`` yet are tagged with
    ``"mariadb unreachable"`` (or ``"postgres unreachable"``). Ghost blog
    issues are tagged together with the MariaDB dependents.
    """
    buckets: dict[str, list[Issue]] = {}
    for fn in CHECKS:
        try:
            buckets[fn.__name__] = fn(cfg)
        except Exception as e:
            buckets[fn.__name__] = [Issue(
                f"healthcheck.{fn.__name__}",
                f"check raised: {type(e).__name__}: {e}",
                now_iso(),
                root_cause="healthcheck bug",
            )]

    db_issues = buckets.get("check_databases", [])
    mariadb_down = any(i.component_name == "mariadb" for i in db_issues)
    postgres_down = any(i.component_name == "postgres" for i in db_issues)

    if mariadb_down:
        for i in (buckets.get("check_mariadb_dependents", [])
                  + buckets.get("check_ghost_blogs", [])):
            if i.root_cause is None:
                i.root_cause = "mariadb unreachable"
    if postgres_down:
        for i in buckets.get("check_postgres_dependents", []):
            if i.root_cause is None:
                i.root_cause = "postgres unreachable"

    out = []
    for fn in CHECKS:
        out.extend(buckets.get(fn.__name__, []))
    return out


# ---------- NATS publish ----------

async def _publish(url: str, subject: str, payloads: list[bytes]) -> None:
    """Connect to NATS, publish every payload on ``subject``, flush, close.

    The connection attempt is bounded twice: ``connect_timeout=3`` with
    reconnects disabled inside the client, plus an outer 8-second
    ``asyncio.wait_for``. The connection is closed even when publishing fails.
    """
    # Imported here rather than at module top, so the module imports without
    # nats-py; presumably this keeps --dry-run usable without it.
    import nats  # type: ignore[import-not-found]
    nc = await asyncio.wait_for(
        nats.connect(url, connect_timeout=3, allow_reconnect=False),  # type: ignore[attr-defined]
        timeout=8,
    )
    try:
        for p in payloads:
            await nc.publish(subject, p)
        await nc.flush()
    finally:
        await nc.close()


def publish_issues(issues: list[Issue], cfg: dict) -> None:
    """Publish one JSON message per issue to ``cfg["nats"]["subject"]``.

    Does nothing (and does not touch ``cfg``) when ``issues`` is empty.
    Connection and publish errors propagate to the caller.
    """
    if not issues:
        return
    payloads = [json.dumps(i.to_dict()).encode("utf-8") for i in issues]
    asyncio.run(_publish(cfg["nats"]["url"], cfg["nats"]["subject"], payloads))


# ---------- entry ----------

def load_config(path: str) -> dict:
    """Read and parse the JSON config at ``path`` (decoded as UTF-8).

    Raises OSError when the file cannot be opened and json.JSONDecodeError
    when it is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def main(argv=None) -> int:
    """Run all checks once and report the issues found.

    Returns:
        0 when every check is green; 1 when at least one issue was found
        (whether or not publishing to NATS succeeded); 2 when the config
        file cannot be read or parsed.
    """
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%Y-%m-%dT%H:%M:%S%z")
    ap = argparse.ArgumentParser()
    ap.add_argument("--config", default="/home/samantha/homelab-health/checks.json")
    ap.add_argument("--dry-run", action="store_true",
                    help="Print issues to stdout; do not publish to NATS")
    args = ap.parse_args(argv)

    try:
        cfg = load_config(args.config)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError is a ValueError. Report one clear line instead
        # of a traceback so the journal shows what is wrong with the config.
        log.error("cannot load config %s: %s", args.config, e)
        return 2

    issues = run_all_checks(cfg)

    for i in issues:
        log.warning("issue: %s", json.dumps(i.to_dict()))

    if not issues:
        log.info("all checks green")
        return 0

    if args.dry_run:
        for i in issues:
            print(json.dumps(i.to_dict()))
        return 1

    try:
        publish_issues(issues, cfg)
    except Exception as e:
        log.error("NATS publish failed: %s (issues not delivered)", e)
    # Issues were found, so the run is unhealthy whether or not they were delivered.
    return 1


if __name__ == "__main__":
    sys.exit(main())
"namespace": "default", "db": "postgres", + "probe_path": "/-/health/live/", "expected": [200, 204]}, + {"name": "listmonk", "namespace": "default", "db": "postgres", + "probe_path": "/", "expected": [200, 302]}, + {"name": "n8n", "namespace": "default", "db": "postgres", + "probe_path": "/healthz", "expected": [200]}, + {"name": "mattermost", "namespace": "default", "db": "postgres", + "probe_path": "/api/v4/system/ping", "expected": [200]}, + {"name": "vaultwarden", "namespace": "default", "db": null, + "probe_path": "/alive", "expected": [200]}, + {"name": "garage-webui", "namespace": "default", "db": null, + "probe_path": "/", "expected": [200, 302]} + ] +} diff --git a/k3s/health/home_lab_health.md b/k3s/health/home_lab_health.md new file mode 100644 index 0000000..6e008ca --- /dev/null +++ b/k3s/health/home_lab_health.md @@ -0,0 +1,319 @@ +# Homelab Health — Internal Checks Design + +**Status: design approved 2026-04-20. Ready to write implementation plan.** + +--- + +## Resume Notes (for next session) + +You and I brainstormed this design across one session. All design questions answered, +all three design sections approved. The next step per the brainstorming skill is: + +1. **This file exists** — design committed (or ready to commit). +2. **Next action:** invoke `superpowers:writing-plans` to turn this design into a + step-by-step implementation plan. +3. After the implementation plan is written, execute it (writing-plans → executing-plans). + +Do NOT re-open any design decisions in the new session unless something here is +obviously wrong; the decisions below are settled. + +**Test canary:** when verifying the installed system end-to-end, break **mediawiki** +(e.g. scale to 0 replicas), not the Ghost blogs. Ghosts are production, MediaWiki is +expendable for a "does the alert fire" test. + +--- + +## Goals + +Add a second layer of cluster health monitoring that runs **inside** the K3s cluster +and reports structural / semantic problems to NATS. 
The existing +`k3s/scripts/check-health.sh` (workstation-driven canary) stays in place unchanged. + +Requirements, as given: + +0. NATS itself up +1. MariaDB up +2. PostgreSQL up +3. Internal Ghost blog ports respond to HTTP correctly +4. All other services depending on MariaDB respond correctly +5. All services depending on PostgreSQL respond correctly +6. Something is listening at every NodePort + +Plus implicit: standalone services (Vaultwarden, Garage, etc.) also get probed. + +**Output contract:** publish NATS messages on subject `homelab_health_issue` with +JSON body: + +```json +{ + "component_name": "", + "issue_detail": "", + "detected_at": "", + "root_cause": "" +} +``` + +--- + +## Decisions (settled) + +| Decision | Choice | Why | +|---|---|---| +| Where it runs | systemd timer on **pve-control** | Master K3s control node; kubectl locally; always on. | +| Language | **Python 3** | User expertise; structured JSON; clean error handling. | +| HTTP probes | `requests` library | No subprocess per probe; in-process. | +| NATS publish | `nats-py` library | In-process; one cohesive Python process. | +| kubectl use | **subprocess** (kept for now) | Only two call sites; revisit later with `kubernetes` client. | +| DB auth for probes | **sidestepped** | Use `kubectl exec -- pg_isready` / `mariadb-admin ping`; no creds on pve-control. | +| Orchestration | Single script, one function per check category | Simple; matches "one function per check" ask. | +| Schedule | Every **10 minutes** | User said no more frequent than that. | +| Deduplication | **Stateless** | Re-fires every tick while failing; consumer handles aggregation. | +| Healthy publishes | **None** | Silent when OK. Only problems on the wire. | +| Recovery events | **None** | Reports stop when fixed; absence = healthy. | +| Service config | **JSON file** (`checks.json`) | Pythonic; easy to edit/commit; lives alongside `checker.py`. 
| +| NodePort discovery | **Live from `kubectl get svc -A -o json`** | Source of truth is the cluster; no drift. | +| NATS-down fallback | **stdout + non-zero exit** | Workstation canary + `systemctl status` surface failures. Future leaf/LAN NATS fallback via env var hook (deferred). | + +--- + +## Architecture + +**Deployment layout on pve-control:** + +``` +/opt/homelab-health/ +├── checker.py # Python entrypoint, one function per check +├── checks.json # service catalog + NATS/DB config +├── venv/ # virtualenv with nats-py, requests +/etc/systemd/system/ +├── homelab-health.service +└── homelab-health.timer +``` + +**Source of truth in repo:** + +``` +k3s/health/ +├── home_lab_health.md # this file +├── checker.py +├── checks.json +├── requirements.txt # nats-py, requests +├── install.sh # runs on pve-control, sets up venv + units +├── homelab-health.service +├── homelab-health.timer +└── tests/ + └── test_checks.py +``` + +**Runtime flow each tick:** + +1. Load `checks.json`. +2. Connect to NATS with a 3s timeout. On failure: log loud, still run checks, publish nothing, exit 1. +3. Run each check function in sequence, each wrapped in `try/except`; exceptions in one check never stop the others (they become a `healthcheck.<function_name>` meta-issue). +4. Each check returns `list[Issue]`. Main loop aggregates. +5. Log every issue to stdout (journal). +6. For each issue, publish one NATS message to `homelab_health_issue`. +7. Exit 0 if zero issues, 1 otherwise. `systemctl status` + journalctl give humans visibility. 
+ +--- + +## Config schema (`checks.json`) + +```json +{ + "nats": { + "url": "nats://nats.default.svc.cluster.local:4222", + "subject": "homelab_health_issue", + "monitoring_nodeport": 32388 + }, + "databases": [ + { + "name": "postgres", + "namespace": "default", + "pod_label": "app=postgres", + "probe_cmd": ["pg_isready", "-U", "postgres"] + }, + { + "name": "mariadb", + "namespace": "default", + "pod_label": "app=mariadb", + "probe_cmd": ["mariadb-admin", "ping", "--silent"] + } + ], + "services": [ + {"name": "ghost1", "namespace": "fulfillment", "db": "mariadb", + "probe_path": "/ghost/api/admin/site/", "expected": [200, 401]}, + {"name": "ghost2", "namespace": "fulfillment", "db": "mariadb", + "probe_path": "/ghost/api/admin/site/", "expected": [200, 401]}, + {"name": "ghost3", "namespace": "fulfillment", "db": "mariadb", + "probe_path": "/ghost/api/admin/site/", "expected": [200, 401]}, + {"name": "mediawiki", "namespace": "default", "db": "mariadb", + "probe_path": "/", "expected": [200, 302]}, + {"name": "forgejo", "namespace": "sjasoft", "db": "postgres", + "probe_path": "/api/healthz", "expected": [200]}, + {"name": "authentik-server", "namespace": "default", "db": "postgres", + "probe_path": "/-/health/live/", "expected": [200, 204]}, + {"name": "listmonk", "namespace": "default", "db": "postgres", + "probe_path": "/api/health", "expected": [200]}, + {"name": "n8n", "namespace": "default", "db": "postgres", + "probe_path": "/healthz", "expected": [200]}, + {"name": "mattermost", "namespace": "default", "db": "postgres", + "probe_path": "/api/v4/system/ping", "expected": [200]}, + {"name": "vaultwarden", "namespace": "default", "db": null, + "probe_path": "/alive", "expected": [200]}, + {"name": "garage", "namespace": "default", "db": null, + "probe_path": "/health", "expected": [200]}, + {"name": "garage-webui", "namespace": "default", "db": null, + "probe_path": "/", "expected": [200, 302]} + ] +} +``` + +**Probe URL resolution:** at runtime, `kubectl 
get svc -n <namespace> <service> -o json` → +extract `.spec.ports[].nodePort` → probe `http://localhost:<nodeport><probe_path>`. + +**Per-service silence:** add `"disabled": true` to a service entry to skip it without +deleting it. + +**Verify actual probe paths during implementation** — the paths above are reasonable +defaults but each needs a quick curl sanity check. Specifically double-check: +Authentik (`/-/health/live/` vs `/-/health/ready/`), Garage (root `/health` endpoint), +Vaultwarden (`/alive` returns 200 plain-text timestamp — confirmed), n8n (`/healthz`). + +--- + +## Check catalog + +One function per requirement, sharing an internal `probe_service(svc_cfg)` helper. + +| Function | Covers | Mechanism | +|---|---|---| +| `check_nats()` | #0 | `kubectl exec` NATS pod to run `nats server check connection`; fallback HTTP GET `localhost:<monitoring_nodeport>/healthz` | +| `check_postgres()` | #2 | `kubectl exec` postgres pod to run `pg_isready -U postgres` | +| `check_mariadb()` | #1 | `kubectl exec` mariadb pod to run `mariadb-admin ping --silent` | +| `check_ghost_blogs()` | #3 | `probe_service` for every service whose name starts with `ghost` | +| `check_mariadb_dependents()` | #4 | `probe_service` for every non-ghost service where `db == "mariadb"` | +| `check_postgres_dependents()` | #5 | `probe_service` for every service where `db == "postgres"` | +| `check_standalone_services()` | implicit | `probe_service` for every service where `db == null` | +| `check_all_nodeports()` | #6 | `kubectl get svc -A -o json`; for every `nodePort`, TCP connect `localhost:<nodePort>`; failure = nothing listening | + +**`probe_service(svc)`:** resolves NodePort via kubectl, calls +`requests.get(f"http://localhost:{nodeport}{svc['probe_path']}", timeout=10)`, +compares status to `expected`, returns an `Issue` on mismatch or on exception. + +**Root-cause hints in payload:** if `check_mariadb()` produced an issue this run, +any `check_mariadb_dependents()` failure gets `"root_cause": "mariadb unreachable"`. +Same pattern for postgres. 
Decorative — consumers decide what to do with it. + +--- + +## Error handling + +```python +def run_all_checks(cfg) -> list[Issue]: + issues = [] + for fn in [check_nats, check_postgres, check_mariadb, + check_ghost_blogs, check_mariadb_dependents, + check_postgres_dependents, check_standalone_services, + check_all_nodeports]: + try: + issues.extend(fn(cfg)) + except Exception as e: + issues.append(Issue( + component_name=f"healthcheck.{fn.__name__}", + issue_detail=f"check function raised: {type(e).__name__}: {e}", + detected_at=now_iso(), + root_cause="healthcheck bug or missing dependency")) + return issues +``` + +- No single check can halt the pipeline. +- NATS connect failure is loud-logged; checks still run; individual publish failures + are logged but don't stop the rest. +- `Issue` is a small dataclass; `to_dict()` serialises to the exact NATS payload schema. + +--- + +## Deployment + +**`install.sh` (run once on pve-control as samantha, with sudo where needed):** + +```bash +set -euo pipefail +sudo mkdir -p /opt/homelab-health +sudo rsync -a --delete ./ /opt/homelab-health/ --exclude=install.sh --exclude=tests +sudo chown -R samantha:samantha /opt/homelab-health +python3 -m venv /opt/homelab-health/venv +/opt/homelab-health/venv/bin/pip install -r /opt/homelab-health/requirements.txt +sudo cp homelab-health.service homelab-health.timer /etc/systemd/system/ +sudo systemctl daemon-reload +sudo systemctl enable --now homelab-health.timer +``` + +**`homelab-health.service`:** + +```ini +[Unit] +Description=Homelab internal health checks +After=network-online.target + +[Service] +Type=oneshot +User=samantha +ExecStart=/opt/homelab-health/venv/bin/python /opt/homelab-health/checker.py +StandardOutput=journal +StandardError=journal +``` + +**`homelab-health.timer`:** + +```ini +[Unit] +Description=Run homelab health checks every 10 minutes + +[Timer] +OnCalendar=*:0/10 +Persistent=true + +[Install] +WantedBy=timers.target +``` + +--- + +## Testing + +**Unit 
tests** (`tests/test_checks.py`, pytest): +- Each check function takes a config object — easily stubbed. +- `probe_service` accepts an injected HTTP client so tests don't hit real services. +- Mock `subprocess.run` for kubectl calls. +- Assert the exact `Issue` list returned for each failure shape. + +**Manual smoke test** — `checker.py --dry-run` logs all issues to stdout but skips +NATS publish. Run ad-hoc on pve-control during development. + +**End-to-end verification after install:** +1. `systemctl list-timers homelab-health.timer` shows next fire time. +2. Manually fire once: `sudo systemctl start homelab-health.service`. +3. `journalctl -u homelab-health -n 200` shows outcome. +4. On workstation: `nats sub homelab_health_issue` (against the cluster NATS). +5. Break **mediawiki** (`kubectl scale deploy mediawiki -n default --replicas=0`) and + wait ≤10 min — expect a message on the subject, with `component_name:"mediawiki"`. +6. Restore (`--replicas=1`) and confirm alerts stop on the next tick. + +--- + +## Open items / future + +- **Leaf/LAN NATS fallback:** add `FALLBACK_NATS_URL` env-var hook in `checker.py` + (unused for now). When the leaf NATS comes online, publish there too on connect + failure to primary. +- **NATS auth:** current assumption is local anonymous publish is allowed. If auth is + added, introduce a `nats.creds_path` field in `checks.json` pointing at a creds + file on pve-control. +- **k8s Python client migration:** replace the two remaining `kubectl` subprocess + calls with the `kubernetes` library for a fully in-process script. +- **Recovery events:** if downstream consumers want a "resolved" signal, add a small + local state file (JSON on disk) to detect transitions and publish recovery events. +- **Per-namespace grouping:** not needed now; if service list grows beyond ~25, + reconsider organizing `checks.json` by namespace for readability. 
diff --git a/k3s/health/homelab-health.service b/k3s/health/homelab-health.service new file mode 100644 index 0000000..19e965f --- /dev/null +++ b/k3s/health/homelab-health.service @@ -0,0 +1,14 @@ +[Unit] +Description=Homelab internal health checks +After=network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +User=samantha +WorkingDirectory=/home/samantha/homelab-health +Environment=KUBECONFIG=/home/samantha/.kube/config +Environment=PATH=/usr/local/bin:/usr/bin:/bin +ExecStart=/home/samantha/homelab-health/venv/bin/python /home/samantha/homelab-health/checker.py --config /home/samantha/homelab-health/checks.json +StandardOutput=journal +StandardError=journal diff --git a/k3s/health/homelab-health.timer b/k3s/health/homelab-health.timer new file mode 100644 index 0000000..dd6e0f6 --- /dev/null +++ b/k3s/health/homelab-health.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Run homelab health checks every 10 minutes + +[Timer] +OnBootSec=2min +OnUnitActiveSec=10min +Persistent=true +AccuracySec=30s + +[Install] +WantedBy=timers.target diff --git a/k3s/health/requirements.txt b/k3s/health/requirements.txt new file mode 100644 index 0000000..4f3713e --- /dev/null +++ b/k3s/health/requirements.txt @@ -0,0 +1,2 @@ +nats-py==2.9.0 +requests==2.32.3