diff --git a/CLAUDE.md b/CLAUDE.md index e655893..2becef8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -19,6 +19,8 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co The `sancta-choir` configuration (x86_64 Hetzner VPS) hosts the **OpenClaw AI gateway**. It is built in CI but deployed separately. Only build or deploy `sancta-choir` when the user explicitly asks for it by name. +> **⚠️ Remote VPS deploys are NOT atomic.** On the GRUB-based Hetzner hosts (`sancta-choir`, `sancta-claw`, `hermes-claw`), `nixos-rebuild switch` builds → updates GRUB → activates; an OOM-killed build on a small-RAM box can leave an unbootable system (the ~10h #252 outage). For any remote VPS deploy: gate the switch on a successful build **and throttle it** — `nixos-rebuild build --flake .#<host> --max-jobs 1 --cores 1 && nixos-rebuild switch …` — and prefer `nixos-rebuild boot` (or `system.autoUpgrade.operation = "boot"`) for risky changes, so a bad generation only needs a reboot. `scripts/deploy.sh` and `scripts/install.sh` already apply the `--max-jobs 1 --cores 1` throttle; all three VPS hosts carry a swapfile. + **Tailscale hostname:** - `rpi5` or `rpi5.tail4249a9.ts.net` diff --git a/hosts/hermes-claw/configuration.nix b/hosts/hermes-claw/configuration.nix index 0d6b772..ace4d32 100644 --- a/hosts/hermes-claw/configuration.nix +++ b/hosts/hermes-claw/configuration.nix @@ -53,6 +53,18 @@ useUserPackages = true; }; + # ── Swap (4GB — OOM headroom for builds/manual switch on this CX33 GRUB VPS) ── + # No disk swap otherwise (only RAM-backed zram from common.nix, useless under a + # build RSS spike). Mirrors sancta-choir; guards against the #451/#252 build-OOM + # brick on a remote GRUB host. No kernel pin — hermes-claw has no corrupted-store + # history, and a prophylactic pin would just be unretired maintenance debt. 
+ swapDevices = [ + { + device = "/swapfile"; + size = 4096; # 4GB + } + ]; + # ── SSH authorized keys (cross-host management from sancta-choir + rpi5) ── users.users.root.openssh.authorizedKeys.keys = [ "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPw5RFrFfZQUWlyfGSU1Q8BlEHnvIdBtcnCn+uYtEzal nixos-sancta-choir" diff --git a/hosts/sancta-choir/configuration.nix b/hosts/sancta-choir/configuration.nix index 7a7043f..59a392f 100644 --- a/hosts/sancta-choir/configuration.nix +++ b/hosts/sancta-choir/configuration.nix @@ -6,7 +6,14 @@ }: { - # Pin kernel to 6.6 LTS to avoid store corruption from incomplete 6.12 build + # Pin kernel to 6.6 LTS — workaround for the Feb-2026 #252 incident, where an + # OOM-killed build left corrupted 6.12.63 store paths that would not boot. The + # 6.12 kernel itself is not broken; the corruption was build-time only. + # EXIT CRITERIA (unpin only when ALL hold): (1) GC the corrupt paths on-host + # (`nix-collect-garbage -d`); (2) a clean `nixos-rebuild build` of a 6.12 kernel + # succeeds on-host; (3) validate via `nixos-rebuild boot` + manual reboot first. + # Do NOT unpin remotely without that validation — a bad kernel on a headless VPS + # needs rescue-mode recovery. boot.kernelPackages = pkgs.linuxPackages_6_6; # Enable aarch64 emulation for cross-building RPi5 images diff --git a/hosts/sancta-claw/configuration.nix b/hosts/sancta-claw/configuration.nix index 5980a67..b18278d 100644 --- a/hosts/sancta-claw/configuration.nix +++ b/hosts/sancta-claw/configuration.nix @@ -107,10 +107,14 @@ # --update-input is intentionally omitted: with a remote GitHub flake URL # there is no local path to write an updated lock file back to, so the flag # would be a no-op. allowReboot=false: never reboots automatically (VPS — - # schedule manual reboots for kernel updates). + # schedule manual reboots for kernel updates). 
operation="boot": build the new + # generation and set it as the boot default WITHOUT activating it — avoids the + # non-atomic GRUB-mutating `switch` on an unattended remote box (#451/#252); the + # update takes effect on the next manual reboot. system.autoUpgrade = { enable = true; flake = "github:alexandru-savinov/nixos-config#sancta-claw"; + operation = "boot"; dates = "04:30"; randomizedDelaySec = "30min"; allowReboot = false;