diff --git a/CLAUDE.md b/CLAUDE.md index 1fa3a8b..e655893 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -330,7 +330,9 @@ On memory-constrained hosts (RPi5, 4GB RAM), n8n loop iterations accumulate item 1. **Write to disk early** — In Extract nodes, write raw bytes to `{jobDir}/` and return only the file path 2. **Pass paths, not data** — Loop items carry `imageFile`/`audioFile` paths instead of base64 strings -3. **Read back late** — In the final assembly node (e.g., Prepare APKG Input), read files from disk just before output +3. **Stream output to disk, never re-materialize** — The final assembly node (e.g., Prepare APKG Input) must **stream** its output: open the target file once (`fs.openSync`), then for each item read its binary from disk, `fs.writeSync` it into the output, and immediately null the per-item base64 vars before the next iteration. Return only the output file path — never build one in-memory object that holds every item's bytes at once. + +⚠️ Do NOT "read all files back into memory just before output" — that re-materializes the entire payload and OOM'd the host again after PR #438. The streaming fix is `e098d06`; the task-runner heap bump in that commit was added headroom, not the mechanism — the real fix is never holding the full payload in memory. This prevents OOM when processing 40+ items with ~200KB each of image + audio data. 
diff --git a/hosts/zero-kuzea/configuration.nix b/hosts/zero-kuzea/configuration.nix index 088cd1d..9dc616e 100644 --- a/hosts/zero-kuzea/configuration.nix +++ b/hosts/zero-kuzea/configuration.nix @@ -22,6 +22,14 @@ networking.hostName = "zero-kuzea"; system.stateVersion = "25.05"; + # Swap space (2GB for OOM headroom on 4GB CX22 — no disk swap otherwise) + swapDevices = [ + { + device = "/swapfile"; + size = 2048; # 2GB + } + ]; + # ── SSH authorized keys ───────────────────────────────────────────── users.users.root.openssh.authorizedKeys.keys = [ "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPw5RFrFfZQUWlyfGSU1Q8BlEHnvIdBtcnCn+uYtEzal nixos-sancta-choir" diff --git a/modules/services/backup-pull.nix b/modules/services/backup-pull.nix index 0d6cf63..a706d44 100644 --- a/modules/services/backup-pull.nix +++ b/modules/services/backup-pull.nix @@ -198,6 +198,9 @@ in ExecStart = "${pkgs.restic}/bin/restic -r ${cfg.repository} --password-file ${cfg.resticPasswordFile} check"; Nice = 19; IOSchedulingClass = "idle"; + # A full repo check can far exceed the 90s host default; unbounded so it + # is not SIGTERM'd mid-check (which also fires a false backup-failure-alert). + TimeoutStartSec = 0; }; }; @@ -217,6 +220,9 @@ in OnFailure = [ "backup-failure-alert@%N.service" ]; RequiresMountsFor = [ cfg.stagingDir ]; }; + # A large backup run can exceed the 90s host default; give it generous + # headroom so it is not SIGTERM'd mid-run (which fires a false alert). + serviceConfig.TimeoutStartSec = "4h"; }; diff --git a/modules/services/n8n.nix b/modules/services/n8n.nix index 1b8a5e5..d95b1cb 100644 --- a/modules/services/n8n.nix +++ b/modules/services/n8n.nix @@ -854,6 +854,9 @@ in Type = "oneshot"; RemainAfterExit = true; # Run as root to read password file and restart n8n + # ~180s of internal wait-loops (healthz + login retries); raise above + # the 90s host default so it is not SIGTERM'd mid-install. 
+ TimeoutStartSec = 200; }; path = [ pkgs.curl pkgs.jq ]; diff --git a/modules/services/open-webui-functions/openrouter_zdr_pipe.py b/modules/services/open-webui-functions/openrouter_zdr_pipe.py index e2affd2..3698a5d 100644 --- a/modules/services/open-webui-functions/openrouter_zdr_pipe.py +++ b/modules/services/open-webui-functions/openrouter_zdr_pipe.py @@ -89,15 +89,25 @@ def pipes(self) -> list[dict]: zdr_models = [] for item in zdr_data: - # Extract model ID from "Provider | model-id" format - # Use rsplit to handle edge cases where provider names might contain " | " - # Use .get() to avoid KeyError on malformed data - name = item.get("name", "") - if not name: - continue # Skip items without a name field - - name_parts = name.rsplit(" | ", 1) - model_id = name_parts[1] if len(name_parts) > 1 else name + if not isinstance(item, dict): + continue + + # The canonical model identifier on each `/endpoints/zdr` entry + # is `model_id` ("qwen/qwen3-coder:free") — the slug inbound + # requests actually send. `name` is human-readable + # ("Venice | Qwen3 Coder"); deriving the id from a + # `name.rsplit(" | ", 1)` tail leaks the display fragment + # ("Qwen3 Coder") for providers whose right half is the + # model_name, so the selector filled with ids no request could + # match. Read the canonical field first; keep `id` and the + # name-rsplit path only as last-resort fallbacks. 
+ model_id = item.get("model_id") or item.get("id") or "" + if not model_id: + name = item.get("name", "") + if not name: + continue # Skip items without any identifiable field + name_parts = name.rsplit(" | ", 1) + model_id = name_parts[1] if len(name_parts) > 1 else name # Skip if model_id is empty or already seen if not model_id or model_id in seen_ids: diff --git a/modules/services/open-webui.nix b/modules/services/open-webui.nix index 341c6d9..25ed09e 100644 --- a/modules/services/open-webui.nix +++ b/modules/services/open-webui.nix @@ -1155,6 +1155,9 @@ in # Run as open-webui user for database and state directory access User = "open-webui"; Group = "open-webui"; + # ~90s of sequential wait-loops; raise above the 90s host default so it + # is not SIGTERM'd mid-migration. + TimeoutStartSec = 300; }; script = '' @@ -1311,6 +1314,9 @@ in # Run as open-webui user for database access User = "open-webui"; Group = "open-webui"; + # ~90s of sequential wait-loops; raise above the 90s host default so it + # is not SIGTERM'd mid-provision. + TimeoutStartSec = 300; }; script = '' @@ -1482,6 +1488,9 @@ in RemainAfterExit = true; User = "open-webui"; Group = "open-webui"; + # ~90s of sequential wait-loops; raise above the 90s host default so it + # is not SIGTERM'd mid-provision. + TimeoutStartSec = 300; }; script = '' diff --git a/scripts/deploy.sh b/scripts/deploy.sh index 35244b7..ce5532b 100755 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -118,7 +118,7 @@ echo "" # Build configuration echo "Building configuration..." -if nixos-rebuild build --flake "$FLAKE_REF"; then +if nixos-rebuild build --flake "$FLAKE_REF" --max-jobs 1 --cores 1; then echo "Build successful!" else echo "Build failed!" @@ -149,7 +149,7 @@ else echo "Applying configuration..." fi -if $SUDO nixos-rebuild switch --flake "$FLAKE_REF"; then +if $SUDO nixos-rebuild switch --flake "$FLAKE_REF" --max-jobs 1 --cores 1; then echo "" echo "Deployment complete!" 
echo "" diff --git a/scripts/install.sh b/scripts/install.sh index 482302b..dfa6037 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -97,7 +97,7 @@ echo "" # First, try to build without applying echo "🔨 Testing build (dry-run)..." -if $SUDO nixos-rebuild build --flake "$FLAKE_URL"; then +if $SUDO nixos-rebuild build --flake "$FLAKE_URL" --max-jobs 1 --cores 1; then echo "✅ Build successful!" echo "" else @@ -123,7 +123,7 @@ fi # Apply the configuration echo "🚀 Applying configuration..." -if $SUDO nixos-rebuild switch --flake "$FLAKE_URL"; then +if $SUDO nixos-rebuild switch --flake "$FLAKE_URL" --max-jobs 1 --cores 1; then echo "" echo "✅ Installation complete!" echo "" diff --git a/tests/test_openrouter_zdr_pipe.py b/tests/test_openrouter_zdr_pipe.py index fb45a5f..0b43f99 100644 --- a/tests/test_openrouter_zdr_pipe.py +++ b/tests/test_openrouter_zdr_pipe.py @@ -39,17 +39,23 @@ def mock_requests_get(monkeypatch): def _mock_get(url, headers=None, timeout=None): # Determine which endpoint is being called if url.endswith("/endpoints/zdr"): - # Return a list of ZDR-compliant models (format: "Provider | model-id") + # Mirror the REAL upstream shape: `name` is human-readable + # ("Provider | Display Name") and the canonical slug lives in + # a separate `model_id` field. If the parser fell back to + # rsplit-on-name it would leak "GPT-4o Mini" into the id, which the + # canonical-field assertions below would catch. 
return mock.Mock( status_code=200, json=lambda: { "data": [ { - "name": "OpenAI | openrouter/gpt-4o-mini", + "name": "OpenAI | GPT-4o Mini", + "model_id": "openrouter/gpt-4o-mini", "model_name": "GPT-4o Mini", }, { - "name": "OpenAI | openrouter/gpt-4o", + "name": "OpenAI | GPT-4o", + "model_id": "openrouter/gpt-4o", "model_name": "GPT-4o", }, ] @@ -108,6 +114,24 @@ def test_pipes_returns_only_zdr_models(pipe, mock_requests_get): assert m["name"].startswith("ZDR/") +def test_pipes_id_is_canonical_model_id_not_display_name(pipe, mock_requests_get): + """Regression for #457: produced ids must equal the canonical model_id, + never a human-readable display fragment leaked from `name.rsplit`.""" + pipe.valves.OPENROUTER_API_KEY = "dummy-key" + + models = pipe.pipes() + + # The canonical model_id values from the stub upstream response. + expected_ids = {"openrouter/gpt-4o-mini", "openrouter/gpt-4o"} + produced_ids = {m["id"] for m in models} + assert produced_ids == expected_ids + + # No display fragment (the human-readable right half of `name`) should ever + # appear as a model id. + display_fragments = {"GPT-4o Mini", "GPT-4o"} + assert produced_ids.isdisjoint(display_fragments) + + def test_pipes_handles_missing_api_key(pipe): """If no API key is provided, an error model is returned.""" pipe.valves.OPENROUTER_API_KEY = ""