amd · kovtcharov · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
@@ -144,10 +144,15 @@ def _changed_software_versions(existing: List[Dict]) -> List[str]:
 # Constants
 # ============================================================================
 
-#: Embedding model served by Lemonade — 768-dim, MOE architecture.
+#: Default embedder served by Lemonade — 768-dim GGUF (GPU/CPU profiles).
+#: The active embedder is per-instance (``self._embedding_model``) and may be
+#: the NPU-native FLM embedder instead; see ``init_memory`` (#1744). These
+#: module constants remain the fallback default.
 EMBEDDING_MODEL = "nomic-embed-text-v2-moe-GGUF"
 
-#: Embedding dimensionality for nomic-embed-text-v2-moe.
+#: Default embedding dimensionality (nomic-embed-text-v2-moe). The active dim is
+#: derived from the live embedder at startup (``self._embedding_dim``); this is
+#: only the pre-probe fallback.
 EMBEDDING_DIM = 768
 
 #: Cross-encoder model for reranking (~22 MB, runs on CPU).
@@ -306,7 +311,10 @@ class MemoryMixin:
     """
 
     def init_memory(
-        self, db_path: Optional[Path] = None, context: str = "global"
+        self,
+        db_path: Optional[Path] = None,
+        context: str = "global",
+        embedding_model: Optional[str] = None,
     ) -> None:
         """Initialize the memory subsystem (v2 startup sequence).
 
@@ -319,6 +327,11 @@ def init_memory(
         Args:
             db_path: Optional path for the DB file. Default: ~/.gaia/memory.db
             context: Active context scope (e.g., 'work', 'personal', 'global').
+            embedding_model: Embedder model id. Defaults to ``EMBEDDING_MODEL``
+                (GGUF nomic). The NPU profile passes the FLM-native embedder so
+                chat and embeddings stay co-resident on the NPU backend (#1744).
+                The embedding dimension is derived from the live embedder, not
+                this id, so a model with a different dim works without changes.
 
         Raises:
             RuntimeError: If Lemonade embedding service is unreachable
@@ -329,6 +342,10 @@ def init_memory(
+        self._embedding_model = embedding_model or EMBEDDING_MODEL
+        # Pre-probe default; refined from the live embedder below.
+        self._embedding_dim = EMBEDDING_DIM
+
         # Explicit opt-out for environments that don't need memory (security
         # tests, lint-time imports, etc.).  This is NOT a silent fallback —
         # the user/test author has explicitly set the env var.
         if os.environ.get("GAIA_MEMORY_DISABLED") == "1":
             logger.info(
                 "[MemoryMixin] memory disabled via GAIA_MEMORY_DISABLED=1; "
@@ -381,17 +398,39 @@ def init_memory(
         # via ``_memory_store is None`` checks at every memory operation.
         try:
             self._get_embedder()
-            # Validate connectivity with a small test embedding
+            # Validate connectivity AND derive the embedding dimension from the
+            # live embedder — different embedders (e.g. the NPU FLM embedder)
+            # have different dims, so the FAISS index must match the active
+            # model rather than a hardcoded constant (#1744).
             test_vec = self._embed_text("connectivity test")
-            if test_vec.shape[0] != EMBEDDING_DIM:
+            dim = int(test_vec.shape[0])
+            if dim <= 0:
                 raise RuntimeError(
-                    f"Embedding dimension mismatch: expected {EMBEDDING_DIM}, "
-                    f"got {test_vec.shape[0]}"
+                    f"Embedder '{self._embedding_model}' returned a 0-length "
+                    "vector. Check that the model is loaded in Lemonade."
                 )
+            self._embedding_dim = dim
             logger.info(
-                "[MemoryMixin] Lemonade embedding service validated (%d-dim)",
-                EMBEDDING_DIM,
+                "[MemoryMixin] Lemonade embedding service validated "
+                "(model=%s, %d-dim)",
+                self._embedding_model,
+                self._embedding_dim,
             )
+            # Invalidate stored vectors when the embedder changed: another model
+            # is another vector space (even at the same dim), so stale vectors
+            # silently corrupt search; clearing makes backfill re-embed them.
+            # NOTE(review): unstamped DBs (prior=None) are not cleared — confirm.
+            prior = self._memory_store.get_embedder_id()
+            if prior is not None and prior != self._embedding_model:
+                cleared = self._memory_store.clear_all_embeddings()
+                logger.warning(
+                    "[MemoryMixin] embedder changed (%s -> %s); cleared %d stored "
+                    "embedding(s) for re-embedding",
+                    prior,
+                    self._embedding_model,
+                    cleared,
+                )
+            self._memory_store.set_embedder_id(self._embedding_model)
         except Exception as e:
             logger.warning(
                 "[MemoryMixin] Lemonade embedding service unreachable — "
@@ -604,7 +643,7 @@ def _get_embedder(self) -> Any:
         try:
             from gaia.llm.providers.lemonade import LemonadeProvider
 
-            self._embedder = LemonadeProvider(model=EMBEDDING_MODEL)
+            self._embedder = LemonadeProvider(model=self._embedding_model)
             logger.debug("[MemoryMixin] LemonadeProvider initialized for embeddings")
             return self._embedder
         except Exception as e:
@@ -613,20 +652,20 @@ def _get_embedder(self) -> Any:
             ) from e
 
     def _embed_text(self, text: str) -> np.ndarray:
-        """Embed text via Lemonade (nomic-embed-text-v2-moe-GGUF, 768-dim).
+        """Embed text via Lemonade using the active embedder.
 
         Required, not optional. Raises RuntimeError if embedding fails.
 
         Args:
             text: Text to embed.
 
         Returns:
-            L2-normalized float32 numpy array of shape (768,).
+            L2-normalized float32 numpy array of shape ``(self._embedding_dim,)``.
         """
         embedder = self._get_embedder()
         try:
             # LemonadeProvider.embed() returns list[list[float]]
-            results = embedder.embed([text], model=EMBEDDING_MODEL)
+            results = embedder.embed([text], model=self._embedding_model)
             vec = np.array(results[0], dtype=np.float32)
 
             # L2-normalize for cosine similarity via IndexFlatIP
@@ -684,13 +723,13 @@ def _rebuild_faiss_index(self) -> None:
         # Get all active knowledge items that have embeddings
         items = store.get_items_with_embeddings(include_sensitive=True)
 
-        index = faiss.IndexFlatIP(EMBEDDING_DIM)
+        index = faiss.IndexFlatIP(self._embedding_dim)
         id_map = []
 
         for item in items:
             try:
                 vec = _blob_to_embedding(item["embedding"])
-                if vec.shape[0] != EMBEDDING_DIM:
+                if vec.shape[0] != self._embedding_dim:
                     logger.debug(
                         "[MemoryMixin] skipping embedding for %s: wrong dim %d",
                         item["id"],
@@ -750,19 +789,19 @@ def _faiss_remove(self, knowledge_id: str) -> None:
             # Reconstruct all vectors except the removed one
             n = self._faiss_index.ntotal
             if n <= 1:
-                self._faiss_index = faiss.IndexFlatIP(EMBEDDING_DIM)
+                self._faiss_index = faiss.IndexFlatIP(self._embedding_dim)
                 self._faiss_id_map = []
                 return
 
-            all_vecs = np.zeros((n, EMBEDDING_DIM), dtype=np.float32)
+            all_vecs = np.zeros((n, self._embedding_dim), dtype=np.float32)
             for i in range(n):
                 all_vecs[i] = self._faiss_index.reconstruct(i)
 
             # Remove the target vector
             keep_vecs = np.delete(all_vecs, idx, axis=0)
             keep_ids = self._faiss_id_map[:idx] + self._faiss_id_map[idx + 1 :]
 
-            new_index = faiss.IndexFlatIP(EMBEDDING_DIM)
+            new_index = faiss.IndexFlatIP(self._embedding_dim)
             new_index.add(keep_vecs)
             self._faiss_index = new_index
             self._faiss_id_map = keep_ids
@@ -1411,7 +1450,7 @@ def reconcile_memory(self, max_pairs: int = 20) -> Dict:
         for item in items:
             try:
                 vec = _blob_to_embedding(item["embedding"])
-                if vec.shape[0] != EMBEDDING_DIM:
+                if vec.shape[0] != self._embedding_dim:
                     continue
                 norm = np.linalg.norm(vec)
                 if norm > 0:
@@ -1431,7 +1470,7 @@ def reconcile_memory(self, max_pairs: int = 20) -> Dict:
         try:
             import faiss
 
-            temp_index = faiss.IndexFlatIP(EMBEDDING_DIM)
+            temp_index = faiss.IndexFlatIP(self._embedding_dim)
             temp_index.add(mat)
 
             # Search each item for its top-5 neighbors

@@ -237,6 +237,15 @@ def _safe_json_loads(value) -> object:
 -- Knowledge FTS5 (standalone, manually synced, porter stemmer for morphological matching)
 CREATE VIRTUAL TABLE IF NOT EXISTS knowledge_fts USING fts5(content, domain, category, tokenize='porter unicode61');
 
+-- Key/value metadata (e.g. the embedder id that produced stored vectors).
+-- Lives in the DB (not a sidecar file) so it is atomic with the data it
+-- describes and visible to every connection — the agent and the UI memory
+-- router open the same DB from different code paths (#1744).
+CREATE TABLE IF NOT EXISTS meta (
+    key   TEXT PRIMARY KEY,
+    value TEXT
+);
+
 -- Tool history
 CREATE TABLE IF NOT EXISTS tool_history (
     id          INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -1391,6 +1400,55 @@ def store_embedding(self, knowledge_id: str, embedding: bytes) -> bool:
                 self._conn.rollback()
                 raise
 
+    def clear_all_embeddings(self) -> int:
+        """Reset every stored embedding to NULL and report how many changed.
+
+        Needed when the active embedder changes: vectors produced by another
+        model belong to a different vector space (and may differ in length),
+        so keeping them would silently corrupt similarity search. NULLed rows
+        are left for backfill to re-embed. Rolls back and re-raises on
+        failure; returns the number of rows cleared.
+        """
+        sql = "UPDATE knowledge SET embedding = NULL WHERE embedding IS NOT NULL"
+        with self._lock:
+            try:
+                cursor = self._conn.execute(sql)
+                self._conn.commit()
+            except Exception:
+                self._conn.rollback()
+                raise
+            return cursor.rowcount
+
+    #: ``meta`` key recording which embedder produced the stored vectors.
+    _EMBEDDER_META_KEY = "embedder_id"
+
+    def get_embedder_id(self) -> Optional[str]:
+        """Look up which embedder model id is stamped on the stored vectors.
+
+        Reads the ``_EMBEDDER_META_KEY`` row of ``meta``; ``None`` if absent.
+        """
+        query = "SELECT value FROM meta WHERE key = ?"
+        with self._lock:
+            cursor = self._conn.execute(query, (self._EMBEDDER_META_KEY,))
+            stamped = cursor.fetchone()
+        if not stamped:
+            return None
+        return stamped[0]
+
+    def set_embedder_id(self, model_id: str) -> None:
+        """Stamp ``model_id`` into ``meta`` as the embedder of stored vectors."""
+        upsert = (
+            "INSERT INTO meta (key, value) VALUES (?, ?) "
+            "ON CONFLICT(key) DO UPDATE SET value = excluded.value"
+        )
+        with self._lock:
+            try:
+                self._conn.execute(upsert, (self._EMBEDDER_META_KEY, model_id))
+                self._conn.commit()
+            except Exception:
+                self._conn.rollback()
+                raise
+
     def get_items_with_embeddings(
         self,
         category: str | None = None,

@@ -18,12 +18,13 @@
 
 from gaia.agents.base.agent import Agent, default_max_steps
 from gaia.agents.base.console import AgentConsole
-from gaia.agents.base.memory import EMBEDDING_MODEL, MemoryMixin
+from gaia.agents.base.memory import MemoryMixin
 from gaia.agents.base.tool_loader import ToolLoader
 from gaia.agents.base.tools import _TOOL_REGISTRY
 from gaia.agents.chat.session import SessionManager
 from gaia.agents.chat.tool_bundles import DOC_BUNDLES, DOC_CORE_TOOLS
 from gaia.agents.chat.tools import FileToolsMixin
+from gaia.agents.registry import get_embedding_model_for_device
 from gaia.agents.tools import FileSystemToolsMixin  # Enhanced file system navigation
 from gaia.agents.tools import ScratchpadToolsMixin  # Structured data analysis
 from gaia.agents.tools import (  # Web browsing and search; Shared tools
@@ -253,10 +254,17 @@ def __init__(self, config: Optional[ChatAgentConfig] = None):
             else os.getenv("LEMONADE_BASE_URL", "http://localhost:13305/api/v1")
         )
 
+        # Embedder is device-scoped: the NPU profile uses the FLM-native
+        # embedder so chat and embeddings stay co-resident on the NPU backend
+        # (a GGUF embedder runs on Vulkan and evicts the FLM chat model every
+        # turn — #1744). GPU/CPU keep the GGUF nomic embedder.
+        effective_embedding_model = get_embedding_model_for_device(config.device)
+
         # Initialize RAG SDK (optional - will be None if dependencies not installed)
         try:
             rag_config = RAGConfig(
                 model=effective_model_id,
+                embedding_model=effective_embedding_model,
                 chunk_size=config.chunk_size,
                 chunk_overlap=config.chunk_overlap,  # Configurable overlap for context preservation
                 max_chunks=config.max_chunks,
@@ -347,7 +355,7 @@ def __init__(self, config: Optional[ChatAgentConfig] = None):
         self.tool_loader = self._maybe_build_tool_loader()
 
         # Initialize memory subsystem (before super().__init__ which calls _register_tools)
-        self.init_memory()
+        self.init_memory(embedding_model=effective_embedding_model)
 
         # Store base URL for use in _register_tools() (VLM, etc.)
         self._base_url = effective_base_url
@@ -498,7 +506,7 @@ def _embed_texts_batch(self, texts) -> "Any":
         """
         import numpy as np
 
-        results = self._get_embedder().embed(list(texts), model=EMBEDDING_MODEL)
+        results = self._get_embedder().embed(list(texts), model=self._embedding_model)
         vecs = np.asarray(results, dtype=np.float32)
         norms = np.linalg.norm(vecs, axis=1, keepdims=True)
         norms[norms == 0] = 1.0

@@ -223,6 +223,11 @@ def _compute_custom_origin_hash(py_file: Path) -> str:
     return hashlib.sha256(py_file.read_bytes()).hexdigest()[:16]
 
 
+# Default embedder (GPU/CPU). The NPU device overrides this with the FLM-native
+# embedder so chat + embeddings stay co-resident on the NPU backend (#1744).
+DEFAULT_EMBEDDING_MODEL = "nomic-embed-text-v2-moe-GGUF"
+
+
 @dataclass
 class DeviceConfig:
     """A verified (device, model, recipe, backend) configuration for an agent.
@@ -241,6 +246,10 @@ class DeviceConfig:
         verified: Whether this combination has been tested end-to-end via
             agent eval.  Unverified configs show a warning badge in the UI.
         ctx_size: Default context window size for this configuration.
+        embedding_model: Embedder model id for RAG/memory on this device. NPU
+            uses the FLM-native embedder so the chat model and embedder stay
+            co-resident on the NPU backend; a GGUF embedder runs on Vulkan and
+            evicts the FLM chat model every turn on a shared-memory APU (#1744).
     """
 
     device: Literal["cpu", "gpu", "npu"]
@@ -249,6 +258,7 @@ class DeviceConfig:
     backend: str
     verified: bool = False
     ctx_size: int = 32768
+    embedding_model: str = DEFAULT_EMBEDDING_MODEL
 
 
 # Default device configurations for built-in agents using Gemma 4 E4B.
@@ -277,10 +287,28 @@ class DeviceConfig:
         backend="flm:npu",
         verified=True,
         ctx_size=4096,
+        # FLM-native embedder so chat + embeddings stay co-resident on the NPU
+        # backend and don't thrash NPU<->Vulkan every turn (#1744).
+        embedding_model="embed-gemma-300m-FLM",
     ),
 ]
 
 
+def get_embedding_model_for_device(device: Optional[str]) -> str:
+    """Look up the embedder model id configured for a device target.
+
+    ``DEFAULT_DEVICE_CONFIGS`` is the single source of truth, keeping the
+    embedder next to each device's chat model/recipe/backend. The first entry
+    whose ``device`` matches wins (NPU carries the FLM-native embedder — see
+    ``DeviceConfig.embedding_model``); any other value, including ``None``,
+    falls back to ``DEFAULT_EMBEDDING_MODEL``, the GGUF nomic embedder.
+    """
+    matches = (
+        cfg.embedding_model for cfg in DEFAULT_DEVICE_CONFIGS if cfg.device == device
+    )
+    return next(matches, DEFAULT_EMBEDDING_MODEL)
+
+
 @dataclass
 class ModelTier:
     """A selectable model size for an agent (#1162).

@@ -98,7 +98,11 @@
     "npu": {
         "description": "Ryzen AI NPU acceleration via FLM backend (requires XDNA2 NPU)",
         "agent": "chat",
-        "models": ["gemma4-it-e2b-FLM"],
+        # FLM chat model + FLM-native embedder so chat and embeddings stay
+        # co-resident on the NPU backend. A GGUF embedder would run on Vulkan
+        # and evict the FLM chat model every turn (#1744). Both are built-in
+        # Lemonade *-FLM models, pulled by name only (no recipe — #1655).
+        "models": ["gemma4-it-e2b-FLM", "embed-gemma-300m-FLM"],
         "approx_size": "~3 GB",
         "min_lemonade_version": "10.2.0",
         # FLM default context on NPU. Smaller than GPU (32768) because NPU