Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 59 additions & 20 deletions src/gaia/agents/base/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,15 @@ def _changed_software_versions(existing: List[Dict]) -> List[str]:
# Constants
# ============================================================================

#: Embedding model served by Lemonade — 768-dim, MOE architecture.
#: Default embedder served by Lemonade — 768-dim GGUF (GPU/CPU profiles).
#: The active embedder is per-instance (``self._embedding_model``) and may be
#: the NPU-native FLM embedder instead; see ``init_memory`` (#1744). These
#: module constants remain the fallback default.
EMBEDDING_MODEL = "nomic-embed-text-v2-moe-GGUF"

#: Embedding dimensionality for nomic-embed-text-v2-moe.
#: Default embedding dimensionality (nomic-embed-text-v2-moe). The active dim is
#: derived from the live embedder at startup (``self._embedding_dim``); this is
#: only the pre-probe fallback.
EMBEDDING_DIM = 768

#: Cross-encoder model for reranking (~22 MB, runs on CPU).
Expand Down Expand Up @@ -306,7 +311,10 @@ class MemoryMixin:
"""

def init_memory(
self, db_path: Optional[Path] = None, context: str = "global"
self,
db_path: Optional[Path] = None,
context: str = "global",
embedding_model: Optional[str] = None,
) -> None:
"""Initialize the memory subsystem (v2 startup sequence).

Expand All @@ -319,6 +327,11 @@ def init_memory(
Args:
db_path: Optional path for the DB file. Default: ~/.gaia/memory.db
context: Active context scope (e.g., 'work', 'personal', 'global').
embedding_model: Embedder model id. Defaults to ``EMBEDDING_MODEL``
(GGUF nomic). The NPU profile passes the FLM-native embedder so
chat and embeddings stay co-resident on the NPU backend (#1744).
The embedding dimension is derived from the live embedder, not
this id, so a model with a different dim works without changes.

Raises:
RuntimeError: If Lemonade embedding service is unreachable
Expand All @@ -329,6 +342,10 @@ def init_memory(
# Explicit opt-out for environments that don't need memory (security
# tests, lint-time imports, etc.). This is NOT a silent fallback —
# the user/test author has explicitly set the env var.
self._embedding_model = embedding_model or EMBEDDING_MODEL
# Pre-probe default; refined from the live embedder below.
self._embedding_dim = EMBEDDING_DIM

if os.environ.get("GAIA_MEMORY_DISABLED") == "1":
logger.info(
"[MemoryMixin] memory disabled via GAIA_MEMORY_DISABLED=1; "
Expand Down Expand Up @@ -381,17 +398,39 @@ def init_memory(
# via ``_memory_store is None`` checks at every memory operation.
try:
self._get_embedder()
# Validate connectivity with a small test embedding
# Validate connectivity AND derive the embedding dimension from the
# live embedder — different embedders (e.g. the NPU FLM embedder)
# have different dims, so the FAISS index must match the active
# model rather than a hardcoded constant (#1744).
test_vec = self._embed_text("connectivity test")
if test_vec.shape[0] != EMBEDDING_DIM:
dim = int(test_vec.shape[0])
if dim <= 0:
raise RuntimeError(
f"Embedding dimension mismatch: expected {EMBEDDING_DIM}, "
f"got {test_vec.shape[0]}"
f"Embedder '{self._embedding_model}' returned a 0-length "
"vector. Check that the model is loaded in Lemonade."
)
self._embedding_dim = dim
logger.info(
"[MemoryMixin] Lemonade embedding service validated (%d-dim)",
EMBEDDING_DIM,
"[MemoryMixin] Lemonade embedding service validated "
"(model=%s, %d-dim)",
self._embedding_model,
self._embedding_dim,
)
# Invalidate stored vectors when the embedder changed. Vectors from
# a different model live in a different vector space (even at the
# same dim), so reusing them silently corrupts similarity search.
# Clearing forces backfill to re-embed with the active model.
prior = self._memory_store.get_embedder_id()
if prior is not None and prior != self._embedding_model:
cleared = self._memory_store.clear_all_embeddings()
logger.warning(
"[MemoryMixin] embedder changed (%s -> %s); cleared %d stored "
"embedding(s) for re-embedding",
prior,
self._embedding_model,
cleared,
)
self._memory_store.set_embedder_id(self._embedding_model)
except Exception as e:
logger.warning(
"[MemoryMixin] Lemonade embedding service unreachable — "
Expand Down Expand Up @@ -604,7 +643,7 @@ def _get_embedder(self) -> Any:
try:
from gaia.llm.providers.lemonade import LemonadeProvider

self._embedder = LemonadeProvider(model=EMBEDDING_MODEL)
self._embedder = LemonadeProvider(model=self._embedding_model)
logger.debug("[MemoryMixin] LemonadeProvider initialized for embeddings")
return self._embedder
except Exception as e:
Expand All @@ -613,20 +652,20 @@ def _get_embedder(self) -> Any:
) from e

def _embed_text(self, text: str) -> np.ndarray:
"""Embed text via Lemonade (nomic-embed-text-v2-moe-GGUF, 768-dim).
"""Embed text via Lemonade using the active embedder.

Required, not optional. Raises RuntimeError if embedding fails.

Args:
text: Text to embed.

Returns:
L2-normalized float32 numpy array of shape (768,).
L2-normalized float32 numpy array of shape ``(self._embedding_dim,)``.
"""
embedder = self._get_embedder()
try:
# LemonadeProvider.embed() returns list[list[float]]
results = embedder.embed([text], model=EMBEDDING_MODEL)
results = embedder.embed([text], model=self._embedding_model)
vec = np.array(results[0], dtype=np.float32)

# L2-normalize for cosine similarity via IndexFlatIP
Expand Down Expand Up @@ -684,13 +723,13 @@ def _rebuild_faiss_index(self) -> None:
# Get all active knowledge items that have embeddings
items = store.get_items_with_embeddings(include_sensitive=True)

index = faiss.IndexFlatIP(EMBEDDING_DIM)
index = faiss.IndexFlatIP(self._embedding_dim)
id_map = []

for item in items:
try:
vec = _blob_to_embedding(item["embedding"])
if vec.shape[0] != EMBEDDING_DIM:
if vec.shape[0] != self._embedding_dim:
logger.debug(
"[MemoryMixin] skipping embedding for %s: wrong dim %d",
item["id"],
Expand Down Expand Up @@ -750,19 +789,19 @@ def _faiss_remove(self, knowledge_id: str) -> None:
# Reconstruct all vectors except the removed one
n = self._faiss_index.ntotal
if n <= 1:
self._faiss_index = faiss.IndexFlatIP(EMBEDDING_DIM)
self._faiss_index = faiss.IndexFlatIP(self._embedding_dim)
self._faiss_id_map = []
return

all_vecs = np.zeros((n, EMBEDDING_DIM), dtype=np.float32)
all_vecs = np.zeros((n, self._embedding_dim), dtype=np.float32)
for i in range(n):
all_vecs[i] = self._faiss_index.reconstruct(i)

# Remove the target vector
keep_vecs = np.delete(all_vecs, idx, axis=0)
keep_ids = self._faiss_id_map[:idx] + self._faiss_id_map[idx + 1 :]

new_index = faiss.IndexFlatIP(EMBEDDING_DIM)
new_index = faiss.IndexFlatIP(self._embedding_dim)
new_index.add(keep_vecs)
self._faiss_index = new_index
self._faiss_id_map = keep_ids
Expand Down Expand Up @@ -1411,7 +1450,7 @@ def reconcile_memory(self, max_pairs: int = 20) -> Dict:
for item in items:
try:
vec = _blob_to_embedding(item["embedding"])
if vec.shape[0] != EMBEDDING_DIM:
if vec.shape[0] != self._embedding_dim:
continue
norm = np.linalg.norm(vec)
if norm > 0:
Expand All @@ -1431,7 +1470,7 @@ def reconcile_memory(self, max_pairs: int = 20) -> Dict:
try:
import faiss

temp_index = faiss.IndexFlatIP(EMBEDDING_DIM)
temp_index = faiss.IndexFlatIP(self._embedding_dim)
temp_index.add(mat)

# Search each item for its top-5 neighbors
Expand Down
58 changes: 58 additions & 0 deletions src/gaia/agents/base/memory_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,15 @@ def _safe_json_loads(value) -> object:
-- Knowledge FTS5 (standalone, manually synced, porter stemmer for morphological matching)
CREATE VIRTUAL TABLE IF NOT EXISTS knowledge_fts USING fts5(content, domain, category, tokenize='porter unicode61');

-- Key/value metadata (e.g. the embedder id that produced stored vectors).
-- Lives in the DB (not a sidecar file) so it is atomic with the data it
-- describes and visible to every connection — the agent and the UI memory
-- router open the same DB from different code paths (#1744).
CREATE TABLE IF NOT EXISTS meta (
key TEXT PRIMARY KEY,
value TEXT
);

-- Tool history
CREATE TABLE IF NOT EXISTS tool_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
Expand Down Expand Up @@ -1391,6 +1400,55 @@ def store_embedding(self, knowledge_id: str, embedding: bytes) -> bool:
self._conn.rollback()
raise

def clear_all_embeddings(self) -> int:
    """Reset the ``embedding`` column to NULL on every row that has one.

    Intended for the moment the active embedder changes: vectors written by
    another model are not comparable with new ones, so they are dropped here
    and left for a later backfill to regenerate.

    Returns:
        Number of ``knowledge`` rows whose embedding was cleared.

    Raises:
        Exception: Any database error is re-raised after the transaction
            has been rolled back.
    """
    sql = "UPDATE knowledge SET embedding = NULL WHERE embedding IS NOT NULL"
    with self._lock:
        try:
            cursor = self._conn.execute(sql)
            # Capture the count before committing; it belongs to this cursor.
            cleared = cursor.rowcount
            self._conn.commit()
            return cleared
        except Exception:
            # Leave the table untouched on any failure, then propagate.
            self._conn.rollback()
            raise

#: Row key in the ``meta`` key/value table under which the embedder model id
#: is stored. ``get_embedder_id`` reads this row and ``set_embedder_id``
#: upserts it, so both must agree on this exact string.
_EMBEDDER_META_KEY = "embedder_id"

def get_embedder_id(self) -> Optional[str]:
    """Fetch the embedder model id recorded in the ``meta`` table.

    Returns:
        The stored ``value`` for the embedder key, or ``None`` when the
        ``meta`` table holds no row for that key (for example a database
        that has never been stamped). The caller decides what an absent
        stamp means.
    """
    query = "SELECT value FROM meta WHERE key = ?"
    with self._lock:
        hit = self._conn.execute(query, (self._EMBEDDER_META_KEY,)).fetchone()
        # No row -> nothing stamped yet.
        if not hit:
            return None
        return hit[0]

def set_embedder_id(self, model_id: str) -> None:
    """Stamp the ``meta`` table with the embedder that owns the stored vectors.

    Inserts the embedder row, or overwrites its value if the key already
    exists, and commits immediately.

    Args:
        model_id: Embedder model id to record.

    Raises:
        Exception: Any database error is re-raised after a rollback.
    """
    upsert_sql = (
        "INSERT INTO meta (key, value) VALUES (?, ?) "
        "ON CONFLICT(key) DO UPDATE SET value = excluded.value"
    )
    params = (self._EMBEDDER_META_KEY, model_id)
    with self._lock:
        try:
            self._conn.execute(upsert_sql, params)
            self._conn.commit()
        except Exception:
            # Undo the partial write before surfacing the error.
            self._conn.rollback()
            raise

def get_items_with_embeddings(
self,
category: str | None = None,
Expand Down
14 changes: 11 additions & 3 deletions src/gaia/agents/chat/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@

from gaia.agents.base.agent import Agent, default_max_steps
from gaia.agents.base.console import AgentConsole
from gaia.agents.base.memory import EMBEDDING_MODEL, MemoryMixin
from gaia.agents.base.memory import MemoryMixin
from gaia.agents.base.tool_loader import ToolLoader
from gaia.agents.base.tools import _TOOL_REGISTRY
from gaia.agents.chat.session import SessionManager
from gaia.agents.chat.tool_bundles import DOC_BUNDLES, DOC_CORE_TOOLS
from gaia.agents.chat.tools import FileToolsMixin
from gaia.agents.registry import get_embedding_model_for_device
from gaia.agents.tools import FileSystemToolsMixin # Enhanced file system navigation
from gaia.agents.tools import ScratchpadToolsMixin # Structured data analysis
from gaia.agents.tools import ( # Web browsing and search; Shared tools
Expand Down Expand Up @@ -253,10 +254,17 @@ def __init__(self, config: Optional[ChatAgentConfig] = None):
else os.getenv("LEMONADE_BASE_URL", "http://localhost:13305/api/v1")
)

# Embedder is device-scoped: the NPU profile uses the FLM-native
# embedder so chat and embeddings stay co-resident on the NPU backend
# (a GGUF embedder runs on Vulkan and evicts the FLM chat model every
# turn — #1744). GPU/CPU keep the GGUF nomic embedder.
effective_embedding_model = get_embedding_model_for_device(config.device)

# Initialize RAG SDK (optional - will be None if dependencies not installed)
try:
rag_config = RAGConfig(
model=effective_model_id,
embedding_model=effective_embedding_model,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap, # Configurable overlap for context preservation
max_chunks=config.max_chunks,
Expand Down Expand Up @@ -347,7 +355,7 @@ def __init__(self, config: Optional[ChatAgentConfig] = None):
self.tool_loader = self._maybe_build_tool_loader()

# Initialize memory subsystem (before super().__init__ which calls _register_tools)
self.init_memory()
self.init_memory(embedding_model=effective_embedding_model)

# Store base URL for use in _register_tools() (VLM, etc.)
self._base_url = effective_base_url
Expand Down Expand Up @@ -498,7 +506,7 @@ def _embed_texts_batch(self, texts) -> "Any":
"""
import numpy as np

results = self._get_embedder().embed(list(texts), model=EMBEDDING_MODEL)
results = self._get_embedder().embed(list(texts), model=self._embedding_model)
vecs = np.asarray(results, dtype=np.float32)
norms = np.linalg.norm(vecs, axis=1, keepdims=True)
norms[norms == 0] = 1.0
Expand Down
28 changes: 28 additions & 0 deletions src/gaia/agents/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,11 @@ def _compute_custom_origin_hash(py_file: Path) -> str:
return hashlib.sha256(py_file.read_bytes()).hexdigest()[:16]


# Default embedder (GPU/CPU). The NPU device overrides this with the FLM-native
# embedder so chat + embeddings stay co-resident on the NPU backend (#1744).
DEFAULT_EMBEDDING_MODEL = "nomic-embed-text-v2-moe-GGUF"


@dataclass
class DeviceConfig:
"""A verified (device, model, recipe, backend) configuration for an agent.
Expand All @@ -241,6 +246,10 @@ class DeviceConfig:
verified: Whether this combination has been tested end-to-end via
agent eval. Unverified configs show a warning badge in the UI.
ctx_size: Default context window size for this configuration.
embedding_model: Embedder model id for RAG/memory on this device. NPU
uses the FLM-native embedder so the chat model and embedder stay
co-resident on the NPU backend; a GGUF embedder runs on Vulkan and
evicts the FLM chat model every turn on a shared-memory APU (#1744).
"""

device: Literal["cpu", "gpu", "npu"]
Expand All @@ -249,6 +258,7 @@ class DeviceConfig:
backend: str
verified: bool = False
ctx_size: int = 32768
embedding_model: str = DEFAULT_EMBEDDING_MODEL


# Default device configurations for built-in agents using Gemma 4 E4B.
Expand Down Expand Up @@ -277,10 +287,28 @@ class DeviceConfig:
backend="flm:npu",
verified=True,
ctx_size=4096,
# FLM-native embedder so chat + embeddings stay co-resident on the NPU
# backend and don't thrash NPU<->Vulkan every turn (#1744).
embedding_model="embed-gemma-300m-FLM",
),
]


def get_embedding_model_for_device(device: Optional[str]) -> str:
    """Look up the embedder model id configured for a device target.

    ``DEFAULT_DEVICE_CONFIGS`` is the single place the per-device embedder is
    declared, next to that device's chat model, recipe and backend. This
    helper scans it in order and takes the first entry whose ``device``
    equals the argument.

    Args:
        device: Device target to look up; compared with ``==`` against each
            config's ``device`` field. May be ``None``.

    Returns:
        The ``embedding_model`` of the first matching config, or
        ``DEFAULT_EMBEDDING_MODEL`` when no config matches.
    """
    candidates = (
        cfg.embedding_model
        for cfg in DEFAULT_DEVICE_CONFIGS
        if cfg.device == device
    )
    # First hit wins; an unknown or unspecified device falls back to the default.
    return next(candidates, DEFAULT_EMBEDDING_MODEL)


@dataclass
class ModelTier:
"""A selectable model size for an agent (#1162).
Expand Down
6 changes: 5 additions & 1 deletion src/gaia/installer/init_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,11 @@
"npu": {
"description": "Ryzen AI NPU acceleration via FLM backend (requires XDNA2 NPU)",
"agent": "chat",
"models": ["gemma4-it-e2b-FLM"],
# FLM chat model + FLM-native embedder so chat and embeddings stay
# co-resident on the NPU backend. A GGUF embedder would run on Vulkan
# and evict the FLM chat model every turn (#1744). Both are built-in
# Lemonade *-FLM models, pulled by name only (no recipe — #1655).
"models": ["gemma4-it-e2b-FLM", "embed-gemma-300m-FLM"],
"approx_size": "~3 GB",
"min_lemonade_version": "10.2.0",
# FLM default context on NPU. Smaller than GPU (32768) because NPU
Expand Down
Loading
Loading