From 6f61849e89b7bbf3ceddd91183e3b394ac4456e1 Mon Sep 17 00:00:00 2001 From: Codex Agent Date: Mon, 15 Jun 2026 07:23:27 +0000 Subject: [PATCH 1/3] refactor(backend): split LLM provider routing seams --- .../test_llm_provider_plugin_structure.py | 138 +++++ backend/utils/llm/clients.py | 497 +++--------------- backend/utils/llm/conversation_folder.py | 182 +++++++ backend/utils/llm/conversation_processing.py | 133 +---- backend/utils/llm/model_config.py | 317 +++++++++++ backend/utils/llm/providers.py | 159 ++++++ 6 files changed, 884 insertions(+), 542 deletions(-) create mode 100644 backend/tests/unit/test_llm_provider_plugin_structure.py create mode 100644 backend/utils/llm/conversation_folder.py create mode 100644 backend/utils/llm/model_config.py create mode 100644 backend/utils/llm/providers.py diff --git a/backend/tests/unit/test_llm_provider_plugin_structure.py b/backend/tests/unit/test_llm_provider_plugin_structure.py new file mode 100644 index 00000000000..f45d60c52c6 --- /dev/null +++ b/backend/tests/unit/test_llm_provider_plugin_structure.py @@ -0,0 +1,138 @@ +"""Unit tests for maintainable LLM provider/model plug-in seams.""" + +import os +import sys +from unittest.mock import MagicMock + +import pytest + +_HEAVY_MOCKS = { + 'firebase_admin': MagicMock(), + 'firebase_admin.firestore': MagicMock(), + 'google.cloud.firestore': MagicMock(), + 'google.cloud.firestore_v1': MagicMock(), + 'google.cloud.firestore_v1.base_query': MagicMock(), + 'database': MagicMock(), + 'database._client': MagicMock(), + 'database.llm_usage': MagicMock(), +} +for _mod, _mock in _HEAVY_MOCKS.items(): + sys.modules.setdefault(_mod, _mock) + +# Some older tests install lightweight langchain_core stubs. If this test runs +# after them, provide the prompt submodule conversation_folder imports. +if 'langchain_core' in sys.modules and 'langchain_core.prompts' not in sys.modules: + import types + + prompts_stub = types.ModuleType('langchain_core.prompts') + + class ChatPromptTemplate: + @classmethod + def from_messages(cls, messages): + return cls() + + setattr(prompts_stub, 'ChatPromptTemplate', ChatPromptTemplate) + sys.modules['langchain_core.prompts'] = prompts_stub + +os.environ.setdefault('OPENAI_API_KEY', 'sk-test') +os.environ.setdefault('ANTHROPIC_API_KEY', 'sk-ant-test') + +from utils.llm import providers +from utils.llm.conversation_folder import FolderAssignment, get_default_folder_id, validate_folder_assignment +from utils.llm.model_config import get_route_options + + +@pytest.fixture(autouse=True) +def clear_provider_cache(): + providers._llm_cache.clear() + yield + providers._llm_cache.clear() + + +class FakeChatOpenAI: + calls = [] + + def __init__(self, **kwargs): + self.kwargs = kwargs + FakeChatOpenAI.calls.append(kwargs) + + def bind(self, **kwargs): + self.bound_kwargs = kwargs + return self + + +def test_openai_compatible_provider_applies_base_url_headers_and_google_prefix(monkeypatch): + FakeChatOpenAI.calls.clear() + providers._llm_cache.clear() + monkeypatch.setattr(providers, 'ChatOpenAI', FakeChatOpenAI) + monkeypatch.setenv('OPENROUTER_API_KEY', 'sk-openrouter') + + llm = providers.get_or_create_openai_compatible_llm( + 'openrouter', 'gemini-3-flash-preview', options={'temperature': 0.7} + ) + + assert isinstance(llm, FakeChatOpenAI) + call = FakeChatOpenAI.calls[-1] + assert call['model'] == 'google/gemini-3-flash-preview' + assert call['api_key'] == 'sk-openrouter' + assert call['base_url'] == 'https://openrouter.ai/api/v1' + assert call['default_headers'] == {'X-Title': 'Omi Chat'} + assert call['temperature'] == 0.7 + + +def test_unknown_openai_compatible_provider_fails_loudly(): + with pytest.raises(ValueError, match="Unknown OpenAI-compatible provider"): + providers.get_or_create_openai_compatible_llm('missing-provider', 'some-model') + + +def test_route_options_keep_provider_quirks_out_of_callsites(): + assert get_route_options('wrapped_analysis', 'gemini-3-flash-preview', 'openrouter')['temperature'] == 0.7 + assert get_route_options('followup', 'gemini-2.5-flash-lite', 'gemini')['thinking_budget'] == 0 + assert get_route_options('fair_use', 'gpt-5.1', 'openai')['extra_body'] == {"prompt_cache_retention": "24h"} + + +def test_validate_folder_assignment_rejects_unknown_folder_id(): + folders = [ + {'id': 'default', 'name': 'General', 'is_default': True}, + {'id': 'work', 'name': 'Work'}, + ] + + result = validate_folder_assignment(FolderAssignment(folder_id='missing', confidence=0.95), folders, 'default') + + assert result.folder_id == 'default' + assert result.confidence == 0.3 + assert result.validation_status == 'invalid_folder_id_defaulted' + + +def test_validate_folder_assignment_low_confidence_uses_default(): + folders = [ + {'id': 'default', 'name': 'General', 'is_default': True}, + {'id': 'work', 'name': 'Work'}, + ] + + result = validate_folder_assignment(FolderAssignment(folder_id='work', confidence=0.4), folders, 'default') + + assert result.folder_id == 'default' + assert result.confidence == 0.4 + assert result.validation_status == 'low_confidence_defaulted' + + +def test_validate_folder_assignment_accepts_valid_high_confidence(): + folders = [ + {'id': 'default', 'name': 'General', 'is_default': True}, + {'id': 'work', 'name': 'Work'}, + ] + + result = validate_folder_assignment( + FolderAssignment(folder_id='work', confidence=0.9, reasoning='Clearly about work'), folders, 'default' + ) + + assert result.folder_id == 'work' + assert result.confidence == 0.9 + assert result.reasoning == 'Clearly about work' + assert result.validation_status == 'accepted' + + +def test_default_folder_id_is_extracted_once_for_route_logic(): + assert get_default_folder_id([{'id': 'a'}, {'id': 'b', 'is_default': True}]) == 'b' + assert get_default_folder_id([{'id': 'a'}]) is None diff --git a/backend/utils/llm/clients.py b/backend/utils/llm/clients.py index 3e96730fcbe..59c5c1f751b 100644 --- a/backend/utils/llm/clients.py +++ b/backend/utils/llm/clients.py @@ -1,24 +1,59 @@ import hashlib import logging import os -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional import anthropic import httpx from cachetools import TTLCache from langchain_core.language_models import BaseChatModel from langchain_core.output_parsers import PydanticOutputParser -from langchain_google_genai import ChatGoogleGenerativeAI from langchain_openai import ChatOpenAI, OpenAIEmbeddings import tiktoken from models.structured import Structured from utils.byok import get_byok_key +from utils.llm.model_config import ( + MODEL_QOS_PROFILES, + _ANTHROPIC_ONLY_FEATURES, + _CACHE_KEY_MODELS, + _DEFAULT_CONFIG, + _OPENROUTER_TEMPERATURES, + _PERPLEXITY_ONLY_FEATURES, + _PINNED_FEATURES, + _STRUCTURED_OUTPUT_FEATURES, + _active_profile, + _active_profile_name, + _byok_profile, + _byok_profile_name, + get_active_profile, + get_active_profile_name, + get_all_configured_features, + get_byok_profile, + get_byok_profile_name, + get_default_config, + get_model, + get_provider, + get_route_options, + is_anthropic_only_feature, + is_perplexity_only_feature, + is_structured_output_feature, + supports_prompt_cache, + _get_model_config, +) +from utils.llm.providers import ( + GEMINI_OPENAI_BASE_URL, + get_default_client, + get_or_create_gemini_llm as _get_or_create_gemini_llm, + get_or_create_openai_compatible_llm, + _llm_cache, +) from utils.llm.usage_tracker import get_usage_callback logger = logging.getLogger(__name__) _usage_callback = get_usage_callback() +_GEMINI_OPENAI_BASE_URL = GEMINI_OPENAI_BASE_URL # --------------------------------------------------------------------------- # BYOK (Bring Your Own Key) @@ -29,10 +64,6 @@ # provide lazy resolution since there's no request context at import time. # --------------------------------------------------------------------------- -# Google's OpenAI-compatible endpoint — used only for BYOK users who bring their -# own AI Studio API key. Platform calls use ChatGoogleGenerativeAI (native SDK). -_GEMINI_OPENAI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/" - class _AnthropicClientProxy: """Forwards every attribute to the appropriate anthropic.AsyncAnthropic for the request.""" @@ -163,15 +194,15 @@ def _create_byok_client( return _cached_openai_chat(model, byok_key, kwargs) if provider == 'gemini': - return _cached_openai_chat(model, byok_key, {**kwargs, 'base_url': _GEMINI_OPENAI_BASE_URL}) + return _cached_openai_chat(model, byok_key, {**kwargs, 'base_url': GEMINI_OPENAI_BASE_URL}) if provider == 'openrouter': # Gemini-based OpenRouter models reroute to Gemini direct via BYOK if model.startswith('gemini'): - temp = _OPENROUTER_TEMPERATURES.get(feature) - if temp is not None: - kwargs['temperature'] = temp - return _cached_openai_chat(model, byok_key, {**kwargs, 'base_url': _GEMINI_OPENAI_BASE_URL}) + route_options = get_route_options(feature, model, provider) + if 'temperature' in route_options: + kwargs['temperature'] = route_options['temperature'] + return _cached_openai_chat(model, byok_key, {**kwargs, 'base_url': GEMINI_OPENAI_BASE_URL}) return None # Non-Gemini OpenRouter: no BYOK support return None @@ -196,410 +227,47 @@ def get_openai_chat(model: str, **kwargs) -> ChatOpenAI: # --------------------------------------------------------------------------- -# Model QoS Profile System -# -# Each profile maps every feature to a (model, provider) tuple. -# The profile is the SINGLE SOURCE OF TRUTH for both model and provider. -# Provider is never inferred from model name — it is declared explicitly. -# -# This means the same model can be hosted by different providers: -# feature_a: ('gemini-2.5-flash', 'gemini') → Google direct -# feature_b: ('gemini-2.5-flash', 'openrouter') → OpenRouter -# -# Global switch: MODEL_QOS=premium (selects entire profile) -# -# Profiles: -# premium — maximize cost savings while preserving 80% of max quality -# max — 100% quality, best models available, no cost optimization -# byok — same models as max (BYOK users pay their own API costs) +# Model QoS and provider routing # --------------------------------------------------------------------------- -MODEL_QOS_PROFILES: Dict[str, Dict[str, Tuple[str, str]]] = { - # ----------------------------------------------------------------------- - # premium — maximize cost savings while preserving 80% of max quality. - # Uses gpt-5.4-mini (not gpt-5.4) for core features, gpt-4.1-mini (not gpt-4.1) - # for quality-sensitive tasks, gpt-4.1-nano for simple routing/classification, - # and Gemini flash-lite for low-complexity free-text (titles, followups, onboarding). - # ----------------------------------------------------------------------- - 'premium': { - # OpenAI — conversation processing - 'conv_action_items': ('gpt-5.4-mini', 'openai'), - 'conv_structure': ('gpt-5.4-mini', 'openai'), - 'conv_app_result': ('gpt-5.4-mini', 'openai'), - 'conv_app_select': ('gpt-4.1-nano', 'openai'), - 'conv_folder': ('gpt-4.1-nano', 'openai'), - 'conv_discard': ('gpt-4.1-nano', 'openai'), - 'daily_summary': ('gpt-5.4-mini', 'openai'), - 'daily_summary_simple': ('gpt-4.1-nano', 'openai'), - 'external_structure': ('gpt-4.1-mini', 'openai'), - # OpenAI — memories & knowledge - 'memories': ('gpt-4.1-mini', 'openai'), - 'learnings': ('gpt-5.4-mini', 'openai'), - 'memory_conflict': ('gpt-4.1-mini', 'openai'), - 'memory_category': ('gpt-4.1-nano', 'openai'), - 'knowledge_graph': ('gpt-4.1-mini', 'openai'), - # OpenAI — chat - 'chat_responses': ('gpt-5.4-mini', 'openai'), - 'chat_extraction': ('gpt-4.1-mini', 'openai'), - 'chat_graph': ('gpt-4.1-mini', 'openai'), - 'session_titles': ('gemini-2.5-flash-lite', 'gemini'), - # Features - 'goals': ('gpt-4.1-mini', 'openai'), - 'goals_advice': ('gpt-5.4-mini', 'openai'), - 'notifications': ('gpt-5.4-mini', 'openai'), - 'proactive_notification': ('gpt-4.1-mini', 'openai'), - 'followup': ('gemini-2.5-flash-lite', 'gemini'), - 'smart_glasses': ('gpt-4.1-nano', 'openai'), - 'openglass': ('gpt-4.1-mini', 'openai'), - 'onboarding': ('gemini-2.5-flash-lite', 'gemini'), - 'app_generator': ('gpt-5.4-mini', 'openai'), - 'app_integration': ('gemini-2.5-flash-lite', 'gemini'), - 'persona_clone': ('gpt-5.4-mini', 'openai'), - 'trends': ('gemini-2.5-flash-lite', 'gemini'), - # Anthropic (used via get_model() + anthropic_client) - 'chat_agent': ('claude-sonnet-4-6', 'anthropic'), - # Persona - 'persona_chat': ('gpt-4.1-nano', 'openai'), - 'persona_chat_premium': ('gpt-5.4-mini', 'openai'), - # OpenRouter - 'wrapped_analysis': ('gemini-3-flash-preview', 'openrouter'), - # Perplexity - 'web_search': ('sonar-pro', 'perplexity'), - }, - # ----------------------------------------------------------------------- - # max — 100% quality, best models available, no cost optimization. - # Uses gpt-5.4 for all core features, o4-mini for reasoning (learnings), - # gpt-4.1 for chat graph. Pure OpenAI for highest accuracy. - # ----------------------------------------------------------------------- - 'max': { - # OpenAI — conversation processing - 'conv_action_items': ('gpt-5.4', 'openai'), - 'conv_structure': ('gpt-5.4', 'openai'), - 'conv_app_result': ('gpt-5.4', 'openai'), - 'conv_app_select': ('gpt-4.1-mini', 'openai'), - 'conv_folder': ('gpt-4.1-mini', 'openai'), - 'conv_discard': ('gpt-4.1-mini', 'openai'), - 'daily_summary': ('gpt-5.4', 'openai'), - 'daily_summary_simple': ('gpt-4.1-mini', 'openai'), - 'external_structure': ('gpt-4.1-mini', 'openai'), - # OpenAI — memories & knowledge - 'memories': ('gpt-4.1-mini', 'openai'), - 'learnings': ('o4-mini', 'openai'), - 'memory_conflict': ('gpt-4.1-mini', 'openai'), - 'memory_category': ('gpt-4.1-mini', 'openai'), - 'knowledge_graph': ('gpt-4.1-mini', 'openai'), - # OpenAI — chat - 'chat_responses': ('gpt-5.4', 'openai'), - 'chat_extraction': ('gpt-4.1-mini', 'openai'), - 'chat_graph': ('gpt-4.1', 'openai'), - 'session_titles': ('gpt-4.1-mini', 'openai'), - # Features - 'goals': ('gpt-4.1-mini', 'openai'), - 'goals_advice': ('gpt-5.4', 'openai'), - 'notifications': ('gpt-5.4', 'openai'), - 'proactive_notification': ('gpt-4.1-mini', 'openai'), - 'followup': ('gpt-4.1-mini', 'openai'), - 'smart_glasses': ('gpt-4.1-mini', 'openai'), - 'openglass': ('gpt-4.1-mini', 'openai'), - 'onboarding': ('gpt-4.1-mini', 'openai'), - 'app_generator': ('gpt-5.4', 'openai'), - 'app_integration': ('gpt-4.1-mini', 'openai'), - 'persona_clone': ('gpt-5.4', 'openai'), - 'trends': ('gpt-4.1-mini', 'openai'), - # Anthropic - 'chat_agent': ('claude-sonnet-4-6', 'anthropic'), - # Persona - 'persona_chat': ('gpt-4.1-nano', 'openai'), - 'persona_chat_premium': ('gpt-5.4-mini', 'openai'), - # OpenRouter - 'wrapped_analysis': ('gemini-3-flash-preview', 'openrouter'), - # Perplexity - 'web_search': ('sonar-pro', 'perplexity'), - }, - # ----------------------------------------------------------------------- - # byok — same models as max. BYOK users pay their own API costs so they - # get the same best-quality routing as max subscribers. - # ----------------------------------------------------------------------- - 'byok': { - # OpenAI — conversation processing - 'conv_action_items': ('gpt-5.4', 'openai'), - 'conv_structure': ('gpt-5.4', 'openai'), - 'conv_app_result': ('gpt-5.4', 'openai'), - 'conv_app_select': ('gpt-4.1-mini', 'openai'), - 'conv_folder': ('gpt-4.1-mini', 'openai'), - 'conv_discard': ('gpt-4.1-mini', 'openai'), - 'daily_summary': ('gpt-5.4', 'openai'), - 'daily_summary_simple': ('gpt-4.1-mini', 'openai'), - 'external_structure': ('gpt-4.1-mini', 'openai'), - # OpenAI — memories & knowledge - 'memories': ('gpt-4.1-mini', 'openai'), - 'learnings': ('o4-mini', 'openai'), - 'memory_conflict': ('gpt-4.1-mini', 'openai'), - 'memory_category': ('gpt-4.1-mini', 'openai'), - 'knowledge_graph': ('gpt-4.1-mini', 'openai'), - # OpenAI — chat - 'chat_responses': ('gpt-5.4', 'openai'), - 'chat_extraction': ('gpt-4.1-mini', 'openai'), - 'chat_graph': ('gpt-4.1', 'openai'), - 'session_titles': ('gpt-4.1-mini', 'openai'), - # Features - 'goals': ('gpt-4.1-mini', 'openai'), - 'goals_advice': ('gpt-5.4', 'openai'), - 'notifications': ('gpt-5.4', 'openai'), - 'proactive_notification': ('gpt-4.1-mini', 'openai'), - 'followup': ('gpt-4.1-mini', 'openai'), - 'smart_glasses': ('gpt-4.1-mini', 'openai'), - 'openglass': ('gpt-4.1-mini', 'openai'), - 'onboarding': ('gpt-4.1-mini', 'openai'), - 'app_generator': ('gpt-5.4', 'openai'), - 'app_integration': ('gpt-4.1-mini', 'openai'), - 'persona_clone': ('gpt-5.4', 'openai'), - 'trends': ('gpt-4.1-mini', 'openai'), - # Anthropic - 'chat_agent': ('claude-sonnet-4-6', 'anthropic'), - # Persona - 'persona_chat': ('gpt-4.1-nano', 'openai'), - 'persona_chat_premium': ('gpt-5.4-mini', 'openai'), - # OpenRouter - 'wrapped_analysis': ('gemini-3-flash-preview', 'openrouter'), - # Perplexity - 'web_search': ('sonar-pro', 'perplexity'), - }, -} - -# Pinned features — (model, provider) fixed regardless of profile or env override. -_PINNED_FEATURES: Dict[str, Tuple[str, str]] = { - 'fair_use': ('gpt-5.1', 'openai'), -} - -# Resolve active profile once at startup. -_active_profile_name = os.environ.get('MODEL_QOS', 'premium').strip().lower() -if _active_profile_name not in MODEL_QOS_PROFILES: - logger.warning('MODEL_QOS=%s is not a valid profile, falling back to premium', _active_profile_name) - _active_profile_name = 'premium' -_active_profile = MODEL_QOS_PROFILES[_active_profile_name] - -# BYOK QoS — all BYOK users get routed to 'byok' profile (top-tier all-OpenAI). -# BYOK users pay their own API costs, so we give them maximum quality models. -_byok_profile_name = 'byok' -_byok_profile = MODEL_QOS_PROFILES[_byok_profile_name] - -# Features that can't go through get_llm() (non-ChatOpenAI providers). -_ANTHROPIC_ONLY_FEATURES = {'chat_agent'} -_PERPLEXITY_ONLY_FEATURES = {'web_search'} - - -# Feature-specific client config (temperature, headers — orthogonal to model choice). -# Only applied when a feature resolves to an OpenRouter model. -_OPENROUTER_TEMPERATURES: Dict[str, float] = { - 'persona_chat': 0.8, - 'persona_chat_premium': 0.8, - 'wrapped_analysis': 0.7, -} - -# Models that support OpenAI prompt caching (prompt_cache_key routing). -_CACHE_KEY_MODELS = {'gpt-5.4', 'gpt-5.4-mini'} - -# Features that call .with_structured_output() — logged when resolving to Gemini for compat monitoring. -_STRUCTURED_OUTPUT_FEATURES = { - 'chat_extraction', - 'proactive_notification', - 'conv_app_select', - 'external_structure', - 'trends', -} - -_DEFAULT_CONFIG: Tuple[str, str] = ('gpt-4.1-mini', 'openai') - - -def _get_model_config(feature: str) -> Tuple[str, str]: - """Get the (model, provider) tuple for a feature. Internal — used by get_llm/get_model/get_provider. - - Resolution order: pinned > active profile > fallback. - """ - if feature in _PINNED_FEATURES: - return _PINNED_FEATURES[feature] - return _active_profile.get(feature, _DEFAULT_CONFIG) - - -def get_model(feature: str) -> str: - """Get the model name for a feature from the active Model QoS profile. - Resolution order: pinned > active profile > fallback. - - Args: - feature: Feature name (e.g. 'conv_action_items', 'chat_agent'). - - Returns: - Model name string (e.g. 'gpt-4.1-mini', 'claude-sonnet-4-6'). - """ - return _get_model_config(feature)[0] - - -def get_provider(feature: str) -> str: - """Get the provider for a feature from the active Model QoS profile. - - Returns: - Provider string: 'openai', 'gemini', 'openrouter', 'anthropic', 'perplexity'. - """ - return _get_model_config(feature)[1] - - -# --------------------------------------------------------------------------- -# Client factories — provider-specific, cached per (model, streaming, provider) -# Each factory creates and caches a plain ChatOpenAI using Omi's default keys. -# BYOK resolution happens inline in get_llm() at request time. -# --------------------------------------------------------------------------- - -_llm_cache: Dict[tuple, Any] = {} +def _effective_byok_provider(model: str, provider: str) -> str: + """Map provider to the actual BYOK key type needed (Gemini-based OpenRouter → Gemini key).""" + if provider == 'openrouter' and model.startswith('gemini'): + return 'gemini' + return provider +# Compatibility wrappers for tests and legacy imports. New provider construction +# lives in providers.py. def _get_or_create_openai_llm(model_name: str, streaming: bool = False) -> ChatOpenAI: - """Get or create a cached ChatOpenAI for an OpenAI model.""" - key = (model_name, streaming, 'openai') - if key not in _llm_cache: - kwargs: Dict[str, Any] = { - 'callbacks': [_usage_callback], - 'request_timeout': 120, - 'max_retries': 1, - } - if model_name == 'gpt-5.1': - kwargs['extra_body'] = {"prompt_cache_retention": "24h"} - if streaming: - kwargs['streaming'] = True - kwargs['stream_options'] = {"include_usage": True} - _llm_cache[key] = ChatOpenAI(model=model_name, **kwargs) - return _llm_cache[key] + options: Dict[str, Any] = {} + if model_name == 'gpt-5.1': + options['extra_body'] = {"prompt_cache_retention": "24h"} + return get_or_create_openai_compatible_llm('openai', model_name, streaming, options) def _get_or_create_openrouter_llm( model_name: str, streaming: bool = False, temperature: Optional[float] = None ) -> ChatOpenAI: - """Get or create a cached ChatOpenAI for an OpenRouter model. - - Model names in the profile are bare (e.g. 'gemini-3-flash-preview'). - OpenRouter API requires vendor prefix (e.g. 'google/gemini-3-flash-preview'). - """ - # OpenRouter requires vendor-prefixed model names for Google models. - api_model = f'google/{model_name}' if model_name.startswith('gemini') else model_name - key = (model_name, streaming, 'openrouter', temperature) - if key not in _llm_cache: - kwargs: Dict[str, Any] = { - 'api_key': os.environ.get('OPENROUTER_API_KEY'), - 'base_url': "https://openrouter.ai/api/v1", - 'default_headers': {"X-Title": "Omi Chat"}, - 'callbacks': [_usage_callback], - 'request_timeout': 120, - 'max_retries': 1, - } - if temperature is not None: - kwargs['temperature'] = temperature - if streaming: - kwargs['streaming'] = True - kwargs['stream_options'] = {"include_usage": True} - _llm_cache[key] = ChatOpenAI(model=api_model, **kwargs) - return _llm_cache[key] - - -def _get_or_create_gemini_llm( - model_name: str, streaming: bool = False, thinking_budget: Optional[int] = None -) -> BaseChatModel: - """Get or create a cached ChatGoogleGenerativeAI for a Gemini model via native SDK. - - Routing priority: - 1. USE_VERTEX_AI=true + GOOGLE_CLOUD_PROJECT → Vertex AI (ADC, paid quota, ~34% savings with EDP) - 2. GEMINI_API_KEY set → AI Studio (paid-tier key, no OpenAI-compat rate limits) - 3. Neither → placeholder that fails at invoke time (unit tests) - - Vertex AI requires explicit opt-in via USE_VERTEX_AI=true because GOOGLE_CLOUD_PROJECT - is already set for Firestore and the service account may lack Vertex AI permissions. - - thinking_budget: when set (e.g. 0), caps Gemini 2.5 internal "thinking" tokens. - Gemini 2.5 Flash defaults thinking ON (dynamic budget), billed as output tokens — - for mechanical calls (titles/extraction/classification) that adds cost with no quality - gain, so callers pass thinking_budget=0. Only applies to gemini-2.5* models (Gemini 3 - uses thinking_level instead and ignores this). - - BYOK users still go through the OpenAI-compat endpoint via _create_byok_client(). - """ - key = (model_name, streaming, 'gemini', thinking_budget) - if key not in _llm_cache: - use_vertex = os.environ.get('USE_VERTEX_AI', '').lower() == 'true' - gcp_project = os.environ.get('GOOGLE_CLOUD_PROJECT', '') if use_vertex else '' - gemini_key = os.environ.get('GEMINI_API_KEY', '') - kwargs: Dict[str, Any] = {'callbacks': [_usage_callback], 'timeout': 120, 'max_retries': 1} - if streaming: - kwargs['streaming'] = True - - # thinking_budget is a native google-genai SDK construction param. It must only go to - # ChatGoogleGenerativeAI — passing it to the OpenAI-compat ChatOpenAI fallback leaks it into - # model_kwargs and crashes at invoke time ("Completions.parse() got an unexpected keyword - # argument 'thinking_budget'"). Keep it scoped to the native-SDK branches only. - gemini_kwargs = dict(kwargs) - if thinking_budget is not None and model_name.startswith('gemini-2.5'): - gemini_kwargs['thinking_budget'] = thinking_budget - - if gcp_project: - gcp_location = os.environ.get('GCP_LOCATION', 'us-central1') - _llm_cache[key] = ChatGoogleGenerativeAI( - model=model_name, project=gcp_project, location=gcp_location, **gemini_kwargs - ) - elif gemini_key: - gemini_kwargs['google_api_key'] = gemini_key - _llm_cache[key] = ChatGoogleGenerativeAI(model=model_name, **gemini_kwargs) - else: - logger.warning('No USE_VERTEX_AI or GEMINI_API_KEY — Gemini calls will fail at invoke time') - _llm_cache[key] = ChatOpenAI( - model=model_name, api_key='not-set', base_url=_GEMINI_OPENAI_BASE_URL, **kwargs - ) - return _llm_cache[key] - - -def _get_default_client(model: str, provider: str, streaming: bool, feature: str) -> BaseChatModel: - """Get the cached default client for a model/provider combo.""" - if provider == 'openrouter': - temp = _OPENROUTER_TEMPERATURES.get(feature) - return _get_or_create_openrouter_llm(model, streaming, temp) - if provider == 'gemini': - # All Gemini-routed features are mechanical (titles, follow-ups, onboarding, - # app integrations, trends) — reasoning features use Anthropic/OpenAI. Disable - # Gemini 2.5 "thinking" (on by default, billed as output tokens) for these. - return _get_or_create_gemini_llm(model, streaming, thinking_budget=0) - return _get_or_create_openai_llm(model, streaming) - - -def _effective_byok_provider(model: str, provider: str) -> str: - """Map provider to the actual BYOK key type needed (Gemini-based OpenRouter → Gemini key).""" - if provider == 'openrouter' and model.startswith('gemini'): - return 'gemini' - return provider + options: Dict[str, Any] = {} + if temperature is not None: + options['temperature'] = temperature + return get_or_create_openai_compatible_llm('openrouter', model_name, streaming, options) def get_llm(feature: str, streaming: bool = False, cache_key: Optional[str] = None) -> BaseChatModel: """Get the LLM client for a feature based on the active Model QoS profile. - Works for OpenAI, Gemini, and OpenRouter features. Returns a BaseChatModel - (ChatOpenAI for OpenAI/OpenRouter, ChatGoogleGenerativeAI for Gemini). - All share the same interface: .invoke(), .ainvoke(), .stream(), .with_structured_output(). - For Anthropic/Perplexity, use get_model(feature) to get the model string. - - Args: - feature: Feature name (e.g. 'conv_action_items', 'persona_chat'). - streaming: Whether to return a streaming-enabled client. - cache_key: Optional prompt cache routing key (OpenAI gpt-5.4/5.4-mini only). - - Usage: - llm = get_llm('conv_action_items', cache_key='omi-extract-actions') - response = llm.invoke(prompt) - - llm_stream = get_llm('chat_responses', streaming=True) - response = llm_stream.invoke(prompt, {'callbacks': callbacks}) + Works for OpenAI, Gemini, OpenRouter, and other registered OpenAI-compatible + providers. Returns a BaseChatModel. For Anthropic/Perplexity, use + get_model(feature) to get the model string and the provider-specific client. """ - if feature in _ANTHROPIC_ONLY_FEATURES: + if is_anthropic_only_feature(feature): raise ValueError( f"Feature '{feature}' is Anthropic — use get_model('{feature}') with anthropic_client instead of get_llm()" ) - if feature in _PERPLEXITY_ONLY_FEATURES: + if is_perplexity_only_feature(feature): raise ValueError( f"Feature '{feature}' is Perplexity — use get_model('{feature}') with the Perplexity HTTP client instead of get_llm()" ) @@ -615,20 +283,20 @@ def get_llm(feature: str, streaming: bool = False, cache_key: Optional[str] = No f"Feature '{feature}' resolved to Perplexity model '{model}' — use get_model() with Perplexity HTTP client" ) - # Log structured output compatibility when feature resolves to Gemini - if feature in _STRUCTURED_OUTPUT_FEATURES and provider == 'gemini': + if is_structured_output_feature(feature) and provider == 'gemini': logger.debug( - 'QoS structured_output on gemini: feature=%s model=%s profile=%s', feature, model, _active_profile_name + 'QoS structured_output on gemini: feature=%s model=%s profile=%s', + feature, + model, + get_active_profile_name(), ) - # BYOK resolution — if the user provided their own key, create a per-request client. - # When a BYOK QoS profile is configured, upgrade model selection for BYOK users. byok_provider = _effective_byok_provider(model, provider) byok_key = get_byok_key(byok_provider) + byok_profile = get_byok_profile() - if byok_key and _byok_profile: - # Try upgrading to BYOK profile's model selection - byok_model, byok_prov = _byok_profile.get(feature, (model, provider)) + if byok_key and byok_profile: + byok_model, byok_prov = byok_profile.get(feature, (model, provider)) byok_prov_eff = _effective_byok_provider(byok_model, byok_prov) byok_key_for_profile = get_byok_key(byok_prov_eff) if byok_key_for_profile: @@ -638,11 +306,15 @@ def get_llm(feature: str, streaming: bool = False, cache_key: Optional[str] = No if byok_key: byok_client = _create_byok_client(model, provider, byok_key, streaming, feature) - result = byok_client if byok_client is not None else _get_default_client(model, provider, streaming, feature) + result = ( + byok_client + if byok_client is not None + else get_default_client(model, provider, streaming, get_route_options(feature, model, provider)) + ) else: - result = _get_default_client(model, provider, streaming, feature) + result = get_default_client(model, provider, streaming, get_route_options(feature, model, provider)) - if cache_key and model in _CACHE_KEY_MODELS: + if cache_key and supports_prompt_cache(model): return result.bind(prompt_cache_key=cache_key) return result @@ -650,25 +322,26 @@ def get_llm(feature: str, streaming: bool = False, cache_key: Optional[str] = No def get_qos_info() -> Dict[str, Dict[str, str]]: """Return full feature→(model, provider) mapping for the active profile (debugging/monitoring).""" info: Dict[str, Dict[str, str]] = {} - all_features = set(_active_profile.keys()) | set(_PINNED_FEATURES.keys()) + active_profile = get_active_profile() + all_features = get_all_configured_features() for feature in sorted(all_features): model, provider = _get_model_config(feature) info[feature] = { 'model': model, - 'profile': _active_profile_name, + 'profile': get_active_profile_name(), 'provider': provider, } return info # Startup logging — log active profile so cost issues are traceable. -logger.info('Model QoS profile=%s (%d features)', _active_profile_name, len(_active_profile)) +_active_profile = get_active_profile() +logger.info('Model QoS profile=%s (%d features)', get_active_profile_name(), len(_active_profile)) for _feat, (_model, _provider) in sorted(_active_profile.items()): logger.info(' QoS %s: %s [%s]', _feat, _model, _provider) -logger.info('BYOK QoS profile=%s', _byok_profile_name) +logger.info('BYOK QoS profile=%s', get_byok_profile_name()) -# Log structured output features on Gemini for compatibility monitoring -_so_gemini = {f for f in _STRUCTURED_OUTPUT_FEATURES if _active_profile.get(f, _DEFAULT_CONFIG)[1] == 'gemini'} +_so_gemini = {f for f in _active_profile if is_structured_output_feature(f) and _get_model_config(f)[1] == 'gemini'} if _so_gemini: logger.info('Structured output features on Gemini: %s', ', '.join(sorted(_so_gemini))) diff --git a/backend/utils/llm/conversation_folder.py b/backend/utils/llm/conversation_folder.py new file mode 100644 index 00000000000..3bcdd9ef85f --- /dev/null +++ b/backend/utils/llm/conversation_folder.py @@ -0,0 +1,182 @@ +"""Conversation folder assignment LLM route. + +Folder assignment has route-specific safety semantics: returned folder IDs must +exist in the user's folder list, and low-confidence assignments fall back to the +user's default folder. Keeping that logic in one small module makes it safe to +change providers/models through ``get_llm('conv_folder')`` without duplicating +validation in each experiment or callsite. +""" + +import logging +from dataclasses import dataclass +from typing import List, Optional, Tuple + +from langchain_core.output_parsers import PydanticOutputParser +from langchain_core.prompts import ChatPromptTemplate +from pydantic import BaseModel, Field + +from .clients import get_llm + +logger = logging.getLogger(__name__) + + +class FolderAssignment(BaseModel): + """Model for AI folder assignment response.""" + + folder_id: str = Field(description="The ID of the best matching folder for this conversation") + confidence: float = Field( + default=0.5, ge=0.0, le=1.0, description="Confidence score for folder assignment (0.0 to 1.0)" + ) + reasoning: str = Field(default="", description="Brief explanation of why this folder was chosen") + + +@dataclass(frozen=True) +class FolderAssignmentResult: + folder_id: Optional[str] + confidence: float + reasoning: str + validation_status: str + + +def build_folders_context(folders: List[dict]) -> str: + """ + Build context string for LLM folder assignment using natural language descriptions. + + Each folder's description explains what conversations belong in it, + allowing the AI to match based on intent rather than keywords. + """ + if not folders: + return "No folders available. Use default assignment." + + lines = [] + for folder in folders: + folder_id = folder.get('id', '') + name = folder.get('name', '') + description = folder.get('description', '') + is_default = folder.get('is_default', False) + + # Format: folder_id | "Folder Name" → Description + if description: + line = f'- {folder_id} | "{name}" → {description}' + else: + line = f'- {folder_id} | "{name}"' + + if is_default: + line += " (DEFAULT - use when no other folder matches)" + + lines.append(line) + + return "\n".join(lines) + + +def build_conversation_folder_context(title: str, overview: str, category: str) -> str: + return f""" +Title: {title} +Category: {category} +Overview: {overview} +""".strip() + + +def get_default_folder_id(user_folders: List[dict]) -> Optional[str]: + default_folder = next((f for f in user_folders if f.get('is_default')), None) + return default_folder.get('id') if default_folder else None + + +def validate_folder_assignment( + response: FolderAssignment, + user_folders: List[dict], + default_folder_id: Optional[str], + confidence_threshold: float = 0.7, +) -> FolderAssignmentResult: + """Apply route-specific safety checks to a parsed folder assignment.""" + + valid_folder_ids = {f.get('id') for f in user_folders} + if response.folder_id not in valid_folder_ids: + return FolderAssignmentResult( + folder_id=default_folder_id, + confidence=0.3, + reasoning="Invalid folder ID returned, using default", + validation_status='invalid_folder_id_defaulted', + ) + + if response.confidence < confidence_threshold and default_folder_id: + return FolderAssignmentResult( + folder_id=default_folder_id, + confidence=response.confidence, + reasoning=f"Low confidence ({response.confidence:.2f}), using default folder", + validation_status='low_confidence_defaulted', + ) + + return FolderAssignmentResult( + folder_id=response.folder_id, + confidence=response.confidence, + reasoning=response.reasoning, + validation_status='accepted', + ) + + +def assign_conversation_to_folder( + title: str, + overview: str, + category: str, + user_folders: List[dict], +) -> Tuple[Optional[str], float, str]: + """ + Use AI to assign a conversation to the most appropriate folder. + + Args: + title: The conversation title + overview: The conversation overview/summary + category: The conversation category + user_folders: List of user's folders with id, name, description, is_default + + Returns: + Tuple of (folder_id, confidence, reasoning) + Returns (None, 0.0, reason) if assignment fails or confidence is too low + """ + if not user_folders: + return None, 0.0, "No folders available" + + folders_context = build_folders_context(user_folders) + default_folder_id = get_default_folder_id(user_folders) + conversation_context = build_conversation_folder_context(title, overview, category) + + prompt_text = '''You are a folder assignment system. Match the conversation to the folder that best represents its overall theme. + +FOLDERS: +{folders_context} + +CONVERSATION: +{conversation_context} + +INSTRUCTIONS: +- Match based on the dominant theme of the conversation (what it's fundamentally about) +- The folder should feel like a natural home for this conversation +- Only assign to a non-default folder if the theme clearly matches +- When in doubt, use the DEFAULT folder + +Provide: +- folder_id: The best matching folder ID from the list above +- confidence: Match strength (0.0-1.0). Use 0.9+ only for clear thematic matches, below 0.7 means use DEFAULT +- reasoning: One sentence explaining the match + +{format_instructions}''' + + folder_parser = PydanticOutputParser(pydantic_object=FolderAssignment) + prompt = ChatPromptTemplate.from_messages([('system', prompt_text)]) + chain = prompt | get_llm('conv_folder') | folder_parser + + try: + response: FolderAssignment = chain.invoke( + { + 'folders_context': folders_context, + 'conversation_context': conversation_context, + 'format_instructions': folder_parser.get_format_instructions(), + } + ) + result = validate_folder_assignment(response, user_folders, default_folder_id) + return result.folder_id, result.confidence, result.reasoning + + except Exception as e: + logger.error(f'Error assigning conversation to folder: {e}') + return default_folder_id, 0.0, f"Error: {str(e)}" diff --git a/backend/utils/llm/conversation_processing.py b/backend/utils/llm/conversation_processing.py index e46a03c2ee8..7d4cde2ea62 100644 --- a/backend/utils/llm/conversation_processing.py +++ b/backend/utils/llm/conversation_processing.py @@ -13,6 +13,7 @@ from models.conversation_photo import ConversationPhoto from models.structured import ActionItem, ActionItemsExtraction, Event, Structured from .clients import get_llm, parser +from utils.llm.conversation_folder import FolderAssignment, assign_conversation_to_folder, build_folders_context import logging logger = logging.getLogger(__name__) @@ -20,136 +21,8 @@ # ============================================= # FOLDER ASSIGNMENT # ============================================= - - -class FolderAssignment(BaseModel): - """Model for AI folder assignment response.""" - - folder_id: str = Field(description="The ID of the best matching folder for this conversation") - confidence: float = Field( - default=0.5, ge=0.0, le=1.0, description="Confidence score for folder assignment (0.0 to 1.0)" - ) - reasoning: str = Field(default="", description="Brief explanation of why this folder was chosen") - - -def build_folders_context(folders: List[dict]) -> str: - """ - Build context string for LLM folder assignment using natural language descriptions. - - Each folder's description explains what conversations belong in it, - allowing the AI to match based on intent rather than keywords. - """ - if not folders: - return "No folders available. Use default assignment." - - lines = [] - for folder in folders: - folder_id = folder.get('id', '') - name = folder.get('name', '') - description = folder.get('description', '') - is_default = folder.get('is_default', False) - - # Format: folder_id | "Folder Name" → Description - if description: - line = f'- {folder_id} | "{name}" → {description}' - else: - line = f'- {folder_id} | "{name}"' - - if is_default: - line += " (DEFAULT - use when no other folder matches)" - - lines.append(line) - - return "\n".join(lines) - - -def assign_conversation_to_folder( - title: str, - overview: str, - category: str, - user_folders: List[dict], -) -> Tuple[Optional[str], float, str]: - """ - Use AI to assign a conversation to the most appropriate folder. - - Args: - title: The conversation title - overview: The conversation overview/summary - category: The conversation category - user_folders: List of user's folders with id, name, description, is_default - - Returns: - Tuple of (folder_id, confidence, reasoning) - Returns (None, 0.0, reason) if assignment fails or confidence is too low - """ - if not user_folders: - return None, 0.0, "No folders available" - - folders_context = build_folders_context(user_folders) - - # Find default folder for fallback - default_folder = next((f for f in user_folders if f.get('is_default')), None) - default_folder_id = default_folder.get('id') if default_folder else None - - # Build conversation context - conversation_context = f""" -Title: {title} -Category: {category} -Overview: {overview} -""".strip() - - prompt_text = '''You are a folder assignment system. Match the conversation to the folder that best represents its overall theme. - -FOLDERS: -{folders_context} - -CONVERSATION: -{conversation_context} - -INSTRUCTIONS: -- Match based on the dominant theme of the conversation (what it's fundamentally about) -- The folder should feel like a natural home for this conversation -- Only assign to a non-default folder if the theme clearly matches -- When in doubt, use the DEFAULT folder - -Provide: -- folder_id: The best matching folder ID from the list above -- confidence: Match strength (0.0-1.0). Use 0.9+ only for clear thematic matches, below 0.7 means use DEFAULT -- reasoning: One sentence explaining the match - -{format_instructions}''' - - folder_parser = PydanticOutputParser(pydantic_object=FolderAssignment) - prompt = ChatPromptTemplate.from_messages([('system', prompt_text)]) - chain = prompt | get_llm('conv_folder') | folder_parser - - try: - response: FolderAssignment = chain.invoke( - { - 'folders_context': folders_context, - 'conversation_context': conversation_context, - 'format_instructions': folder_parser.get_format_instructions(), - } - ) - - # Validate the folder_id exists - valid_folder_ids = {f.get('id') for f in user_folders} - if response.folder_id not in valid_folder_ids: - return default_folder_id, 0.3, f"Invalid folder ID returned, using default" - - # If confidence is too low, use default folder - if response.confidence < 0.7 and default_folder_id: - return ( - default_folder_id, - response.confidence, - f"Low confidence ({response.confidence:.2f}), using default folder", - ) - - return response.folder_id, response.confidence, response.reasoning - - except Exception as e: - logger.error(f'Error assigning conversation to folder: {e}') - return default_folder_id, 0.0, f"Error: {str(e)}" +# The implementation moved to conversation_folder.py; that route still uses +# get_llm('conv_folder') as the production model/provider plug-in seam. class DiscardConversation(BaseModel): diff --git a/backend/utils/llm/model_config.py b/backend/utils/llm/model_config.py new file mode 100644 index 00000000000..dec6e79c4d1 --- /dev/null +++ b/backend/utils/llm/model_config.py @@ -0,0 +1,317 @@ +"""Model/profile configuration for backend LLM feature routing. + +This module is the source of truth for feature → (model, provider) routing. +Provider-specific client construction lives in ``providers.py``; callers should +continue to use ``clients.get_llm(feature)``. +""" + +import logging +import os +from typing import Dict, Tuple + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Model QoS Profile System +# +# Each profile maps every feature to a (model, provider) tuple. +# The profile is the SINGLE SOURCE OF TRUTH for both model and provider. +# Provider is never inferred from model name — it is declared explicitly. +# +# This means the same model can be hosted by different providers: +# feature_a: ('gemini-2.5-flash', 'gemini') → Google direct +# feature_b: ('gemini-2.5-flash', 'openrouter') → OpenRouter +# +# Global switch: MODEL_QOS=premium (selects entire profile) +# +# Profiles: +# premium — maximize cost savings while preserving 80% of max quality +# max — 100% quality, best models available, no cost optimization +# byok — same models as max (BYOK users pay their own API costs) +# --------------------------------------------------------------------------- + +MODEL_QOS_PROFILES: Dict[str, Dict[str, Tuple[str, str]]] = { + # ----------------------------------------------------------------------- + # premium — maximize cost savings while preserving 80% of max quality. + # Uses gpt-5.4-mini (not gpt-5.4) for core features, gpt-4.1-mini (not gpt-4.1) + # for quality-sensitive tasks, gpt-4.1-nano for simple routing/classification, + # and Gemini flash-lite for low-complexity free-text (titles, followups, onboarding). + # ----------------------------------------------------------------------- + 'premium': { + # OpenAI — conversation processing + 'conv_action_items': ('gpt-5.4-mini', 'openai'), + 'conv_structure': ('gpt-5.4-mini', 'openai'), + 'conv_app_result': ('gpt-5.4-mini', 'openai'), + 'conv_app_select': ('gpt-4.1-nano', 'openai'), + 'conv_folder': ('gpt-4.1-nano', 'openai'), + 'conv_discard': ('gpt-4.1-nano', 'openai'), + 'daily_summary': ('gpt-5.4-mini', 'openai'), + 'daily_summary_simple': ('gpt-4.1-nano', 'openai'), + 'external_structure': ('gpt-4.1-mini', 'openai'), + # OpenAI — memories & knowledge + 'memories': ('gpt-4.1-mini', 'openai'), + 'learnings': ('gpt-5.4-mini', 'openai'), + 'memory_conflict': ('gpt-4.1-mini', 'openai'), + 'memory_category': ('gpt-4.1-nano', 'openai'), + 'knowledge_graph': ('gpt-4.1-mini', 'openai'), + # OpenAI — chat + 'chat_responses': ('gpt-5.4-mini', 'openai'), + 'chat_extraction': ('gpt-4.1-mini', 'openai'), + 'chat_graph': ('gpt-4.1-mini', 'openai'), + 'session_titles': ('gemini-2.5-flash-lite', 'gemini'), + # Features + 'goals': ('gpt-4.1-mini', 'openai'), + 'goals_advice': ('gpt-5.4-mini', 'openai'), + 'notifications': ('gpt-5.4-mini', 'openai'), + 'proactive_notification': ('gpt-4.1-mini', 'openai'), + 'followup': ('gemini-2.5-flash-lite', 'gemini'), + 'smart_glasses': ('gpt-4.1-nano', 'openai'), + 'openglass': ('gpt-4.1-mini', 'openai'), + 'onboarding': ('gemini-2.5-flash-lite', 'gemini'), + 'app_generator': ('gpt-5.4-mini', 'openai'), + 'app_integration': ('gemini-2.5-flash-lite', 'gemini'), + 'persona_clone': ('gpt-5.4-mini', 'openai'), + 'trends': ('gemini-2.5-flash-lite', 'gemini'), + # Anthropic (used via get_model() + anthropic_client) + 'chat_agent': ('claude-sonnet-4-6', 'anthropic'), + # Persona + 'persona_chat': ('gpt-4.1-nano', 'openai'), + 'persona_chat_premium': ('gpt-5.4-mini', 'openai'), + # OpenRouter + 'wrapped_analysis': ('gemini-3-flash-preview', 'openrouter'), + # Perplexity + 'web_search': ('sonar-pro', 'perplexity'), + }, + # ----------------------------------------------------------------------- + # max — 100% quality, best models available, no cost optimization. + # Uses gpt-5.4 for all core features, o4-mini for reasoning (learnings), + # gpt-4.1 for chat graph. Pure OpenAI for highest accuracy. + # ----------------------------------------------------------------------- + 'max': { + # OpenAI — conversation processing + 'conv_action_items': ('gpt-5.4', 'openai'), + 'conv_structure': ('gpt-5.4', 'openai'), + 'conv_app_result': ('gpt-5.4', 'openai'), + 'conv_app_select': ('gpt-4.1-mini', 'openai'), + 'conv_folder': ('gpt-4.1-mini', 'openai'), + 'conv_discard': ('gpt-4.1-mini', 'openai'), + 'daily_summary': ('gpt-5.4', 'openai'), + 'daily_summary_simple': ('gpt-4.1-mini', 'openai'), + 'external_structure': ('gpt-4.1-mini', 'openai'), + # OpenAI — memories & knowledge + 'memories': ('gpt-4.1-mini', 'openai'), + 'learnings': ('o4-mini', 'openai'), + 'memory_conflict': ('gpt-4.1-mini', 'openai'), + 'memory_category': ('gpt-4.1-mini', 'openai'), + 'knowledge_graph': ('gpt-4.1-mini', 'openai'), + # OpenAI — chat + 'chat_responses': ('gpt-5.4', 'openai'), + 'chat_extraction': ('gpt-4.1-mini', 'openai'), + 'chat_graph': ('gpt-4.1', 'openai'), + 'session_titles': ('gpt-4.1-mini', 'openai'), + # Features + 'goals': ('gpt-4.1-mini', 'openai'), + 'goals_advice': ('gpt-5.4', 'openai'), + 'notifications': ('gpt-5.4', 'openai'), + 'proactive_notification': ('gpt-4.1-mini', 'openai'), + 'followup': ('gpt-4.1-mini', 'openai'), + 'smart_glasses': ('gpt-4.1-mini', 'openai'), + 'openglass': ('gpt-4.1-mini', 'openai'), + 'onboarding': ('gpt-4.1-mini', 'openai'), + 'app_generator': ('gpt-5.4', 'openai'), + 'app_integration': ('gpt-4.1-mini', 'openai'), + 'persona_clone': ('gpt-5.4', 'openai'), + 'trends': ('gpt-4.1-mini', 'openai'), + # Anthropic + 'chat_agent': ('claude-sonnet-4-6', 'anthropic'), + # Persona + 'persona_chat': ('gpt-4.1-nano', 'openai'), + 'persona_chat_premium': ('gpt-5.4-mini', 'openai'), + # OpenRouter + 'wrapped_analysis': ('gemini-3-flash-preview', 'openrouter'), + # Perplexity + 'web_search': ('sonar-pro', 'perplexity'), + }, + # ----------------------------------------------------------------------- + # byok — same models as max. BYOK users pay their own API costs so they + # get the same best-quality routing as max subscribers. + # ----------------------------------------------------------------------- + 'byok': { + # OpenAI — conversation processing + 'conv_action_items': ('gpt-5.4', 'openai'), + 'conv_structure': ('gpt-5.4', 'openai'), + 'conv_app_result': ('gpt-5.4', 'openai'), + 'conv_app_select': ('gpt-4.1-mini', 'openai'), + 'conv_folder': ('gpt-4.1-mini', 'openai'), + 'conv_discard': ('gpt-4.1-mini', 'openai'), + 'daily_summary': ('gpt-5.4', 'openai'), + 'daily_summary_simple': ('gpt-4.1-mini', 'openai'), + 'external_structure': ('gpt-4.1-mini', 'openai'), + # OpenAI — memories & knowledge + 'memories': ('gpt-4.1-mini', 'openai'), + 'learnings': ('o4-mini', 'openai'), + 'memory_conflict': ('gpt-4.1-mini', 'openai'), + 'memory_category': ('gpt-4.1-mini', 'openai'), + 'knowledge_graph': ('gpt-4.1-mini', 'openai'), + # OpenAI — chat + 'chat_responses': ('gpt-5.4', 'openai'), + 'chat_extraction': ('gpt-4.1-mini', 'openai'), + 'chat_graph': ('gpt-4.1', 'openai'), + 'session_titles': ('gpt-4.1-mini', 'openai'), + # Features + 'goals': ('gpt-4.1-mini', 'openai'), + 'goals_advice': ('gpt-5.4', 'openai'), + 'notifications': ('gpt-5.4', 'openai'), + 'proactive_notification': ('gpt-4.1-mini', 'openai'), + 'followup': ('gpt-4.1-mini', 'openai'), + 'smart_glasses': ('gpt-4.1-mini', 'openai'), + 'openglass': ('gpt-4.1-mini', 'openai'), + 'onboarding': ('gpt-4.1-mini', 'openai'), + 'app_generator': ('gpt-5.4', 'openai'), + 'app_integration': ('gpt-4.1-mini', 'openai'), + 'persona_clone': ('gpt-5.4', 'openai'), + 'trends': ('gpt-4.1-mini', 'openai'), + # Anthropic + 'chat_agent': ('claude-sonnet-4-6', 'anthropic'), + # Persona + 'persona_chat': ('gpt-4.1-nano', 'openai'), + 'persona_chat_premium': ('gpt-5.4-mini', 'openai'), + # OpenRouter + 'wrapped_analysis': ('gemini-3-flash-preview', 'openrouter'), + # Perplexity + 'web_search': ('sonar-pro', 'perplexity'), + }, +} + +# Pinned features — (model, provider) fixed regardless of profile or env override. +_PINNED_FEATURES: Dict[str, Tuple[str, str]] = { + 'fair_use': ('gpt-5.1', 'openai'), +} + +# Resolve active profile once at startup. +_active_profile_name = os.environ.get('MODEL_QOS', 'premium').strip().lower() +if _active_profile_name not in MODEL_QOS_PROFILES: + logger.warning('MODEL_QOS=%s is not a valid profile, falling back to premium', _active_profile_name) + _active_profile_name = 'premium' +_active_profile = MODEL_QOS_PROFILES[_active_profile_name] + +# BYOK QoS — all BYOK users get routed to 'byok' profile (top-tier all-OpenAI). +# BYOK users pay their own API costs, so we give them maximum quality models. +_byok_profile_name = 'byok' +_byok_profile = MODEL_QOS_PROFILES[_byok_profile_name] + +# Features that can't go through get_llm() (non-ChatOpenAI providers). +_ANTHROPIC_ONLY_FEATURES = {'chat_agent'} +_PERPLEXITY_ONLY_FEATURES = {'web_search'} + + +# Feature-specific client config (temperature, headers — orthogonal to model choice). +# Only applied when a feature resolves to an OpenRouter model. +_OPENROUTER_TEMPERATURES: Dict[str, float] = { + 'persona_chat': 0.8, + 'persona_chat_premium': 0.8, + 'wrapped_analysis': 0.7, +} + +# Models that support OpenAI prompt caching (prompt_cache_key routing). +_CACHE_KEY_MODELS = {'gpt-5.4', 'gpt-5.4-mini'} + +# Features that call .with_structured_output() — logged when resolving to Gemini for compat monitoring. +_STRUCTURED_OUTPUT_FEATURES = { + 'chat_extraction', + 'proactive_notification', + 'conv_app_select', + 'external_structure', + 'trends', +} + +_DEFAULT_CONFIG: Tuple[str, str] = ('gpt-4.1-mini', 'openai') + + +def _get_model_config(feature: str) -> Tuple[str, str]: + """Get the (model, provider) tuple for a feature. Internal — used by get_llm/get_model/get_provider. + + Resolution order: pinned > active profile > fallback. + """ + if feature in _PINNED_FEATURES: + return _PINNED_FEATURES[feature] + return _active_profile.get(feature, _DEFAULT_CONFIG) + + +def get_model(feature: str) -> str: + """Get the model name for a feature from the active Model QoS profile. + + Resolution order: pinned > active profile > fallback. + + Args: + feature: Feature name (e.g. 'conv_action_items', 'chat_agent'). + + Returns: + Model name string (e.g. 'gpt-4.1-mini', 'claude-sonnet-4-6'). + """ + return _get_model_config(feature)[0] + + +def get_provider(feature: str) -> str: + """Get the provider for a feature from the active Model QoS profile. + + Returns: + Provider string: 'openai', 'gemini', 'openrouter', 'anthropic', 'perplexity'. + """ + return _get_model_config(feature)[1] + + +def get_route_options(feature: str, model: str, provider: str) -> Dict[str, object]: + """Return provider/model construction options for a resolved route.""" + + options: Dict[str, object] = {} + if model == 'gpt-5.1': + options['extra_body'] = {"prompt_cache_retention": "24h"} + if provider == 'openrouter': + temperature = _OPENROUTER_TEMPERATURES.get(feature) + if temperature is not None: + options['temperature'] = temperature + if provider == 'gemini': + # Mechanical Gemini-routed features do not need paid thinking tokens. + options['thinking_budget'] = 0 + return options + + +def supports_prompt_cache(model: str) -> bool: + return model in _CACHE_KEY_MODELS + + +def is_structured_output_feature(feature: str) -> bool: + return feature in _STRUCTURED_OUTPUT_FEATURES + + +def is_anthropic_only_feature(feature: str) -> bool: + return feature in _ANTHROPIC_ONLY_FEATURES + + +def is_perplexity_only_feature(feature: str) -> bool: + return feature in _PERPLEXITY_ONLY_FEATURES + + +def get_active_profile_name() -> str: + return _active_profile_name + + +def get_active_profile() -> Dict[str, Tuple[str, str]]: + return _active_profile + + +def get_all_configured_features() -> set[str]: + return set(_active_profile.keys()) | set(_PINNED_FEATURES.keys()) + + +def get_default_config() -> Tuple[str, str]: + return _DEFAULT_CONFIG + + +def get_byok_profile() -> Dict[str, Tuple[str, str]]: + return _byok_profile + + +def get_byok_profile_name() -> str: + return _byok_profile_name diff --git a/backend/utils/llm/providers.py b/backend/utils/llm/providers.py new file mode 100644 index 00000000000..d4c56820d20 --- /dev/null +++ b/backend/utils/llm/providers.py @@ -0,0 +1,159 @@ +"""Provider-specific chat model construction for LLM feature routing. + +This module owns the mechanics of turning a resolved provider/model route into a +LangChain ``BaseChatModel``. Keep product features out of this file: callers +should route by feature through ``utils.llm.clients.get_llm()`` and let the model +configuration decide which provider/model to use. +""" + +import logging +import os +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + +from langchain_core.language_models import BaseChatModel +from langchain_google_genai import ChatGoogleGenerativeAI +from langchain_openai import ChatOpenAI +from pydantic import SecretStr + +from utils.llm.usage_tracker import get_usage_callback + +logger = logging.getLogger(__name__) + +_usage_callback = get_usage_callback() + +# Google's OpenAI-compatible endpoint — used only for BYOK users who bring their +# own AI Studio API key. Platform Gemini calls use ChatGoogleGenerativeAI. +GEMINI_OPENAI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/" + + +@dataclass(frozen=True) +class OpenAICompatibleProviderConfig: + """Configuration for providers served through ChatOpenAI-compatible APIs.""" + + name: str + api_key_env: str + base_url: Optional[str] = None + default_headers: Dict[str, str] = field(default_factory=dict) + prefix_google_models: bool = False + + +OPENAI_COMPATIBLE_PROVIDERS: Dict[str, OpenAICompatibleProviderConfig] = { + 'openai': OpenAICompatibleProviderConfig(name='openai', api_key_env='OPENAI_API_KEY'), + 'openrouter': OpenAICompatibleProviderConfig( + name='openrouter', + api_key_env='OPENROUTER_API_KEY', + base_url="https://openrouter.ai/api/v1", + default_headers={"X-Title": "Omi Chat"}, + prefix_google_models=True, + ), +} + +_llm_cache: Dict[tuple, Any] = {} + + +def _cache_key(provider: str, model_name: str, streaming: bool, options: Dict[str, Any]) -> tuple: + option_items = tuple(sorted((key, repr(value)) for key, value in options.items())) + return provider, model_name, streaming, option_items + + +def _api_model_name(provider_config: OpenAICompatibleProviderConfig, model_name: str) -> str: + if provider_config.prefix_google_models and model_name.startswith('gemini'): + return f'google/{model_name}' + return model_name + + +def get_or_create_openai_compatible_llm( + provider: str, + model_name: str, + streaming: bool = False, + options: Optional[Dict[str, Any]] = None, +) -> ChatOpenAI: + """Get or create a cached ChatOpenAI-compatible chat model.""" + + options = options or {} + if provider not in OPENAI_COMPATIBLE_PROVIDERS: + raise ValueError(f"Unknown OpenAI-compatible provider '{provider}'") + + provider_config = OPENAI_COMPATIBLE_PROVIDERS[provider] + key = _cache_key(provider, model_name, streaming, options) + if key not in _llm_cache: + kwargs: Dict[str, Any] = { + 'callbacks': [_usage_callback], + 'request_timeout': options.get('request_timeout', 120), + 'max_retries': options.get('max_retries', 1), + } + api_key = os.environ.get(provider_config.api_key_env) + if api_key: + kwargs['api_key'] = api_key + if provider_config.base_url: + kwargs['base_url'] = provider_config.base_url + if provider_config.default_headers: + kwargs['default_headers'] = provider_config.default_headers + if options.get('extra_body'): + kwargs['extra_body'] = options['extra_body'] + if 'temperature' in options: + kwargs['temperature'] = options['temperature'] + if streaming: + kwargs['streaming'] = True + kwargs['stream_options'] = {"include_usage": True} + + _llm_cache[key] = ChatOpenAI(model=_api_model_name(provider_config, model_name), **kwargs) + return _llm_cache[key] + + +def get_or_create_gemini_llm( + model_name: str, streaming: bool = False, thinking_budget: Optional[int] = None +) -> BaseChatModel: + """Get or create a cached ChatGoogleGenerativeAI for a Gemini model via native SDK. + + Routing priority: + 1. USE_VERTEX_AI=true + GOOGLE_CLOUD_PROJECT → Vertex AI + 2. GEMINI_API_KEY set → AI Studio + 3. Neither → placeholder that fails at invoke time (unit tests) + + BYOK users still go through the OpenAI-compatible Gemini endpoint in clients.py. + """ + + key = (model_name, streaming, 'gemini', thinking_budget) + if key not in _llm_cache: + use_vertex = os.environ.get('USE_VERTEX_AI', '').lower() == 'true' + gcp_project = os.environ.get('GOOGLE_CLOUD_PROJECT', '') if use_vertex else '' + gemini_key = os.environ.get('GEMINI_API_KEY', '') + kwargs: Dict[str, Any] = {'callbacks': [_usage_callback], 'timeout': 120, 'max_retries': 1} + if streaming: + kwargs['streaming'] = True + if thinking_budget is not None and model_name.startswith('gemini-2.5'): + kwargs['thinking_budget'] = thinking_budget + + if gcp_project: + gcp_location = os.environ.get('GCP_LOCATION', 'us-central1') + _llm_cache[key] = ChatGoogleGenerativeAI( + model=model_name, project=gcp_project, location=gcp_location, **kwargs + ) + elif gemini_key: + kwargs['google_api_key'] = gemini_key + _llm_cache[key] = ChatGoogleGenerativeAI(model=model_name, **kwargs) + else: + logger.warning('No USE_VERTEX_AI or GEMINI_API_KEY — Gemini calls will fail at invoke time') + _llm_cache[key] = ChatOpenAI( + model=model_name, + api_key=SecretStr('not-set'), + base_url=GEMINI_OPENAI_BASE_URL, + **kwargs, + ) + return _llm_cache[key] + + +def get_default_client( + model: str, + provider: str, + streaming: bool, + options: Optional[Dict[str, Any]] = None, +) -> BaseChatModel: + """Get the cached default client for a model/provider combo.""" + + options = options or {} + if provider == 'gemini': + return get_or_create_gemini_llm(model, streaming, thinking_budget=options.get('thinking_budget')) + return get_or_create_openai_compatible_llm(provider, model, streaming, options) From 27c6c40acb86ac6520ce749407e93db5cb6a0d91 Mon Sep 17 00:00:00 2001 From: Codex Agent Date: Mon, 15 Jun 2026 07:33:36 +0000 Subject: [PATCH 2/3] test(backend): cover LLM provider routing compatibility --- .../test_llm_provider_plugin_structure.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/backend/tests/unit/test_llm_provider_plugin_structure.py b/backend/tests/unit/test_llm_provider_plugin_structure.py index f45d60c52c6..82fb4455cd1 100644 --- a/backend/tests/unit/test_llm_provider_plugin_structure.py +++ b/backend/tests/unit/test_llm_provider_plugin_structure.py @@ -61,6 +61,65 @@ def bind(self, **kwargs): return self +def test_clients_facade_preserves_legacy_compatibility_exports(): + import utils.llm.clients as clients + + for name in [ + '_GEMINI_OPENAI_BASE_URL', + '_DEFAULT_CONFIG', + 'MODEL_QOS_PROFILES', + '_PINNED_FEATURES', + '_CACHE_KEY_MODELS', + '_STRUCTURED_OUTPUT_FEATURES', + '_ANTHROPIC_ONLY_FEATURES', + '_PERPLEXITY_ONLY_FEATURES', + ]: + assert hasattr(clients, name), f'utils.llm.clients no longer exports {name}' + + +def test_openai_compatible_provider_cache_separates_route_options(monkeypatch): + FakeChatOpenAI.calls.clear() + monkeypatch.setattr(providers, 'ChatOpenAI', FakeChatOpenAI) + monkeypatch.setenv('OPENROUTER_API_KEY', '***') + + low_temp = providers.get_or_create_openai_compatible_llm( + 'openrouter', 'anthropic/claude-sonnet-4', options={'temperature': 0.1} + ) + high_temp = providers.get_or_create_openai_compatible_llm( + 'openrouter', 'anthropic/claude-sonnet-4', options={'temperature': 0.7} + ) + high_temp_again = providers.get_or_create_openai_compatible_llm( + 'openrouter', 'anthropic/claude-sonnet-4', options={'temperature': 0.7} + ) + + assert low_temp is not high_temp + assert high_temp is high_temp_again + assert FakeChatOpenAI.calls[0]['temperature'] == 0.1 + assert FakeChatOpenAI.calls[1]['temperature'] == 0.7 + + +def test_get_llm_facade_still_uses_provider_factory_options(monkeypatch): + import utils.llm.clients as clients + + captured = {} + + def fake_default_client(model, provider, streaming=False, options=None): + captured.update(model=model, provider=provider, streaming=streaming, options=options) + return FakeChatOpenAI(model=model, provider=provider, streaming=streaming, **(options or {})) + + monkeypatch.setattr(clients, 'get_default_client', fake_default_client) + + llm = clients.get_llm('wrapped_analysis') + + assert isinstance(llm, FakeChatOpenAI) + assert captured == { + 'model': 'gemini-3-flash-preview', + 'provider': 'openrouter', + 'streaming': False, + 'options': {'temperature': 0.7}, + } + + def test_openai_compatible_provider_applies_base_url_headers_and_google_prefix(monkeypatch): FakeChatOpenAI.calls.clear() providers._llm_cache.clear() From 37880df796cc3d77cb915c28d04c733baf153e2e Mon Sep 17 00:00:00 2001 From: David Zhang Date: Fri, 19 Jun 2026 07:08:47 +0700 Subject: [PATCH 3/3] fix: strip thinking_budget from ChatOpenAI fallback kwargs (P2 bot fix) thinking_budget is a ChatGoogleGenerativeAI-only parameter that causes ChatOpenAI to reject at invoke time in test/local environments without Gemini credentials. --- backend/utils/llm/providers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/utils/llm/providers.py b/backend/utils/llm/providers.py index d4c56820d20..034e773c098 100644 --- a/backend/utils/llm/providers.py +++ b/backend/utils/llm/providers.py @@ -136,11 +136,14 @@ def get_or_create_gemini_llm( _llm_cache[key] = ChatGoogleGenerativeAI(model=model_name, **kwargs) else: logger.warning('No USE_VERTEX_AI or GEMINI_API_KEY — Gemini calls will fail at invoke time') + # Strip thinking_budget — it's a ChatGoogleGenerativeAI-only param + # that ChatOpenAI rejects at invoke time. + fallback_kwargs = {k: v for k, v in kwargs.items() if k != 'thinking_budget'} _llm_cache[key] = ChatOpenAI( model=model_name, api_key=SecretStr('not-set'), base_url=GEMINI_OPENAI_BASE_URL, - **kwargs, + **fallback_kwargs, ) return _llm_cache[key]