diff --git a/phases/00-setup-and-tooling/04-apis-and-keys/code/first_api_call.py b/phases/00-setup-and-tooling/04-apis-and-keys/code/first_api_call.py index 579455f1f..6c67288b4 100644 --- a/phases/00-setup-and-tooling/04-apis-and-keys/code/first_api_call.py +++ b/phases/00-setup-and-tooling/04-apis-and-keys/code/first_api_call.py @@ -4,6 +4,7 @@ def call_with_sdk(): + """Make an LLM API call using the official Anthropic Python SDK.""" try: import anthropic except ImportError: @@ -12,7 +13,7 @@ def call_with_sdk(): client = anthropic.Anthropic() response = client.messages.create( - model="claude-sonnet-4-20250514", + model="claude-sonnet-4-6", max_tokens=256, messages=[{"role": "user", "content": "What is a neural network in one sentence?"}] ) @@ -21,6 +22,7 @@ def call_with_sdk(): def call_raw_http(): + """Make a raw HTTP API call to Anthropic's Messages endpoint using urllib.""" api_key = os.environ.get("ANTHROPIC_API_KEY") if not api_key: print("Set ANTHROPIC_API_KEY environment variable first") @@ -33,7 +35,7 @@ def call_raw_http(): "anthropic-version": "2023-06-01", } body = json.dumps({ - "model": "claude-sonnet-4-20250514", + "model": "claude-sonnet-4-6", "max_tokens": 256, "messages": [{"role": "user", "content": "What is a neural network in one sentence?"}], }).encode() diff --git a/phases/00-setup-and-tooling/04-apis-and-keys/docs/en.md b/phases/00-setup-and-tooling/04-apis-and-keys/docs/en.md index e222a85b6..0e7b2d584 100644 --- a/phases/00-setup-and-tooling/04-apis-and-keys/docs/en.md +++ b/phases/00-setup-and-tooling/04-apis-and-keys/docs/en.md @@ -60,7 +60,7 @@ import anthropic client = anthropic.Anthropic() response = client.messages.create( - model="claude-sonnet-4-20250514", + model="claude-sonnet-4-6", max_tokens=256, messages=[{"role": "user", "content": "What is a neural network in one sentence?"}] ) @@ -76,7 +76,7 @@ import Anthropic from "@anthropic-ai/sdk"; const client = new Anthropic(); const response = await client.messages.create({ - model: "claude-sonnet-4-20250514", + model: "claude-sonnet-4-6", max_tokens: 256, messages: [{ role: "user", content: "What is a neural network in one sentence?" }], }); @@ -98,7 +98,7 @@ headers = { "anthropic-version": "2023-06-01", } body = json.dumps({ - "model": "claude-sonnet-4-20250514", + "model": "claude-sonnet-4-6", "max_tokens": 256, "messages": [{"role": "user", "content": "What is a neural network in one sentence?"}], }).encode() diff --git a/phases/11-llm-engineering/06-rag/docs/en.md b/phases/11-llm-engineering/06-rag/docs/en.md index f01049bd2..5c66a117a 100644 --- a/phases/11-llm-engineering/06-rag/docs/en.md +++ b/phases/11-llm-engineering/06-rag/docs/en.md @@ -358,7 +358,7 @@ client = anthropic.Anthropic() def generate(prompt): response = client.messages.create( - model="claude-sonnet-4-20250514", + model="claude-sonnet-4-6", max_tokens=1024, messages=[{"role": "user", "content": prompt}] ) diff --git a/phases/11-llm-engineering/07-advanced-rag/docs/en.md b/phases/11-llm-engineering/07-advanced-rag/docs/en.md index 7eda62f9a..257230223 100644 --- a/phases/11-llm-engineering/07-advanced-rag/docs/en.md +++ b/phases/11-llm-engineering/07-advanced-rag/docs/en.md @@ -459,7 +459,7 @@ client = anthropic.Anthropic() def hyde_with_llm(query): response = client.messages.create( - model="claude-sonnet-4-20250514", + model="claude-sonnet-4-6", max_tokens=256, messages=[{ "role": "user", diff --git a/phases/11-llm-engineering/09-function-calling/docs/en.md b/phases/11-llm-engineering/09-function-calling/docs/en.md index e9c66c06a..6d65973c9 100644 --- a/phases/11-llm-engineering/09-function-calling/docs/en.md +++ b/phases/11-llm-engineering/09-function-calling/docs/en.md @@ -612,7 +612,7 @@ OpenAI returns tool calls as `response.choices[0].message.tool_calls`. Each call # client = anthropic.Anthropic() # # response = client.messages.create( -# model="claude-sonnet-4-20250514", +# model="claude-sonnet-4-6", # max_tokens=1024, # tools=[{ # "name": "get_weather", @@ -633,7 +633,7 @@ OpenAI returns tool calls as `response.choices[0].message.tool_calls`. Each call # result = get_weather(**tool_block.input) # # final = client.messages.create( -# model="claude-sonnet-4-20250514", +# model="claude-sonnet-4-6", # max_tokens=1024, # tools=[...], # messages=[ diff --git a/phases/11-llm-engineering/10-evaluation/docs/en.md b/phases/11-llm-engineering/10-evaluation/docs/en.md index 30b7e26e4..ffd3410bf 100644 --- a/phases/11-llm-engineering/10-evaluation/docs/en.md +++ b/phases/11-llm-engineering/10-evaluation/docs/en.md @@ -741,7 +741,7 @@ if __name__ == "__main__": # # providers: # - openai:gpt-4o -# - anthropic:messages:claude-sonnet-4-20250514 +# - anthropic:messages:claude-sonnet-4-6 # # tests: # - vars: diff --git a/phases/11-llm-engineering/11-caching-cost/docs/en.md b/phases/11-llm-engineering/11-caching-cost/docs/en.md index 934282469..3923ac87f 100644 --- a/phases/11-llm-engineering/11-caching-cost/docs/en.md +++ b/phases/11-llm-engineering/11-caching-cost/docs/en.md @@ -758,7 +758,7 @@ if __name__ == "__main__": # client = anthropic.Anthropic() # # response = client.messages.create( -# model="claude-sonnet-4-20250514", +# model="claude-sonnet-4-6", # max_tokens=1024, # system=[ # { diff --git a/phases/11-llm-engineering/13-production-app/code/production_app.py b/phases/11-llm-engineering/13-production-app/code/production_app.py index b32225c07..30cb03981 100644 --- a/phases/11-llm-engineering/13-production-app/code/production_app.py +++ b/phases/11-llm-engineering/13-production-app/code/production_app.py @@ -14,7 +14,8 @@ class ModelName(Enum): - CLAUDE_SONNET = "claude-sonnet-4-20250514" + """Enum representing supported LLM model identifiers.""" + CLAUDE_SONNET = "claude-sonnet-4-6" GPT_4O = "gpt-4o" GPT_4O_MINI = "gpt-4o-mini" @@ -30,6 +31,7 @@ class ModelName(Enum): @dataclass class RequestLog: + """Dataclass for logging comprehensive information about each LLM request.""" request_id: str user_id: str timestamp: str @@ -48,6 +50,7 @@ class RequestLog: @dataclass class CostTracker: + """Class for tracking input/output tokens and cost metrics globally and per user/model.""" total_input_tokens: int = 0 total_output_tokens: int = 0 total_cost_usd: float = 0.0 @@ -57,6 +60,7 @@ class CostTracker: cost_by_model: dict = field(default_factory=lambda: defaultdict(float)) def record(self, user_id, model, input_tokens, output_tokens, cost): + """Record the token usage and cost for a successful LLM request.""" self.total_input_tokens += input_tokens self.total_output_tokens += output_tokens self.total_cost_usd += cost @@ -65,6 +69,7 @@ def record(self, user_id, model, input_tokens, output_tokens, cost): self.cost_by_model[model] += cost def summary(self): + """Generate a dictionary summary of aggregated cost, request counts, and top users.""" avg_cost = self.total_cost_usd / max(self.total_requests, 1) cache_rate = self.total_cache_hits / max(self.total_requests, 1) * 100 return { @@ -83,6 +88,7 @@ def summary(self): @dataclass class PromptTemplate: + """Dataclass representing a versioned prompt template with associated settings.""" name: str version: str template: str @@ -150,6 +156,7 @@ class PromptTemplate: def select_prompt(template_name, user_id, variables): + """Select and render the appropriate prompt template based on experiment traffic buckets.""" versions = PROMPT_TEMPLATES.get(template_name) if not versions: raise ValueError(f"Unknown template: {template_name}") @@ -170,6 +177,7 @@ def select_prompt(template_name, user_id, variables): def simple_embedding(text, dim=64): + """Generate a simple, deterministic pseudo-embedding vector from text using SHA-256.""" h = hashlib.sha256(text.lower().strip().encode()).hexdigest() raw = [int(h[i:i+2], 16) / 255.0 for i in range(0, min(len(h), dim * 2), 2)] while len(raw) < dim: @@ -181,6 +189,7 @@ def simple_embedding(text, dim=64): def cosine_similarity(a, b): + """Calculate the cosine similarity between two vectors.""" dot = sum(x * y for x, y in zip(a, b)) norm_a = math.sqrt(sum(x * x for x in a)) norm_b = math.sqrt(sum(x * x for x in b)) @@ -190,7 +199,9 @@ def cosine_similarity(a, b): class SemanticCache: + """A semantic caching system that uses pseudo-embeddings and cosine similarity.""" def __init__(self, similarity_threshold=0.92, max_entries=10000, ttl_seconds=3600): + """Initialize the SemanticCache with a similarity threshold, max size, and TTL.""" self.threshold = similarity_threshold self.max_entries = max_entries self.ttl = ttl_seconds @@ -199,6 +210,7 @@ def __init__(self, similarity_threshold=0.92, max_entries=10000, ttl_seconds=360 self.misses = 0 def get(self, query): + """Lookup a query in the cache and return the entry if it exceeds the similarity threshold.""" query_emb = simple_embedding(query) now = time.time() @@ -226,6 +238,7 @@ def get(self, query): return None def put(self, query, response): + """Put a new query and response entry into the semantic cache, evicting old entries if full.""" if len(self.entries) >= self.max_entries: self.entries.sort(key=lambda e: e["timestamp"]) self.entries = self.entries[len(self.entries) // 4:] @@ -238,6 +251,7 @@ def put(self, query, response): }) def stats(self): + """Return semantic cache metrics such as cache hits, misses, and hit rate percentage.""" total = self.hits + self.misses return { "entries": len(self.entries), @@ -275,6 +289,7 @@ def stats(self): @dataclass class GuardrailResult: + """Dataclass holding validation outcome and details from a guardrail check.""" passed: bool blocked_reason: str | None = None pii_detected: list = field(default_factory=list) @@ -282,6 +297,7 @@ class GuardrailResult: def check_input_guardrails(text): + """Check user input for injection patterns and PII, returning redaction if necessary.""" for pattern in INJECTION_PATTERNS: if re.search(pattern, text, re.IGNORECASE): return GuardrailResult( @@ -308,6 +324,7 @@ def check_input_guardrails(text): def check_output_guardrails(text): + """Verify that model output does not contain banned code or SQL syntax patterns.""" for pattern in BANNED_OUTPUT_PATTERNS: if re.search(pattern, text): return GuardrailResult( @@ -318,10 +335,12 @@ def check_output_guardrails(text): def estimate_tokens(text): + """Estimate the token count of a given text using a simple word-ratio heuristic.""" return max(1, len(text.split()) * 4 // 3) def calculate_cost(model, input_tokens, output_tokens): + """Compute the API call cost based on input/output tokens and pricing configurations.""" pricing = MODEL_PRICING.get(model, MODEL_PRICING[ModelName.GPT_4O]) input_cost = input_tokens / 1_000_000 * pricing["input"] output_cost = output_tokens / 1_000_000 * pricing["output"] @@ -357,6 +376,7 @@ def calculate_cost(model, input_tokens, output_tokens): async def call_llm_with_retry(prompt, model, max_retries=3): + """Simulate an LLM API call with retry logic and exponential backoff on connection errors.""" for attempt in range(max_retries + 1): try: failure_chance = 0.15 if attempt == 0 else 0.05 @@ -390,6 +410,7 @@ async def call_llm_with_retry(prompt, model, max_retries=3): async def call_with_fallback(prompt, preferred_model=None): + """Attempt an LLM API call cascading down the fallback chain if the preferred model fails.""" chain = list(FALLBACK_CHAIN) if preferred_model and preferred_model in chain: chain.remove(preferred_model) @@ -413,6 +434,7 @@ async def call_with_fallback(prompt, preferred_model=None): async def stream_response(text): + """Simulate streaming token delivery by yielding chunks with subtle random sleep delays.""" words = text.split() for i, word in enumerate(words): token = word if i == 0 else " " + word @@ -421,13 +443,16 @@ async def stream_response(text): class ProductionLLMService: + """Production service orchestrator integrating caching, guardrails, fallbacks, and cost tracking.""" def __init__(self): + """Initialize the ProductionLLMService with a semantic cache, cost tracker, and logs.""" self.cache = SemanticCache(similarity_threshold=0.92, ttl_seconds=3600) self.cost_tracker = CostTracker() self.request_logs = [] self.eval_results = [] async def handle_request(self, user_id, query, template_name="general_chat", variables=None): + """Handle a single LLM request by running it through input guardrails, caching, fallback LLM call, and output guardrails.""" request_id = str(uuid.uuid4())[:12] start_time = time.time() variables = variables or {} @@ -523,6 +548,7 @@ async def handle_request(self, user_id, query, template_name="general_chat", var } async def handle_streaming_request(self, user_id, query, template_name="general_chat"): + """Handle a request with streaming token delivery, aggregating the simulated stream statistics.""" result = await self.handle_request(user_id, query, template_name) if result.get("cache_hit"): return result @@ -535,6 +561,7 @@ async def handle_streaming_request(self, user_id, query, template_name="general_ return result def _blocked_response(self, request_id, user_id, template_name, guardrail_result, start_time): + """Generate a structured response for requests blocked by input guardrails.""" log = RequestLog( request_id=request_id, user_id=user_id, @@ -561,6 +588,7 @@ def _blocked_response(self, request_id, user_id, template_name, guardrail_result } def _log_eval(self, request_id, template_name, version, result, latency_ms): + """Log request details and latency for post-execution evaluation and offline analysis.""" self.eval_results.append({ "request_id": request_id, "template": template_name, @@ -572,6 +600,7 @@ def _log_eval(self, request_id, template_name, version, result, latency_ms): }) def health_check(self): + """Return the health status of the service along with cache, cost, and logging metrics.""" return { "status": "healthy", "timestamp": datetime.now(timezone.utc).isoformat(), @@ -583,6 +612,7 @@ def health_check(self): async def run_production_demo(): + """Run a comprehensive production simulation demo with normal, streaming, load, and guardrail test requests.""" service = ProductionLLMService() print("=" * 70) @@ -691,6 +721,7 @@ async def run_production_demo(): def main(): + """Run the main async loop executing the production demo simulation.""" asyncio.run(run_production_demo()) diff --git a/phases/11-llm-engineering/13-production-app/docs/en.md b/phases/11-llm-engineering/13-production-app/docs/en.md index 98ef6ffdb..1a26c6c89 100644 --- a/phases/11-llm-engineering/13-production-app/docs/en.md +++ b/phases/11-llm-engineering/13-production-app/docs/en.md @@ -143,7 +143,7 @@ Give up: return fallback response **Fallback model chain.** When your primary model is unavailable, fall through a chain: ``` -claude-sonnet-4-20250514 -> gpt-4o -> gpt-4o-mini -> cached response -> "Service temporarily unavailable" +claude-sonnet-4-6 -> gpt-4o -> gpt-4o-mini -> cached response -> "Service temporarily unavailable" ``` Each step trades quality for availability. The user always gets something. @@ -296,7 +296,7 @@ from typing import AsyncGenerator class ModelName(Enum): - CLAUDE_SONNET = "claude-sonnet-4-20250514" + CLAUDE_SONNET = "claude-sonnet-4-6" GPT_4O = "gpt-4o" GPT_4O_MINI = "gpt-4o-mini" @@ -1078,7 +1078,7 @@ Replace the simulated LLM calls with actual provider SDKs. # yield delta # # -# async def call_anthropic(prompt, model="claude-sonnet-4-20250514"): +# async def call_anthropic(prompt, model="claude-sonnet-4-6"): # client = anthropic.AsyncAnthropic() # async with client.messages.stream( # model=model,