Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@


def call_with_sdk():
"""Make an LLM API call using the official Anthropic Python SDK."""
try:
import anthropic
except ImportError:
Expand All @@ -12,7 +13,7 @@ def call_with_sdk():

client = anthropic.Anthropic()
response = client.messages.create(
model="claude-sonnet-4-20250514",
model="claude-sonnet-4-6",
max_tokens=256,
messages=[{"role": "user", "content": "What is a neural network in one sentence?"}]
)
Expand All @@ -21,6 +22,7 @@ def call_with_sdk():


def call_raw_http():
"""Make a raw HTTP API call to Anthropic's Messages endpoint using urllib."""
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
print("Set ANTHROPIC_API_KEY environment variable first")
Expand All @@ -33,7 +35,7 @@ def call_raw_http():
"anthropic-version": "2023-06-01",
}
body = json.dumps({
"model": "claude-sonnet-4-20250514",
"model": "claude-sonnet-4-6",
"max_tokens": 256,
"messages": [{"role": "user", "content": "What is a neural network in one sentence?"}],
}).encode()
Expand Down
6 changes: 3 additions & 3 deletions phases/00-setup-and-tooling/04-apis-and-keys/docs/en.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ import anthropic
client = anthropic.Anthropic()

response = client.messages.create(
model="claude-sonnet-4-20250514",
model="claude-sonnet-4-6",
max_tokens=256,
messages=[{"role": "user", "content": "What is a neural network in one sentence?"}]
)
Expand All @@ -76,7 +76,7 @@ import Anthropic from "@anthropic-ai/sdk";
const client = new Anthropic();

const response = await client.messages.create({
model: "claude-sonnet-4-20250514",
model: "claude-sonnet-4-6",
max_tokens: 256,
messages: [{ role: "user", content: "What is a neural network in one sentence?" }],
});
Expand All @@ -98,7 +98,7 @@ headers = {
"anthropic-version": "2023-06-01",
}
body = json.dumps({
"model": "claude-sonnet-4-20250514",
"model": "claude-sonnet-4-6",
"max_tokens": 256,
"messages": [{"role": "user", "content": "What is a neural network in one sentence?"}],
}).encode()
Expand Down
2 changes: 1 addition & 1 deletion phases/11-llm-engineering/06-rag/docs/en.md
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ client = anthropic.Anthropic()

def generate(prompt):
response = client.messages.create(
model="claude-sonnet-4-20250514",
model="claude-sonnet-4-6",
max_tokens=1024,
messages=[{"role": "user", "content": prompt}]
)
Expand Down
2 changes: 1 addition & 1 deletion phases/11-llm-engineering/07-advanced-rag/docs/en.md
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ client = anthropic.Anthropic()

def hyde_with_llm(query):
response = client.messages.create(
model="claude-sonnet-4-20250514",
model="claude-sonnet-4-6",
max_tokens=256,
messages=[{
"role": "user",
Expand Down
4 changes: 2 additions & 2 deletions phases/11-llm-engineering/09-function-calling/docs/en.md
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,7 @@ OpenAI returns tool calls as `response.choices[0].message.tool_calls`. Each call
# client = anthropic.Anthropic()
#
# response = client.messages.create(
# model="claude-sonnet-4-20250514",
# model="claude-sonnet-4-6",
# max_tokens=1024,
# tools=[{
# "name": "get_weather",
Expand All @@ -633,7 +633,7 @@ OpenAI returns tool calls as `response.choices[0].message.tool_calls`. Each call
# result = get_weather(**tool_block.input)
#
# final = client.messages.create(
# model="claude-sonnet-4-20250514",
# model="claude-sonnet-4-6",
# max_tokens=1024,
# tools=[...],
# messages=[
Expand Down
2 changes: 1 addition & 1 deletion phases/11-llm-engineering/10-evaluation/docs/en.md
Original file line number Diff line number Diff line change
Expand Up @@ -741,7 +741,7 @@ if __name__ == "__main__":
#
# providers:
# - openai:gpt-4o
# - anthropic:messages:claude-sonnet-4-20250514
# - anthropic:messages:claude-sonnet-4-6
#
# tests:
# - vars:
Expand Down
2 changes: 1 addition & 1 deletion phases/11-llm-engineering/11-caching-cost/docs/en.md
Original file line number Diff line number Diff line change
Expand Up @@ -758,7 +758,7 @@ if __name__ == "__main__":
# client = anthropic.Anthropic()
#
# response = client.messages.create(
# model="claude-sonnet-4-20250514",
# model="claude-sonnet-4-6",
# max_tokens=1024,
# system=[
# {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@


class ModelName(Enum):
    """Model identifiers supported by this module.

    Each member's value is the model-ID string passed to the provider.
    Member names must be unique: ``Enum`` raises ``TypeError`` if a name is
    assigned twice in the class body, so only one ``CLAUDE_SONNET`` entry
    may exist.
    """

    CLAUDE_SONNET = "claude-sonnet-4-6"
    GPT_4O = "gpt-4o"
    GPT_4O_MINI = "gpt-4o-mini"

Expand All @@ -30,6 +31,7 @@ class ModelName(Enum):

@dataclass
class RequestLog:
"""Dataclass for logging comprehensive information about each LLM request."""
request_id: str
user_id: str
timestamp: str
Expand All @@ -48,6 +50,7 @@ class RequestLog:

@dataclass
class CostTracker:
"""Class for tracking input/output tokens and cost metrics globally and per user/model."""
total_input_tokens: int = 0
total_output_tokens: int = 0
total_cost_usd: float = 0.0
Expand All @@ -57,6 +60,7 @@ class CostTracker:
cost_by_model: dict = field(default_factory=lambda: defaultdict(float))

def record(self, user_id, model, input_tokens, output_tokens, cost):
"""Record the token usage and cost for a successful LLM request."""
self.total_input_tokens += input_tokens
self.total_output_tokens += output_tokens
self.total_cost_usd += cost
Expand All @@ -65,6 +69,7 @@ def record(self, user_id, model, input_tokens, output_tokens, cost):
self.cost_by_model[model] += cost

def summary(self):
"""Generate a dictionary summary of aggregated cost, request counts, and top users."""
avg_cost = self.total_cost_usd / max(self.total_requests, 1)
cache_rate = self.total_cache_hits / max(self.total_requests, 1) * 100
return {
Expand All @@ -83,6 +88,7 @@ def summary(self):

@dataclass
class PromptTemplate:
"""Dataclass representing a versioned prompt template with associated settings."""
name: str
version: str
template: str
Expand Down Expand Up @@ -150,6 +156,7 @@ class PromptTemplate:


def select_prompt(template_name, user_id, variables):
"""Select and render the appropriate prompt template based on experiment traffic buckets."""
versions = PROMPT_TEMPLATES.get(template_name)
if not versions:
raise ValueError(f"Unknown template: {template_name}")
Expand All @@ -170,6 +177,7 @@ def select_prompt(template_name, user_id, variables):


def simple_embedding(text, dim=64):
"""Generate a simple, deterministic pseudo-embedding vector from text using SHA-256."""
h = hashlib.sha256(text.lower().strip().encode()).hexdigest()
raw = [int(h[i:i+2], 16) / 255.0 for i in range(0, min(len(h), dim * 2), 2)]
while len(raw) < dim:
Expand All @@ -181,6 +189,7 @@ def simple_embedding(text, dim=64):


def cosine_similarity(a, b):
"""Calculate the cosine similarity between two vectors."""
dot = sum(x * y for x, y in zip(a, b))
norm_a = math.sqrt(sum(x * x for x in a))
norm_b = math.sqrt(sum(x * x for x in b))
Expand All @@ -190,7 +199,9 @@ def cosine_similarity(a, b):


class SemanticCache:
"""A semantic caching system that uses pseudo-embeddings and cosine similarity."""
def __init__(self, similarity_threshold=0.92, max_entries=10000, ttl_seconds=3600):
"""Initialize the SemanticCache with a similarity threshold, max size, and TTL."""
self.threshold = similarity_threshold
self.max_entries = max_entries
self.ttl = ttl_seconds
Expand All @@ -199,6 +210,7 @@ def __init__(self, similarity_threshold=0.92, max_entries=10000, ttl_seconds=360
self.misses = 0

def get(self, query):
"""Lookup a query in the cache and return the entry if it exceeds the similarity threshold."""
query_emb = simple_embedding(query)
now = time.time()

Expand Down Expand Up @@ -226,6 +238,7 @@ def get(self, query):
return None

def put(self, query, response):
"""Put a new query and response entry into the semantic cache, evicting old entries if full."""
if len(self.entries) >= self.max_entries:
self.entries.sort(key=lambda e: e["timestamp"])
self.entries = self.entries[len(self.entries) // 4:]
Expand All @@ -238,6 +251,7 @@ def put(self, query, response):
})

def stats(self):
"""Return semantic cache metrics such as cache hits, misses, and hit rate percentage."""
total = self.hits + self.misses
return {
"entries": len(self.entries),
Expand Down Expand Up @@ -275,13 +289,15 @@ def stats(self):

@dataclass
class GuardrailResult:
    """Outcome of a single guardrail check.

    Only ``passed`` is required; the remaining fields default to "nothing
    to report" (``None`` or an empty list).
    """
    # True when the checked text cleared the guardrail.
    passed: bool
    # Presumably the reason the text was rejected; None by default.
    # NOTE(review): confirm against the guardrail functions that build this.
    blocked_reason: str | None = None
    # Items flagged as PII; a fresh empty list per instance.
    # NOTE(review): element type is not visible here -- confirm.
    pii_detected: list = field(default_factory=list)
    # Presumably a rewritten (e.g. redacted) version of the text; None by default.
    modified_text: str | None = None


def check_input_guardrails(text):
"""Check user input for injection patterns and PII, returning redaction if necessary."""
for pattern in INJECTION_PATTERNS:
if re.search(pattern, text, re.IGNORECASE):
return GuardrailResult(
Expand All @@ -308,6 +324,7 @@ def check_input_guardrails(text):


def check_output_guardrails(text):
"""Verify that model output does not contain banned code or SQL syntax patterns."""
for pattern in BANNED_OUTPUT_PATTERNS:
if re.search(pattern, text):
return GuardrailResult(
Expand All @@ -318,10 +335,12 @@ def check_output_guardrails(text):


def estimate_tokens(text):
    """Approximate the token count of ``text`` from its word count.

    Words are whitespace-separated; the estimate is four tokens per three
    words (integer floor) and is never less than 1.
    """
    word_count = len(text.split())
    approx = (word_count * 4) // 3
    if approx < 1:
        return 1
    return approx


def calculate_cost(model, input_tokens, output_tokens):
"""Compute the API call cost based on input/output tokens and pricing configurations."""
pricing = MODEL_PRICING.get(model, MODEL_PRICING[ModelName.GPT_4O])
input_cost = input_tokens / 1_000_000 * pricing["input"]
output_cost = output_tokens / 1_000_000 * pricing["output"]
Expand Down Expand Up @@ -357,6 +376,7 @@ def calculate_cost(model, input_tokens, output_tokens):


async def call_llm_with_retry(prompt, model, max_retries=3):
"""Simulate an LLM API call with retry logic and exponential backoff on connection errors."""
for attempt in range(max_retries + 1):
try:
failure_chance = 0.15 if attempt == 0 else 0.05
Expand Down Expand Up @@ -390,6 +410,7 @@ async def call_llm_with_retry(prompt, model, max_retries=3):


async def call_with_fallback(prompt, preferred_model=None):
"""Attempt an LLM API call cascading down the fallback chain if the preferred model fails."""
chain = list(FALLBACK_CHAIN)
if preferred_model and preferred_model in chain:
chain.remove(preferred_model)
Expand All @@ -413,6 +434,7 @@ async def call_with_fallback(prompt, preferred_model=None):


async def stream_response(text):
"""Simulate streaming token delivery by yielding chunks with subtle random sleep delays."""
words = text.split()
for i, word in enumerate(words):
token = word if i == 0 else " " + word
Expand All @@ -421,13 +443,16 @@ async def stream_response(text):


class ProductionLLMService:
"""Production service orchestrator integrating caching, guardrails, fallbacks, and cost tracking."""
def __init__(self):
    """Set up the service's cache, cost tracker, and empty log lists.

    The semantic cache is created with a 0.92 similarity threshold and a
    3600-second TTL.
    """
    semantic_cache = SemanticCache(ttl_seconds=3600, similarity_threshold=0.92)
    tracker = CostTracker()
    self.cache = semantic_cache
    self.cost_tracker = tracker
    # Per-request logs and evaluation records both start empty.
    self.request_logs, self.eval_results = [], []

async def handle_request(self, user_id, query, template_name="general_chat", variables=None):
"""Handle a single LLM request by running it through input guardrails, caching, fallback LLM call, and output guardrails."""
request_id = str(uuid.uuid4())[:12]
start_time = time.time()
variables = variables or {}
Expand Down Expand Up @@ -523,6 +548,7 @@ async def handle_request(self, user_id, query, template_name="general_chat", var
}

async def handle_streaming_request(self, user_id, query, template_name="general_chat"):
"""Handle a request with streaming token delivery, aggregating the simulated stream statistics."""
result = await self.handle_request(user_id, query, template_name)
if result.get("cache_hit"):
return result
Expand All @@ -535,6 +561,7 @@ async def handle_streaming_request(self, user_id, query, template_name="general_
return result

def _blocked_response(self, request_id, user_id, template_name, guardrail_result, start_time):
"""Generate a structured response for requests blocked by input guardrails."""
log = RequestLog(
request_id=request_id,
user_id=user_id,
Expand All @@ -561,6 +588,7 @@ def _blocked_response(self, request_id, user_id, template_name, guardrail_result
}

def _log_eval(self, request_id, template_name, version, result, latency_ms):
"""Log request details and latency for post-execution evaluation and offline analysis."""
self.eval_results.append({
"request_id": request_id,
"template": template_name,
Expand All @@ -572,6 +600,7 @@ def _log_eval(self, request_id, template_name, version, result, latency_ms):
})

def health_check(self):
"""Return the health status of the service along with cache, cost, and logging metrics."""
return {
"status": "healthy",
"timestamp": datetime.now(timezone.utc).isoformat(),
Expand All @@ -583,6 +612,7 @@ def health_check(self):


async def run_production_demo():
"""Run a comprehensive production simulation demo with normal, streaming, load, and guardrail test requests."""
service = ProductionLLMService()

print("=" * 70)
Expand Down Expand Up @@ -691,6 +721,7 @@ async def run_production_demo():


def main():
    """Synchronous entry point: run the production demo coroutine to completion."""
    demo_coroutine = run_production_demo()
    asyncio.run(demo_coroutine)


Expand Down
6 changes: 3 additions & 3 deletions phases/11-llm-engineering/13-production-app/docs/en.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ Give up: return fallback response
**Fallback model chain.** When your primary model is unavailable, fall through a chain:

```
claude-sonnet-4-20250514 -> gpt-4o -> gpt-4o-mini -> cached response -> "Service temporarily unavailable"
claude-sonnet-4-6 -> gpt-4o -> gpt-4o-mini -> cached response -> "Service temporarily unavailable"
```

Each step trades quality for availability. The user always gets something.
Expand Down Expand Up @@ -296,7 +296,7 @@ from typing import AsyncGenerator


class ModelName(Enum):
    """Model identifiers supported by this module.

    Each member's value is the model-ID string passed to the provider.
    Member names must be unique: ``Enum`` raises ``TypeError`` if a name is
    assigned twice in the class body, so only one ``CLAUDE_SONNET`` entry
    may exist.
    """

    CLAUDE_SONNET = "claude-sonnet-4-6"
    GPT_4O = "gpt-4o"
    GPT_4O_MINI = "gpt-4o-mini"

Expand Down Expand Up @@ -1078,7 +1078,7 @@ Replace the simulated LLM calls with actual provider SDKs.
# yield delta
#
#
# async def call_anthropic(prompt, model="claude-sonnet-4-20250514"):
# async def call_anthropic(prompt, model="claude-sonnet-4-6"):
# client = anthropic.AsyncAnthropic()
# async with client.messages.stream(
# model=model,
Expand Down