From 0906db27c1c01df2041f50712827479366748486 Mon Sep 17 00:00:00 2001 From: Igor Dayen Date: Mon, 25 May 2026 09:17:01 -0400 Subject: [PATCH 1/7] Tool Call Reasoning Example --- .../embabel-tool-reasoning/README.md | 68 +++ .../embabel-tool-reasoning/pom.xml | 75 +++ .../ToolCallReasoningIntegrationTest.java | 447 ++++++++++++++++++ embabel-modules/pom.xml | 1 + 4 files changed, 591 insertions(+) create mode 100644 embabel-modules/embabel-tool-reasoning/README.md create mode 100644 embabel-modules/embabel-tool-reasoning/pom.xml create mode 100644 embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java diff --git a/embabel-modules/embabel-tool-reasoning/README.md b/embabel-modules/embabel-tool-reasoning/README.md new file mode 100644 index 000000000000..a65b108923da --- /dev/null +++ b/embabel-modules/embabel-tool-reasoning/README.md @@ -0,0 +1,68 @@ +## ENVIRONMENT SETUP +``` +OPENAI_API_KEY= +``` + +## TEST EXECUTION + +```bash + mvn test -Dtest=ToolCallReasoningIntegrationTest + +``` + +## SAMPLE OUTPUT + +```bash +15:07:21.793 [main] INFO ToolCallReasoningIntegrationTest - +========== RESULT (11979 ms) ========== +Recommended: ParkingRecommendation[chosenOption=GARAGE, location=Midtown Manhattan, estimatedTotalCost=90, summary=No feasible street or metered parking was found within the required time and duration constraints. Garage parking was reserved to guarantee on-time arrival and accommodate the 3-hour meeting despite higher cost.] +Reasoning: [ThinkingBlock(content=Tool: [tool=functions.findStreetParking] +Why THIS tool: To explore the availability and location of free street parking near the client meeting area. +Information expected: The real-time availability or likelihood of securing street parking within required proximity and time. 
+Advantage over alternatives: Street parking is the lowest cost option but uncertain; assessing its availability directly helps gauge risk of delay. +Confidence in this tool selection [confidence=0.85], tagType=TAG, tagValue=tool_use_reasoning), ThinkingBlock(content=Tool: [tool=functions.findMeterParking] +Why THIS tool: To check for metered parking availability for suitable duration nearby, as it offers moderate cost and may partially fit time needs. +Information expected: Metered parking availability, pricing, and time limits relevant to the 3-hour meeting. +Advantage over alternatives: It balances cost and reliability better than street parking, useful for moderate risk tolerance. +Confidence in this tool selection [confidence=0.9], tagType=TAG, tagValue=tool_use_reasoning), ThinkingBlock(content=Tool: [tool=functions.reserveGarage] +Why THIS tool: Since street and metered parking are unavailable or impractical given the time constraint and duration, reserving guaranteed garage parking is a reliable fallback. +Information expected: Confirmation of garage parking availability and reservation to ensure on-time arrival with minimal risk. +Advantage over alternatives: Eliminates uncertainty and risk of being late by guaranteeing parking close to the meeting. +Confidence in this tool selection [confidence=0.95], tagType=TAG, tagValue=tool_use_reasoning), ThinkingBlock(content=The probe for street parking revealed no available spots within a feasible 30-minute timeframe, making it too risky given the tight schedule and meeting start time. Metered parking was also found unavailable very quickly nearby, plus it imposes a 2-hour limit which is insufficient for the 3-hour meeting. These results demonstrate that free and moderate-cost options are either not accessible or impractical due to time and duration constraints. Given these factors, reserving the garage parking emerged as the best alternative, offering guaranteed availability and eliminating the risk of lateness. 
Although it is the most expensive option ($30 per hour), it aligns best with the client's priority of punctuality and the meeting's long duration. Confidence in this recommendation is high [confidence=0.95]., tagType=TAG, tagValue=final_decision_reasoning)] + +Callback stats: + beforeLlmCall: 3 + afterLlmCall: 3 + afterToolResult: 3 + + +15:07:21.797 [main] INFO ToolCallReasoningIntegrationTest - Tools invoked: [findStreetParking, findMeterParking, reserveGarage] +15:07:21.797 [main] INFO ToolCallReasoningIntegrationTest - === TOOL CALL PATTERN ANALYSIS === +15:07:21.797 [main] INFO ToolCallReasoningIntegrationTest - Total LLM iterations: 3 +15:07:21.797 [main] INFO ToolCallReasoningIntegrationTest - Total tools called: 3 +15:07:21.797 [main] INFO ToolCallReasoningIntegrationTest - Tool result callbacks: 3 +15:07:21.797 [main] INFO ToolCallReasoningIntegrationTest - PATTERN: Multiple tools called in SAME iteration (parallel tool calls) +15:07:21.797 [main] INFO ToolCallReasoningIntegrationTest - Thinking blocks captured: 4 +15:07:21.797 [main] INFO ToolCallReasoningIntegrationTest - Block 1: tagType=TAG, tagValue=tool_use_reasoning, contentLength=475 +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Attributes: {confidence=0.85, tool=functions.findStreetParking} +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Tool: functions.findStreetParking +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Confidence: 0.85 +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Block 2: tagType=TAG, tagValue=tool_use_reasoning, contentLength=473 +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Attributes: {confidence=0.9, tool=functions.findMeterParking} +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Tool: functions.findMeterParking +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Confidence: 0.9 +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Block 3: tagType=TAG, 
tagValue=tool_use_reasoning, contentLength=515 +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Attributes: {confidence=0.95, tool=functions.reserveGarage} +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Tool: functions.reserveGarage +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Confidence: 0.95 +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Block 4: tagType=TAG, tagValue=final_decision_reasoning, contentLength=808 +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Attributes: {confidence=0.95} +15:07:21.798 [main] INFO ToolCallReasoningIntegrationTest - Confidence: 0.95 +[INFO] Tests run: 1, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 16.30 s -- in com.baeldung.embabel.agent.api.tool.loop.thinking.ToolCallReasoningIntegrationTest +[INFO] +[INFO] Results: +[INFO] +[INFO] Tests run: 1, Failures: 0, Errors: 0, Skipped: 0 + + +``` \ No newline at end of file diff --git a/embabel-modules/embabel-tool-reasoning/pom.xml b/embabel-modules/embabel-tool-reasoning/pom.xml new file mode 100644 index 000000000000..ef8178ea7623 --- /dev/null +++ b/embabel-modules/embabel-tool-reasoning/pom.xml @@ -0,0 +1,75 @@ + + + 4.0.0 + + com.baeldung + embabel-modules + 0.0.1 + + + embabel-tool-reasoning + + + 21 + 21 + UTF-8 + 0.4.0 + + + + + + + com.embabel.agent + embabel-agent-api + ${embabel-agent.version} + + + com.embabel.agent + embabel-agent-starter-shell + ${embabel-agent.version} + + + com.embabel.agent + embabel-agent-starter-openai + ${embabel-agent.version} + + + + + com.embabel.agent + embabel-agent-test-internal + ${embabel-agent.version} + test + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 21 + 21 + 21 + + + + org.springframework.boot + spring-boot-maven-plugin + + + repackage + + true + + + + + + + + \ No newline at end of file diff --git a/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java 
b/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java new file mode 100644 index 000000000000..b87e544e763e --- /dev/null +++ b/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java @@ -0,0 +1,447 @@ +/* + * + * + * Example got pattened after: + * + * https://github.com/embabel/embabel-agent-experimental/blob/main/embabel-experimental-integration-tests/src/test/java/com/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIT.java + * + * Original code (see link above)) was developed by Embabel Pty Ltd, 2026 + */ +package com.baeldung.embabel.agent.api.tool.loop.thinking; + +import com.embabel.agent.AgentTestApplication; +import com.embabel.agent.api.annotation.LlmTool; +import com.embabel.agent.api.common.Ai; +import com.embabel.agent.api.tool.callback.*; +import com.embabel.chat.Message; +import com.embabel.chat.SystemMessage; +import com.embabel.common.core.thinking.ThinkingResponse; +import org.jetbrains.annotations.NotNull; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.test.context.ActiveProfiles; + +import java.lang.reflect.Method; +import java.util.*; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Integration test demonstrating DefaultToolLoop with inspectors and transformers. + *

+ * Shows how to: + * - Create tools that fetch real data (restaurant menus via Jsoup) + * - Use ToolLoopInspector for observability + * - Use ToolLoopTransformer for System Message positioning + * - Have LLM use tools and summarize results + */ +@SpringBootTest( + classes = AgentTestApplication.class, + properties = { + "embabel.models.cheapest=gpt-4.1-mini", + "embabel.models.best=gpt-4.1-mini", + "embabel.models.default-llm=gpt-4.1-mini", + "embabel.agent.platform.llm-operations.prompts.defaultTimeout=240s", + "embabel.agent.platform.llm-operations.data-binding.fixedBackoffMillis=6000" + } +) +@ActiveProfiles("tool-reasoning") +class ToolCallReasoningIntegrationTest { + + protected final Logger logger = LoggerFactory.getLogger(getClass()); + + @Autowired + private Ai ai; + + @BeforeAll + static void setUp() { + System.setProperty("embabel.agent.shell.interactive.enabled", "false"); + } + + public record ParkingRecommendation( + Option chosenOption, // which option was selected + String location, // e.g. "Midtown Manhattan" + int estimatedTotalCost, // total expected cost + String summary // short human-readable explanation + ) { + + public enum Option { + STREET, + METER, + GARAGE + } + } + + /** + * Extract tool names from a tooling class using reflection. + * Finds all methods annotated with @LlmTool. + */ + private static List extractToolNames() { + return Arrays.stream(ParkingTooling.class.getDeclaredMethods()) + .filter(method -> method.isAnnotationPresent(LlmTool.class)) + .map(Method::getName) + .sorted() + .collect(Collectors.toList()); + } + + /** + * Extracts attributes in [key=value] format from thinking block content. + * + *

This utility parses thinking blocks for structured attributes embedded in the text. + * Attributes follow the format: [key=value] + * + *

Usage Example: + *

{@code
+     * // Get thinking block from LLM response
+     * ThinkingResponse result = ai.thinking().createObject(...);
+     * ThinkingBlock block = result.getThinkingBlocks().get(0);
+     *
+     * // Extract attributes
+     * Map<String, String> attributes = extractAttributes(block.getContent());
+     *
+     * }
+ * + *

Input Example: + *

+     * "Garage parking is best due to time constraints [confidence=0.9] and [priority=high]"
+     * 
+ * + *

Output: + *

+     * Map: {"confidence" -> "0.9", "priority" -> "high"}
+     * 
+ * + * @param thinkingContent the thinking block content containing [key=value] pairs + * @return map of extracted key-value pairs, empty if no matches found + */ + private static Map extractAttributes(String thinkingContent) { + Map attributes = new HashMap<>(); + // Pattern: [key=value] where key and value don't contain = or ] + Pattern pattern = Pattern.compile("\\[([^=]+)=([^\\]]+)\\]"); + Matcher matcher = pattern.matcher(thinkingContent); + + while (matcher.find()) { + String key = matcher.group(1).trim(); + String value = matcher.group(2).trim(); + attributes.put(key, value); + } + + return attributes; + } + + /** + * Parking Finder Tooling + */ + public class ParkingTooling { + + private final Random random = new Random(); + + @LlmTool(description = "Find free street parking. Uncertain and may take time.") + public String findStreetParking(String location, int maxMinutes) { + + boolean found = random.nextDouble() < 0.3; // low probability + + if (found) { + return "Street parking found near " + location + " (free)"; + } + return "No street parking found within " + maxMinutes + " minutes"; + } + + @LlmTool(description = "Find metered parking. Moderate cost and moderate availability. May have time limits.") + public String findMeterParking(String location, int maxMinutes) { + + boolean found = random.nextDouble() < 0.6; // medium probability + + if (found) { + return "Metered parking found near " + location + " ($5/hour, 2-hour limit)"; + } + return "No metered parking found within " + maxMinutes + " minutes"; + } + + @LlmTool(description = "Reserve guaranteed garage parking near destination.") + public String reserveGarage(String location) { + + return "Garage reserved near " + location + " ($30/hour, guaranteed)"; + } + } + + + /** + * Test with multiple tool probes to verify thinking blocks accumulate correctly + * across multiple tool calls (either in same iteration or across iterations). + *

+ * This test investigates: + * - Whether multiple tools are called in the same iteration or separate iterations + * - Whether multiple tool calls in the same iteration share the same AssistantMessage + * - Whether thinking blocks accumulate correctly in both scenarios + */ + @Test + void parkingDecisionMakerWithMultiProbes() { + // Create Parking Options Tool + var tools = new ParkingTooling(); + var loggingInspector = createLoggingInspector(); + var callbackTracker = new CallbackTracker(); + + var systemMessageTransformer = new SystemMessageTransformer( + "You are a helpful decision assistant. Be concise and practical.", + """ + CRITICAL WORKFLOW - Two-phase decision process: + + === PHASE 1: Tool Selection (First Response) === + + 1. For EACH tool you plan to call, emit a SEPARATE block: + + + Tool: [tool=TOOL_NAME] + Why THIS tool: [explain why this specific tool is needed] + Information expected: [what this tool will reveal] + Advantage over alternatives: [why this tool vs others] + Confidence in this tool selection [confidence=0.XX] + + + IMPORTANT: Keep the brackets! Example: "Tool: [tool=myTool]" not "Tool: myTool" + + Since you must call at least 2 tools, you must emit at least 2 separate blocks. + + 2. Call AT LEAST TWO tools to gather comprehensive information + - You MUST call at least 2 tools to probe different aspects. + - Tools are PROBES for information gathering, not final decisions. + - Multiple probes provide better decision quality. + + === PHASE 2: Final Decision (After receiving tool results) === + + 1. Emit final decision reasoning: + + Explain: + - What each tool probe revealed + - How the probe results informed your analysis + - Why you chose this option based on probe data and constraints + - Confidence in final recommendation in format [confidence=0.XX] + + + 2. Then provide the final structured output + - Your final recommendation should synthesize insights from multiple probes. 
+ - Never copy reasoning blocks into the final structured object. + + REMINDER: One block per tool call. At least 2 tools = at least 2 blocks. Emit reasoning in BOTH phases. + """ + ); + + String prompt = """ + + Scenario: + An advisor is driving to a client meeting in Midtown Manhattan. + + Constraints: + - 30 minutes remain before the meeting starts + - arriving late is not acceptable + - the meeting is expected to last about 3 hours + + Parking options: + - Street parking: free, but uncertain + - Metered parking: $5 per hour, typically limited to 2 hours + - Garage parking: $30 per hour, guaranteed availability + + Important decision factors: + - available time before the meeting + - risk of arriving late + - trade-offs between street, metered, and garage parking + + Recommend the best parking option. + + Available tools: %s + + + """.formatted(String.join(", ", extractToolNames())); + + + long start = System.currentTimeMillis(); + ThinkingResponse result = ai.withDefaultLlm() + .withToolObject(tools) + .withToolLoopInspectors(callbackTracker, loggingInspector) + .withToolLoopTransformers(systemMessageTransformer) + .thinking().createObject(prompt, ParkingRecommendation.class); + long elapsed = System.currentTimeMillis() - start; + + logger.info(""" + + ========== RESULT ({} ms) ========== + Recommended: {} + Reasoning: {} + + Callback stats: + beforeLlmCall: {} + afterLlmCall: {} + afterToolResult: {} + + """, + elapsed, + result.getResult(), + result.getThinkingBlocks(), + callbackTracker.beforeLlmCallCount.get(), + callbackTracker.afterLlmCallCount.get(), + callbackTracker.afterToolResultCount.get() + ); + + // Assertions + + // Verify at least 2 tools were called + assertTrue(callbackTracker.toolsInvoked.size() >= 2, + "Should invoke at least 2 tools for comprehensive probing, invoked: " + callbackTracker.toolsInvoked); + logger.info("Tools invoked: {}", callbackTracker.toolsInvoked); + + // Verify thinking blocks were accumulated + 
assertFalse(result.getThinkingBlocks().isEmpty(), + "Should have accumulated thinking blocks"); + + // Log analysis of tool call pattern + int totalIterations = callbackTracker.beforeLlmCallCount.get(); + int toolsCalled = callbackTracker.toolsInvoked.size(); + int toolResultCallbacks = callbackTracker.afterToolResultCount.get(); + + logger.info("=== TOOL CALL PATTERN ANALYSIS ==="); + logger.info("Total LLM iterations: {}", totalIterations); + logger.info("Total tools called: {}", toolsCalled); + logger.info("Tool result callbacks: {}", toolResultCallbacks); + + if (toolResultCallbacks == toolsCalled && totalIterations < toolsCalled + 1) { + logger.info("PATTERN: Multiple tools called in SAME iteration (parallel tool calls)"); + } else if (totalIterations >= toolsCalled) { + logger.info("PATTERN: Tools called across SEPARATE iterations (sequential tool calls)"); + } + + logger.info("Thinking blocks captured: {}", result.getThinkingBlocks().size()); + for (int i = 0; i < result.getThinkingBlocks().size(); i++) { + var block = result.getThinkingBlocks().get(i); + logger.info(" Block {}: tagType={}, tagValue={}, contentLength={}", + i + 1, block.getTagType(), block.getTagValue(), block.getContent().length()); + + // Extract and validate attributes as Map + Map attributes = extractAttributes(block.getContent()); + if (!attributes.isEmpty()) { + logger.info(" Attributes: {}", attributes); + + // Validate and log tool attribute + if (block.getTagValue().equals("tool_use_reasoning")) { + if (attributes.containsKey("tool")) { + logger.info(" Tool: {}", attributes.get("tool")); + } else { + logger.warn(" Missing [tool=...] 
attribute in tool_use_reasoning block"); + } + } + + // Validate and log confidence attribute + if (attributes.containsKey("confidence")) { + String confidence = attributes.get("confidence"); + logger.info(" Confidence: {}", confidence); + try { + double confidenceValue = Double.parseDouble(confidence); + if (confidenceValue < 0.0 || confidenceValue > 1.0) { + logger.warn(" Confidence value {} is out of range [0.0, 1.0]", confidenceValue); + } + } catch (NumberFormatException e) { + logger.warn(" Invalid confidence value: {}", confidence); + } + } else { + logger.warn(" Missing [confidence=...] attribute in block"); + } + } else { + logger.warn(" No attributes found in block (tagValue={})", block.getTagValue()); + } + } + } + + + /** + * Transformer that adds system messages after existing system messages but before user messages. + */ + static class SystemMessageTransformer implements ToolLoopTransformer { + private final List systemMessages; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + SystemMessageTransformer(List systemMessages) { + this.systemMessages = systemMessages; + } + + SystemMessageTransformer(String... 
systemMessages) { + this.systemMessages = List.of(systemMessages); + } + + @NotNull + @Override + public List transformBeforeLlmCall(@NotNull BeforeLlmCallContext context) { + logger.info("Adding {} system message(s) before LLM call (iteration {})", + systemMessages.size(), context.getIteration()); + var history = new ArrayList<>(context.getHistory()); + + // Find the last SystemMessage index + int lastSystemMessageIndex = -1; + for (int i = 0; i < history.size(); i++) { + if (history.get(i) instanceof SystemMessage) { + lastSystemMessageIndex = i; + } + } + + // Insert after last SystemMessage, or at beginning if none exist + int insertIndex = lastSystemMessageIndex + 1; + for (String content : systemMessages) { + history.add(insertIndex++, new SystemMessage(content)); + } + + return history; + } + } + + /** + * Custom inspector that tracks callback invocations for testing. + */ + static class CallbackTracker implements ToolLoopInspector { + final AtomicInteger beforeLlmCallCount = new AtomicInteger(); + final AtomicInteger afterLlmCallCount = new AtomicInteger(); + final AtomicInteger afterToolResultCount = new AtomicInteger(); + final List toolsInvoked = new ArrayList<>(); + + protected final Logger logger = LoggerFactory.getLogger(getClass()); + + @Override + public void beforeLlmCall(@NotNull BeforeLlmCallContext context) { + beforeLlmCallCount.incrementAndGet(); + var threadName = Thread.currentThread().getName(); + logger.info("Before LLM Call Thread {}", threadName); + } + + @Override + public void afterLlmCall(@NotNull AfterLlmCallContext context) { + afterLlmCallCount.incrementAndGet(); + } + + @Override + public void afterToolResult(@NotNull AfterToolResultContext context) { + afterToolResultCount.incrementAndGet(); + synchronized (toolsInvoked) { + toolsInvoked.add(context.getToolCall().getName()); + } + } + } + + /** + * Create a logging inspector with INFO level. 
+ */ + protected ToolLoopLoggingInspector createLoggingInspector() { + return new ToolLoopLoggingInspector( + LogLevel.INFO, + LoggerFactory.getLogger(ToolLoopLoggingInspector.class) + ); + } + + +} diff --git a/embabel-modules/pom.xml b/embabel-modules/pom.xml index 8b0b11bec684..258f429366b5 100644 --- a/embabel-modules/pom.xml +++ b/embabel-modules/pom.xml @@ -17,6 +17,7 @@ embabel-quiz-generator + embabel-tool-reasoning \ No newline at end of file From abb9f8acc4bfa173e45f738aaeddca6845ce170e Mon Sep 17 00:00:00 2001 From: Igor Dayen Date: Thu, 25 Jun 2026 17:07:22 -0400 Subject: [PATCH 2/7] Implement reviewer's feedback --- .../embabel-tool-reasoning/pom.xml | 6 +- .../ToolCallReasoningIntegrationTest.java | 456 ++++++++++-------- 2 files changed, 248 insertions(+), 214 deletions(-) diff --git a/embabel-modules/embabel-tool-reasoning/pom.xml b/embabel-modules/embabel-tool-reasoning/pom.xml index ef8178ea7623..165df9d35cfc 100644 --- a/embabel-modules/embabel-tool-reasoning/pom.xml +++ b/embabel-modules/embabel-tool-reasoning/pom.xml @@ -52,9 +52,9 @@ org.apache.maven.plugins maven-compiler-plugin - 21 - 21 - 21 + ${maven.compiler.source} + ${maven.compiler.source} + ${maven.compiler.source} diff --git a/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java b/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java index b87e544e763e..f36c42d01f3a 100644 --- a/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java +++ b/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java @@ -1,11 +1,10 @@ /* * - * - * Example got pattened after: + * Example got patterned after: * * 
https://github.com/embabel/embabel-agent-experimental/blob/main/embabel-experimental-integration-tests/src/test/java/com/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIT.java * - * Original code (see link above)) was developed by Embabel Pty Ltd, 2026 + * Original code (see link above) was developed by Embabel Pty Ltd, 2026 */ package com.baeldung.embabel.agent.api.tool.loop.thinking; @@ -15,7 +14,9 @@ import com.embabel.agent.api.tool.callback.*; import com.embabel.chat.Message; import com.embabel.chat.SystemMessage; +import com.embabel.common.core.thinking.ThinkingBlock; import com.embabel.common.core.thinking.ThinkingResponse; + import org.jetbrains.annotations.NotNull; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -27,6 +28,8 @@ import java.lang.reflect.Method; import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -39,25 +42,25 @@ * Integration test demonstrating DefaultToolLoop with inspectors and transformers. *

* Shows how to: - * - Create tools that fetch real data (restaurant menus via Jsoup) + * - Create tools that simulate probing for parking options * - Use ToolLoopInspector for observability * - Use ToolLoopTransformer for System Message positioning - * - Have LLM use tools and summarize results + * - Have LLM use tools and produce a structured decision with reasoning */ @SpringBootTest( - classes = AgentTestApplication.class, - properties = { - "embabel.models.cheapest=gpt-4.1-mini", - "embabel.models.best=gpt-4.1-mini", - "embabel.models.default-llm=gpt-4.1-mini", - "embabel.agent.platform.llm-operations.prompts.defaultTimeout=240s", - "embabel.agent.platform.llm-operations.data-binding.fixedBackoffMillis=6000" - } + classes = AgentTestApplication.class, + properties = { + "embabel.models.cheapest=gpt-4.1-mini", + "embabel.models.best=gpt-4.1-mini", + "embabel.models.default-llm=gpt-4.1-mini", + "embabel.agent.platform.llm-operations.prompts.defaultTimeout=240s", + "embabel.agent.platform.llm-operations.data-binding.fixedBackoffMillis=6000" + } ) @ActiveProfiles("tool-reasoning") class ToolCallReasoningIntegrationTest { - protected final Logger logger = LoggerFactory.getLogger(getClass()); + private final Logger logger = LoggerFactory.getLogger(getClass()); @Autowired private Ai ai; @@ -68,12 +71,13 @@ static void setUp() { } public record ParkingRecommendation( - Option chosenOption, // which option was selected - String location, // e.g. "Midtown Manhattan" - int estimatedTotalCost, // total expected cost - String summary // short human-readable explanation + Option chosenOption, // which option was selected + String location, // e.g. 
"Midtown Manhattan" + int estimatedTotalCost, // total expected cost + String summary // short human-readable explanation ) { + @SuppressWarnings("unused") public enum Option { STREET, METER, @@ -87,51 +91,29 @@ public enum Option { */ private static List extractToolNames() { return Arrays.stream(ParkingTooling.class.getDeclaredMethods()) - .filter(method -> method.isAnnotationPresent(LlmTool.class)) - .map(Method::getName) - .sorted() - .collect(Collectors.toList()); + .filter(method -> method.isAnnotationPresent(LlmTool.class)) + .map(Method::getName) + .sorted() + .toList(); } + private static final Pattern ATTRIBUTE_PATTERN = Pattern.compile("\\[([^=]+)=([^]]+)]"); + /** - * Extracts attributes in [key=value] format from thinking block content. - * - *

This utility parses thinking blocks for structured attributes embedded in the text. - * Attributes follow the format: [key=value] - * - *

Usage Example: - *

{@code
-     * // Get thinking block from LLM response
-     * ThinkingResponse result = ai.thinking().createObject(...);
-     * ThinkingBlock block = result.getThinkingBlocks().get(0);
+     * Extracts {@code [key=value]} pairs from thinking block content.
      *
-     * // Extract attributes
-     * Map<String, String> attributes = extractAttributes(block.getContent());
-     *
-     * }
- * - *

Input Example: - *

-     * "Garage parking is best due to time constraints [confidence=0.9] and [priority=high]"
-     * 
- * - *

Output: - *

-     * Map: {"confidence" -> "0.9", "priority" -> "high"}
-     * 
- * - * @param thinkingContent the thinking block content containing [key=value] pairs - * @return map of extracted key-value pairs, empty if no matches found + * @param thinkingContent thinking block content to parse + * @return map of key-value pairs, empty if none found */ private static Map extractAttributes(String thinkingContent) { Map attributes = new HashMap<>(); - // Pattern: [key=value] where key and value don't contain = or ] - Pattern pattern = Pattern.compile("\\[([^=]+)=([^\\]]+)\\]"); - Matcher matcher = pattern.matcher(thinkingContent); + Matcher matcher = ATTRIBUTE_PATTERN.matcher(thinkingContent); while (matcher.find()) { - String key = matcher.group(1).trim(); - String value = matcher.group(2).trim(); + String key = matcher.group(1) + .trim(); + String value = matcher.group(2) + .trim(); attributes.put(key, value); } @@ -139,9 +121,9 @@ private static Map extractAttributes(String thinkingContent) { } /** - * Parking Finder Tooling + * Parking Finder Tooling */ - public class ParkingTooling { + static class ParkingTooling { private final Random random = new Random(); @@ -174,7 +156,6 @@ public String reserveGarage(String location) { } } - /** * Test with multiple tool probes to verify thinking blocks accumulate correctly * across multiple tool calls (either in same iteration or across iterations). @@ -185,193 +166,242 @@ public String reserveGarage(String location) { * - Whether thinking blocks accumulate correctly in both scenarios */ @Test - void parkingDecisionMakerWithMultiProbes() { - // Create Parking Options Tool + void whenUsingMultipleToolProbes_thenMakeParkingDecisions() { var tools = new ParkingTooling(); var loggingInspector = createLoggingInspector(); var callbackTracker = new CallbackTracker(); var systemMessageTransformer = new SystemMessageTransformer( - "You are a helpful decision assistant. 
Be concise and practical.", - """ - CRITICAL WORKFLOW - Two-phase decision process: - - === PHASE 1: Tool Selection (First Response) === - - 1. For EACH tool you plan to call, emit a SEPARATE block: - - - Tool: [tool=TOOL_NAME] - Why THIS tool: [explain why this specific tool is needed] - Information expected: [what this tool will reveal] - Advantage over alternatives: [why this tool vs others] - Confidence in this tool selection [confidence=0.XX] - - - IMPORTANT: Keep the brackets! Example: "Tool: [tool=myTool]" not "Tool: myTool" - - Since you must call at least 2 tools, you must emit at least 2 separate blocks. - - 2. Call AT LEAST TWO tools to gather comprehensive information - - You MUST call at least 2 tools to probe different aspects. - - Tools are PROBES for information gathering, not final decisions. - - Multiple probes provide better decision quality. - - === PHASE 2: Final Decision (After receiving tool results) === - - 1. Emit final decision reasoning: - - Explain: - - What each tool probe revealed - - How the probe results informed your analysis - - Why you chose this option based on probe data and constraints - - Confidence in final recommendation in format [confidence=0.XX] - - - 2. Then provide the final structured output - - Your final recommendation should synthesize insights from multiple probes. - - Never copy reasoning blocks into the final structured object. - - REMINDER: One block per tool call. At least 2 tools = at least 2 blocks. Emit reasoning in BOTH phases. - """ - ); - - String prompt = """ + "You are a helpful decision assistant. Be concise and practical.", + """ + CRITICAL WORKFLOW - Two-phase decision process: + + === PHASE 1: Tool Selection (First Response) === - Scenario: - An advisor is driving to a client meeting in Midtown Manhattan. + 1. 
For EACH tool you plan to call, emit a SEPARATE block: - Constraints: - - 30 minutes remain before the meeting starts - - arriving late is not acceptable - - the meeting is expected to last about 3 hours + + Tool: [TOOL_NAME] + Why THIS tool: [explain why this specific tool is needed] + Information expected: [what this tool will reveal] + Advantage over alternatives: [why this tool vs others] + Confidence in this tool selection [confidence=0.XX] + - Parking options: - - Street parking: free, but uncertain - - Metered parking: $5 per hour, typically limited to 2 hours - - Garage parking: $30 per hour, guaranteed availability + Since you must call at least 2 tools, you must emit at least 2 separate blocks. - Important decision factors: - - available time before the meeting - - risk of arriving late - - trade-offs between street, metered, and garage parking + 2. Call AT LEAST TWO tools to gather comprehensive information + - You MUST call at least 2 tools to probe different aspects. + - Tools are PROBES for information gathering, not final decisions. + - Multiple probes provide better decision quality. - Recommend the best parking option. + === PHASE 2: Final Decision (After receiving tool results) === - Available tools: %s + 1. Emit final decision reasoning: + + - What each tool probe revealed + - How the probe results informed your analysis + - Why you chose this option based on probe data and constraints + Confidence: [confidence=0.XX] + + 2. Then provide the final structured output + - Your final recommendation should synthesize insights from multiple probes. + - Never copy reasoning blocks into the final structured object. - """.formatted(String.join(", ", extractToolNames())); + REMINDER: One block per tool call. At least 2 tools = at least 2 blocks. Emit reasoning in BOTH phases. + """ + ); + String prompt = """ + + Scenario: + An advisor is driving to a client meeting in Midtown Manhattan. 
+ + Constraints: + - 30 minutes remain before the meeting starts + - arriving late is not acceptable + - the meeting is expected to last about 3 hours + + Parking options: + - Street parking: free, but uncertain + - Metered parking: $5 per hour, typically limited to 2 hours + - Garage parking: $30 per hour, guaranteed availability + + Important decision factors: + - available time before the meeting + - risk of arriving late + - trade-offs between street, metered, and garage parking + + Recommend the best parking option. + + Available tools: %s + + + """.formatted(String.join(", ", extractToolNames())); long start = System.currentTimeMillis(); ThinkingResponse result = ai.withDefaultLlm() - .withToolObject(tools) - .withToolLoopInspectors(callbackTracker, loggingInspector) - .withToolLoopTransformers(systemMessageTransformer) - .thinking().createObject(prompt, ParkingRecommendation.class); + .withToolObject(tools) + .withToolLoopInspectors(callbackTracker, loggingInspector) + .withToolLoopTransformers(systemMessageTransformer) + .thinking() + .createObject(prompt, ParkingRecommendation.class); long elapsed = System.currentTimeMillis() - start; + String formattedThinking = result.getThinkingBlocks() + .stream() + .map(Object::toString) + .collect(Collectors.joining("\n ")); + logger.info(""" - - ========== RESULT ({} ms) ========== - Recommended: {} - Reasoning: {} - - Callback stats: - beforeLlmCall: {} - afterLlmCall: {} - afterToolResult: {} - - """, - elapsed, - result.getResult(), - result.getThinkingBlocks(), - callbackTracker.beforeLlmCallCount.get(), - callbackTracker.afterLlmCallCount.get(), - callbackTracker.afterToolResultCount.get() + + ========== RESULT ({} ms) ========== + Recommended: {} + Reasoning: + {} + + Callback stats: + beforeLlmCall: {} + afterLlmCall: {} + afterToolResult: {} + + """, + elapsed, + result.getResult(), + formattedThinking, + callbackTracker.beforeLlmCallCount.get(), + callbackTracker.afterLlmCallCount.get(), + 
callbackTracker.afterToolResultCount.get() ); - // Assertions - - // Verify at least 2 tools were called - assertTrue(callbackTracker.toolsInvoked.size() >= 2, - "Should invoke at least 2 tools for comprehensive probing, invoked: " + callbackTracker.toolsInvoked); - logger.info("Tools invoked: {}", callbackTracker.toolsInvoked); + // Verify at least 2 unique tools were called + List uniqueToolsInvoked = callbackTracker.toolsInvoked.stream() + .distinct() + .toList(); + assertTrue(uniqueToolsInvoked.size() >= 2, + "Should invoke at least 2 tools for comprehensive probing, invoked: " + callbackTracker.toolsInvoked); + logger.info("Tools invoked (unique={}, total={}): {}", + uniqueToolsInvoked.size(), callbackTracker.toolsInvoked.size(), callbackTracker.toolsInvoked); // Verify thinking blocks were accumulated - assertFalse(result.getThinkingBlocks().isEmpty(), - "Should have accumulated thinking blocks"); + assertFalse(result.getThinkingBlocks() + .isEmpty(), "Should have accumulated thinking blocks"); + + // Report tool call pattern and thinking block analysis + reportToolCallPattern(callbackTracker); + reportThinkingBlocks(result.getThinkingBlocks()); + } - // Log analysis of tool call pattern - int totalIterations = callbackTracker.beforeLlmCallCount.get(); - int toolsCalled = callbackTracker.toolsInvoked.size(); + /** + * Reports tool call pattern using {@code iterationsWithToolCalls} for direct, unambiguous classification: + *
<ul> + * <li>size == 1: all tools returned by a single LLM response (parallel)</li> + * <li>size == uniqueTools: one tool per LLM response (sequential)</li> + * <li>between: some responses batched multiple tools, others returned one (mixed)</li> + * </ul>
+ * Skips pattern analysis when duplicate tool names indicate a retry — + * the tracker accumulates across all attempts so the numbers would be misleading. + */ + private void reportToolCallPattern(CallbackTracker callbackTracker) { + int totalLlmCalls = callbackTracker.beforeLlmCallCount.get(); + int totalToolsInvoked = callbackTracker.toolsInvoked.size(); int toolResultCallbacks = callbackTracker.afterToolResultCount.get(); + int iterationsWithTools = callbackTracker.iterationsWithToolCalls.size(); - logger.info("=== TOOL CALL PATTERN ANALYSIS ==="); - logger.info("Total LLM iterations: {}", totalIterations); - logger.info("Total tools called: {}", toolsCalled); - logger.info("Tool result callbacks: {}", toolResultCallbacks); + List uniqueTools = callbackTracker.toolsInvoked.stream() + .distinct() + .toList(); - if (toolResultCallbacks == toolsCalled && totalIterations < toolsCalled + 1) { - logger.info("PATTERN: Multiple tools called in SAME iteration (parallel tool calls)"); - } else if (totalIterations >= toolsCalled) { - logger.info("PATTERN: Tools called across SEPARATE iterations (sequential tool calls)"); + logger.info(""" + === TOOL CALL PATTERN ANALYSIS === + Total LLM calls: {} + Unique tools: {} + Total invocations: {} + Tool result callbacks: {} + Iterations with tool calls: {}""", + totalLlmCalls, + uniqueTools, + totalToolsInvoked, + toolResultCallbacks, + iterationsWithTools + ); + + if (totalToolsInvoked > uniqueTools.size()) { + // Duplicate names mean the tool loop restarted (LlmDataBindingProperties retry). + // Stats above span all attempts combined — pattern analysis would be misleading. 
+ logger.warn("RETRY: stats span multiple attempts — pattern analysis skipped"); + return; } - logger.info("Thinking blocks captured: {}", result.getThinkingBlocks().size()); - for (int i = 0; i < result.getThinkingBlocks().size(); i++) { - var block = result.getThinkingBlocks().get(i); - logger.info(" Block {}: tagType={}, tagValue={}, contentLength={}", - i + 1, block.getTagType(), block.getTagValue(), block.getContent().length()); + if (iterationsWithTools == 1) { + // All tool calls were returned in a single LLM response — the model batched them. + logger.info("PATTERN: All {} tools called in SAME iteration (parallel)", totalToolsInvoked); + } else if (iterationsWithTools == uniqueTools.size()) { + // Each LLM response returned exactly one tool call — fully sequential probing. + logger.info("PATTERN: One tool per iteration (sequential), {} iterations", iterationsWithTools); + } else { + // Some iterations batched multiple tools, others returned one — mixed strategy. + logger.info("PATTERN: {} tools across {} iterations (mixed batching)", + totalToolsInvoked, iterationsWithTools); + } + } + + /** + * Reports each thinking block's metadata, tool name, and {@code [confidence=...]} attribute. + * + *

<p>Tool name is extracted directly from the {@code Tool:} line of {@code tool_use_reasoning} + * blocks — works regardless of whether the LLM used bracket format or plain text. + * + * <p>
{@code [confidence=...]} is extracted via the generic {@link #extractAttributes} parser. + */ + private void reportThinkingBlocks(List thinkingBlocks) { + logger.info("Thinking blocks captured: {}", thinkingBlocks.size()); - // Extract and validate attributes as Map + for (int i = 0; i < thinkingBlocks.size(); i++) { + var block = thinkingBlocks.get(i); + logger.info(" Block {}: tagType={}, tagValue={}, contentLength={}", + i + 1, block.getTagType(), block.getTagValue(), block.getContent() + .length()); + block.getContent() + .lines() + .filter(line -> line.trim() + .startsWith("Tool:")) + .findFirst() + .map(line -> line.replaceFirst(".*Tool:\\s*", "") + .trim()) + .ifPresent(tool -> logger.info(" Tool: {}", tool)); Map attributes = extractAttributes(block.getContent()); - if (!attributes.isEmpty()) { - logger.info(" Attributes: {}", attributes); - - // Validate and log tool attribute - if (block.getTagValue().equals("tool_use_reasoning")) { - if (attributes.containsKey("tool")) { - logger.info(" Tool: {}", attributes.get("tool")); - } else { - logger.warn(" Missing [tool=...] attribute in tool_use_reasoning block"); - } - } + if (attributes.isEmpty()) { + logger.warn(" No [key=value] attributes found in block (tagValue={})", block.getTagValue()); + continue; + } + logger.info(" Attributes: {}", attributes); - // Validate and log confidence attribute - if (attributes.containsKey("confidence")) { - String confidence = attributes.get("confidence"); - logger.info(" Confidence: {}", confidence); - try { - double confidenceValue = Double.parseDouble(confidence); - if (confidenceValue < 0.0 || confidenceValue > 1.0) { - logger.warn(" Confidence value {} is out of range [0.0, 1.0]", confidenceValue); - } - } catch (NumberFormatException e) { - logger.warn(" Invalid confidence value: {}", confidence); - } - } else { - logger.warn(" Missing [confidence=...] 
attribute in block"); + String confidence = attributes.get("confidence"); + if (confidence == null) { + logger.warn(" Missing [confidence=...] attribute in block (tagValue={})", block.getTagValue()); + continue; + } + try { + double confidenceValue = Double.parseDouble(confidence); + if (confidenceValue < 0.0 || confidenceValue > 1.0) { + logger.warn(" Confidence {} out of range [0.0, 1.0]", confidenceValue); } - } else { - logger.warn(" No attributes found in block (tagValue={})", block.getTagValue()); + } catch (NumberFormatException e) { + logger.warn(" Invalid confidence value: {}", confidence); } } } - /** * Transformer that adds system messages after existing system messages but before user messages. */ static class SystemMessageTransformer implements ToolLoopTransformer { + private final List systemMessages; private final Logger logger = LoggerFactory.getLogger(getClass()); - SystemMessageTransformer(List systemMessages) { - this.systemMessages = systemMessages; - } - SystemMessageTransformer(String... systemMessages) { this.systemMessages = List.of(systemMessages); } @@ -380,7 +410,7 @@ static class SystemMessageTransformer implements ToolLoopTransformer { @Override public List transformBeforeLlmCall(@NotNull BeforeLlmCallContext context) { logger.info("Adding {} system message(s) before LLM call (iteration {})", - systemMessages.size(), context.getIteration()); + systemMessages.size(), context.getIteration()); var history = new ArrayList<>(context.getHistory()); // Find the last SystemMessage index @@ -405,18 +435,23 @@ public List transformBeforeLlmCall(@NotNull BeforeLlmCallContext contex * Custom inspector that tracks callback invocations for testing. 
*/ static class CallbackTracker implements ToolLoopInspector { + final AtomicInteger beforeLlmCallCount = new AtomicInteger(); final AtomicInteger afterLlmCallCount = new AtomicInteger(); final AtomicInteger afterToolResultCount = new AtomicInteger(); - final List toolsInvoked = new ArrayList<>(); + // CopyOnWriteArrayList for lock-free reads; writes (tool results) are infrequent + final List toolsInvoked = new CopyOnWriteArrayList<>(); + // tracks which iteration numbers had at least one tool call; size drives + // parallel (1) vs sequential (== uniqueTools) vs mixed (between) classification + final Set iterationsWithToolCalls = ConcurrentHashMap.newKeySet(); - protected final Logger logger = LoggerFactory.getLogger(getClass()); + private final Logger logger = LoggerFactory.getLogger(getClass()); @Override public void beforeLlmCall(@NotNull BeforeLlmCallContext context) { beforeLlmCallCount.incrementAndGet(); - var threadName = Thread.currentThread().getName(); - logger.info("Before LLM Call Thread {}", threadName); + logger.info("Before LLM Call Thread {}", Thread.currentThread() + .getName()); } @Override @@ -427,21 +462,20 @@ public void afterLlmCall(@NotNull AfterLlmCallContext context) { @Override public void afterToolResult(@NotNull AfterToolResultContext context) { afterToolResultCount.incrementAndGet(); - synchronized (toolsInvoked) { - toolsInvoked.add(context.getToolCall().getName()); - } + toolsInvoked.add(context.getToolCall() + .getName()); + iterationsWithToolCalls.add(context.getIteration()); } } /** * Create a logging inspector with INFO level. 
*/ - protected ToolLoopLoggingInspector createLoggingInspector() { + private ToolLoopLoggingInspector createLoggingInspector() { return new ToolLoopLoggingInspector( - LogLevel.INFO, - LoggerFactory.getLogger(ToolLoopLoggingInspector.class) + LogLevel.INFO, + LoggerFactory.getLogger(ToolLoopLoggingInspector.class) ); } - } From 98671c94e25215dace122450f6250852fe44fa71 Mon Sep 17 00:00:00 2001 From: Igor Dayen Date: Thu, 25 Jun 2026 18:05:04 -0400 Subject: [PATCH 3/7] Code formatting - eliminate blank lines --- .../ToolCallReasoningIntegrationTest.java | 23 ++++--------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java b/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java index f36c42d01f3a..dda3b39c00c7 100644 --- a/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java +++ b/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java @@ -1,7 +1,5 @@ /* - * * Example got patterned after: - * * https://github.com/embabel/embabel-agent-experimental/blob/main/embabel-experimental-integration-tests/src/test/java/com/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIT.java * * Original code (see link above) was developed by Embabel Pty Ltd, 2026 @@ -61,7 +59,6 @@ class ToolCallReasoningIntegrationTest { private final Logger logger = LoggerFactory.getLogger(getClass()); - @Autowired private Ai ai; @@ -116,7 +113,6 @@ private static Map extractAttributes(String thinkingContent) { .trim(); attributes.put(key, value); } - return attributes; } @@ -129,20 +125,17 @@ static class ParkingTooling { @LlmTool(description = "Find free street parking. 
Uncertain and may take time.") public String findStreetParking(String location, int maxMinutes) { - boolean found = random.nextDouble() < 0.3; // low probability - if (found) { return "Street parking found near " + location + " (free)"; } return "No street parking found within " + maxMinutes + " minutes"; } - @LlmTool(description = "Find metered parking. Moderate cost and moderate availability. May have time limits.") + @LlmTool(description = "Find metered parking. Moderate cost and moderate availability. " + + "May have time limits.") public String findMeterParking(String location, int maxMinutes) { - boolean found = random.nextDouble() < 0.6; // medium probability - if (found) { return "Metered parking found near " + location + " ($5/hour, 2-hour limit)"; } @@ -151,7 +144,6 @@ public String findMeterParking(String location, int maxMinutes) { @LlmTool(description = "Reserve guaranteed garage parking near destination.") public String reserveGarage(String location) { - return "Garage reserved near " + location + " ($30/hour, guaranteed)"; } } @@ -209,10 +201,10 @@ void whenUsingMultipleToolProbes_thenMakeParkingDecisions() { - Your final recommendation should synthesize insights from multiple probes. - Never copy reasoning blocks into the final structured object. - REMINDER: One block per tool call. At least 2 tools = at least 2 blocks. Emit reasoning in BOTH phases. + REMINDER: One block per tool call. At least 2 tools = at least 2 blocks. + Emit reasoning in BOTH phases. 
""" ); - String prompt = """ Scenario: @@ -253,7 +245,6 @@ void whenUsingMultipleToolProbes_thenMakeParkingDecisions() { .stream() .map(Object::toString) .collect(Collectors.joining("\n ")); - logger.info(""" ========== RESULT ({} ms) ========== @@ -274,7 +265,6 @@ void whenUsingMultipleToolProbes_thenMakeParkingDecisions() { callbackTracker.afterLlmCallCount.get(), callbackTracker.afterToolResultCount.get() ); - // Verify at least 2 unique tools were called List uniqueToolsInvoked = callbackTracker.toolsInvoked.stream() .distinct() @@ -312,7 +302,6 @@ private void reportToolCallPattern(CallbackTracker callbackTracker) { List uniqueTools = callbackTracker.toolsInvoked.stream() .distinct() .toList(); - logger.info(""" === TOOL CALL PATTERN ANALYSIS === Total LLM calls: {} @@ -326,7 +315,6 @@ private void reportToolCallPattern(CallbackTracker callbackTracker) { toolResultCallbacks, iterationsWithTools ); - if (totalToolsInvoked > uniqueTools.size()) { // Duplicate names mean the tool loop restarted (LlmDataBindingProperties retry). // Stats above span all attempts combined — pattern analysis would be misleading. 
@@ -420,13 +408,11 @@ public List transformBeforeLlmCall(@NotNull BeforeLlmCallContext contex lastSystemMessageIndex = i; } } - // Insert after last SystemMessage, or at beginning if none exist int insertIndex = lastSystemMessageIndex + 1; for (String content : systemMessages) { history.add(insertIndex++, new SystemMessage(content)); } - return history; } } @@ -477,5 +463,4 @@ private ToolLoopLoggingInspector createLoggingInspector() { LoggerFactory.getLogger(ToolLoopLoggingInspector.class) ); } - } From a380e3745f36428b63240fd865f8969b17739032 Mon Sep 17 00:00:00 2001 From: Igor Dayen Date: Mon, 29 Jun 2026 14:52:05 -0400 Subject: [PATCH 4/7] Prompt code indent alignment --- .../ToolCallReasoningIntegrationTest.java | 116 +++++++++--------- 1 file changed, 57 insertions(+), 59 deletions(-) diff --git a/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java b/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java index dda3b39c00c7..f3fc6de27470 100644 --- a/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java +++ b/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java @@ -164,72 +164,70 @@ void whenUsingMultipleToolProbes_thenMakeParkingDecisions() { var callbackTracker = new CallbackTracker(); var systemMessageTransformer = new SystemMessageTransformer( - "You are a helpful decision assistant. Be concise and practical.", +"You are a helpful decision assistant. Be concise and practical.", + """ + CRITICAL WORKFLOW - Two-phase decision process: + + === PHASE 1: Tool Selection (First Response) === + + 1. 
For EACH tool you plan to call, emit a SEPARATE block: + + + Tool: [TOOL_NAME] + Why THIS tool: [explain why this specific tool is needed] + Information expected: [what this tool will reveal] + Advantage over alternatives: [why this tool vs others] + Confidence in this tool selection [confidence=0.XX] + + + Since you must call at least 2 tools, you must emit at least 2 separate blocks. + + 2. Call AT LEAST TWO tools to gather comprehensive information + - You MUST call at least 2 tools to probe different aspects. + - Tools are PROBES for information gathering, not final decisions. + - Multiple probes provide better decision quality. + + === PHASE 2: Final Decision (After receiving tool results) === + + 1. Emit final decision reasoning: + + - What each tool probe revealed + - How the probe results informed your analysis + - Why you chose this option based on probe data and constraints + Confidence: [confidence=0.XX] + + + 2. Then provide the final structured output + - Your final recommendation should synthesize insights from multiple probes. + - Never copy reasoning blocks into the final structured object. + + REMINDER: One block per tool call. At least 2 tools = at least 2 blocks. + Emit reasoning in BOTH phases. """ - CRITICAL WORKFLOW - Two-phase decision process: - - === PHASE 1: Tool Selection (First Response) === - - 1. For EACH tool you plan to call, emit a SEPARATE block: - - - Tool: [TOOL_NAME] - Why THIS tool: [explain why this specific tool is needed] - Information expected: [what this tool will reveal] - Advantage over alternatives: [why this tool vs others] - Confidence in this tool selection [confidence=0.XX] - - - Since you must call at least 2 tools, you must emit at least 2 separate blocks. - - 2. Call AT LEAST TWO tools to gather comprehensive information - - You MUST call at least 2 tools to probe different aspects. - - Tools are PROBES for information gathering, not final decisions. - - Multiple probes provide better decision quality. 
- - === PHASE 2: Final Decision (After receiving tool results) === - - 1. Emit final decision reasoning: - - - What each tool probe revealed - - How the probe results informed your analysis - - Why you chose this option based on probe data and constraints - Confidence: [confidence=0.XX] - - - 2. Then provide the final structured output - - Your final recommendation should synthesize insights from multiple probes. - - Never copy reasoning blocks into the final structured object. - - REMINDER: One block per tool call. At least 2 tools = at least 2 blocks. - Emit reasoning in BOTH phases. - """ ); - String prompt = """ - - Scenario: - An advisor is driving to a client meeting in Midtown Manhattan. - - Constraints: - - 30 minutes remain before the meeting starts - - arriving late is not acceptable - - the meeting is expected to last about 3 hours - - Parking options: - - Street parking: free, but uncertain - - Metered parking: $5 per hour, typically limited to 2 hours - - Garage parking: $30 per hour, guaranteed availability + String prompt = + """ + Scenario: + An advisor is driving to a client meeting in Midtown Manhattan. - Important decision factors: - - available time before the meeting - - risk of arriving late - - trade-offs between street, metered, and garage parking + Constraints: + - 30 minutes remain before the meeting starts + - arriving late is not acceptable + - the meeting is expected to last about 3 hours - Recommend the best parking option. + Parking options: + - Street parking: free, but uncertain + - Metered parking: $5 per hour, typically limited to 2 hours + - Garage parking: $30 per hour, guaranteed availability - Available tools: %s + Important decision factors: + - available time before the meeting + - risk of arriving late + - trade-offs between street, metered, and garage parking + Recommend the best parking option. 
+ Available tools: %s """.formatted(String.join(", ", extractToolNames())); long start = System.currentTimeMillis(); From 51477fa8e11c16ad5cd399414be307e80d96dd47 Mon Sep 17 00:00:00 2001 From: Igor Dayen Date: Mon, 29 Jun 2026 17:31:16 -0400 Subject: [PATCH 5/7] Embabel version upgrade to 0.5.0 --- embabel-modules/embabel-tool-reasoning/pom.xml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/embabel-modules/embabel-tool-reasoning/pom.xml b/embabel-modules/embabel-tool-reasoning/pom.xml index 165df9d35cfc..a23895c2ab45 100644 --- a/embabel-modules/embabel-tool-reasoning/pom.xml +++ b/embabel-modules/embabel-tool-reasoning/pom.xml @@ -15,7 +15,7 @@ 21 21 UTF-8 - 0.4.0 + 0.5.0 @@ -25,7 +25,8 @@ com.embabel.agent embabel-agent-api ${embabel-agent.version} - + + com.embabel.agent embabel-agent-starter-shell From 099301617cd1ed3019ee33e8ff7e82fce79604b8 Mon Sep 17 00:00:00 2001 From: Igor Dayen Date: Tue, 30 Jun 2026 16:34:48 -0400 Subject: [PATCH 6/7] Polish formatting and update pom --- .../embabel-tool-reasoning/pom.xml | 15 +--- .../ToolCallReasoningIntegrationTest.java | 80 +++++++++---------- 2 files changed, 39 insertions(+), 56 deletions(-) diff --git a/embabel-modules/embabel-tool-reasoning/pom.xml b/embabel-modules/embabel-tool-reasoning/pom.xml index a23895c2ab45..cbc187b905cf 100644 --- a/embabel-modules/embabel-tool-reasoning/pom.xml +++ b/embabel-modules/embabel-tool-reasoning/pom.xml @@ -19,25 +19,12 @@ - - - - com.embabel.agent - embabel-agent-api - ${embabel-agent.version} - - - - com.embabel.agent - embabel-agent-starter-shell - ${embabel-agent.version} - + com.embabel.agent embabel-agent-starter-openai ${embabel-agent.version} - com.embabel.agent diff --git a/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java 
b/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java index f3fc6de27470..bc90c7b75025 100644 --- a/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java +++ b/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java @@ -28,6 +28,7 @@ import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -121,11 +122,9 @@ private static Map extractAttributes(String thinkingContent) { */ static class ParkingTooling { - private final Random random = new Random(); - @LlmTool(description = "Find free street parking. Uncertain and may take time.") public String findStreetParking(String location, int maxMinutes) { - boolean found = random.nextDouble() < 0.3; // low probability + boolean found = ThreadLocalRandom.current().nextDouble() < 0.3; // low probability if (found) { return "Street parking found near " + location + " (free)"; } @@ -135,7 +134,7 @@ public String findStreetParking(String location, int maxMinutes) { @LlmTool(description = "Find metered parking. Moderate cost and moderate availability. 
" + "May have time limits.") public String findMeterParking(String location, int maxMinutes) { - boolean found = random.nextDouble() < 0.6; // medium probability + boolean found = ThreadLocalRandom.current().nextDouble() < 0.6; // medium probability if (found) { return "Metered parking found near " + location + " ($5/hour, 2-hour limit)"; } @@ -164,8 +163,7 @@ void whenUsingMultipleToolProbes_thenMakeParkingDecisions() { var callbackTracker = new CallbackTracker(); var systemMessageTransformer = new SystemMessageTransformer( -"You are a helpful decision assistant. Be concise and practical.", - """ + "You are a helpful decision assistant. Be concise and practical.", """ CRITICAL WORKFLOW - Two-phase decision process: === PHASE 1: Tool Selection (First Response) === @@ -207,29 +205,28 @@ void whenUsingMultipleToolProbes_thenMakeParkingDecisions() { ); String prompt = """ - Scenario: - An advisor is driving to a client meeting in Midtown Manhattan. + Scenario: + An advisor is driving to a client meeting in Midtown Manhattan. 
- Constraints: - - 30 minutes remain before the meeting starts - - arriving late is not acceptable - - the meeting is expected to last about 3 hours + Constraints: + - 30 minutes remain before the meeting starts + - arriving late is not acceptable + - the meeting is expected to last about 3 hours - Parking options: - - Street parking: free, but uncertain - - Metered parking: $5 per hour, typically limited to 2 hours - - Garage parking: $30 per hour, guaranteed availability + Parking options: + - Street parking: free, but uncertain + - Metered parking: $5 per hour, typically limited to 2 hours + - Garage parking: $30 per hour, guaranteed availability - Important decision factors: - - available time before the meeting - - risk of arriving late - - trade-offs between street, metered, and garage parking + Important decision factors: + - available time before the meeting + - risk of arriving late + - trade-offs between street, metered, and garage parking - Recommend the best parking option. + Recommend the best parking option. 
- Available tools: %s + Available tools: %s """.formatted(String.join(", ", extractToolNames())); - long start = System.currentTimeMillis(); ThinkingResponse result = ai.withDefaultLlm() .withToolObject(tools) @@ -244,19 +241,18 @@ void whenUsingMultipleToolProbes_thenMakeParkingDecisions() { .map(Object::toString) .collect(Collectors.joining("\n ")); logger.info(""" - - ========== RESULT ({} ms) ========== - Recommended: {} - Reasoning: - {} - - Callback stats: - beforeLlmCall: {} - afterLlmCall: {} - afterToolResult: {} - - """, - elapsed, + + ========== RESULT ({} ms) ========== + Recommended: {} + Reasoning: + {} + + Callback stats: + beforeLlmCall: {} + afterLlmCall: {} + afterToolResult: {} + + """, elapsed, result.getResult(), formattedThinking, callbackTracker.beforeLlmCallCount.get(), @@ -301,12 +297,12 @@ private void reportToolCallPattern(CallbackTracker callbackTracker) { .distinct() .toList(); logger.info(""" - === TOOL CALL PATTERN ANALYSIS === - Total LLM calls: {} - Unique tools: {} - Total invocations: {} - Tool result callbacks: {} - Iterations with tool calls: {}""", + === TOOL CALL PATTERN ANALYSIS === + Total LLM calls: {} + Unique tools: {} + Total invocations: {} + Tool result callbacks: {} + Iterations with tool calls: {}""", totalLlmCalls, uniqueTools, totalToolsInvoked, From f13fa6905c51ae0b33835f1ff9b191fd8aa0720b Mon Sep 17 00:00:00 2001 From: Igor Dayen Date: Tue, 30 Jun 2026 16:44:30 -0400 Subject: [PATCH 7/7] Minor formatting inconsistencies --- .../loop/thinking/ToolCallReasoningIntegrationTest.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java b/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java index bc90c7b75025..edca8459f48e 100644 --- 
a/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java +++ b/embabel-modules/embabel-tool-reasoning/src/test/java/com/baeldung/embabel/agent/api/tool/loop/thinking/ToolCallReasoningIntegrationTest.java @@ -163,7 +163,8 @@ void whenUsingMultipleToolProbes_thenMakeParkingDecisions() { var callbackTracker = new CallbackTracker(); var systemMessageTransformer = new SystemMessageTransformer( - "You are a helpful decision assistant. Be concise and practical.", """ + "You are a helpful decision assistant. Be concise and practical.", + """ CRITICAL WORKFLOW - Two-phase decision process: === PHASE 1: Tool Selection (First Response) === @@ -252,7 +253,8 @@ void whenUsingMultipleToolProbes_thenMakeParkingDecisions() { afterLlmCall: {} afterToolResult: {} - """, elapsed, + """, + elapsed, result.getResult(), formattedThinking, callbackTracker.beforeLlmCallCount.get(),