From 7508a3c8408246e8c9c1726a52c3e72e01746140 Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Wed, 1 Jul 2026 23:39:39 +0530
Subject: [PATCH 1/3] fix(harness): change strategy on repeated unproductive
 tool results (#4089)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Agents that hit an unproductive tool result tended to retry the same
strategy (identical tool + args + error) instead of adapting. The turn
loop's only no-progress handling was RepeatedToolFailureMiddleware, which
went straight from repeated failure to a hard halt — it never fed a signal
back so the model could change approach.

Add a reusable no-progress primitive (NoProgressTracker) that tracks recent
(tool, args) -> outcome and returns a Continue / Nudge / Halt verdict via an
escalation ladder that caps same-strategy retries before forcing an
alternative:

  - identical repeat == 2 -> Nudge: inject a structured "no progress since
    step X" corrective as a user turn so the model uses a different tool or
    arguments (enumerate before reading, correct the query) or reports back
  - identical repeat >= 3 -> Halt (retries exhausted)
  - varied failures == 4 -> Nudge; >= 6 -> Halt backstop
  - hard policy-reject >= 2 -> Halt; any success resets

RepeatedToolFailureMiddleware becomes a thin driver: it feeds each tool
outcome into the tracker and translates the verdict onto the shared steering
handle (Nudge -> InjectMessage fed back before the next model call; Halt ->
halt summary + Pause). The primitive is harness-agnostic and unit-tested so
issues #4090/#4092 can layer on it.

Add unit tests for the tracker (nudge-then-halt, success reset, changed-args
clears the streak, hard-reject, varied backstop, unknown-tool exclusion,
threshold clamp) and update the middleware tests to assert the
InjectMessage(nudge) -> Pause(halt) escalation.
---
 src/openhuman/tinyagents/middleware.rs  | 247 +++++++--------
 src/openhuman/tinyagents/mod.rs         |  19 +-
 src/openhuman/tinyagents/no_progress.rs | 381 ++++++++++++++++++++++++
 3 files changed, 498 insertions(+), 149 deletions(-)
 create mode 100644 src/openhuman/tinyagents/no_progress.rs

diff --git a/src/openhuman/tinyagents/middleware.rs b/src/openhuman/tinyagents/middleware.rs
index a6ea26b9dd..6a9fc402f1 100644
--- a/src/openhuman/tinyagents/middleware.rs
+++ b/src/openhuman/tinyagents/middleware.rs
@@ -1052,57 +1052,43 @@ impl Middleware<()> for CostBudgetMiddleware {
     }
 }
 
-/// Consecutive **any**-failure no-progress backstop: different commands all
-/// failing means the goal is unreachable here. Matches the legacy
-/// `NO_PROGRESS_FAILURE_THRESHOLD`.
-const NO_PROGRESS_FAILURE_THRESHOLD: usize = 6;
-/// Consecutive **identical** hard-policy-rejection repeats before halting — a
-/// blocked call re-issued unchanged can never succeed. Legacy
-/// `HARD_REJECT_REPEAT_THRESHOLD`.
-const HARD_REJECT_REPEAT_THRESHOLD: usize = 2;
-
-/// `after_tool`: stop the run when tool calls keep failing with no progress
-/// (issue #4249). The legacy tool loop's progress guard surfaced a root-cause
-/// halt summary — a security/approval denial re-issued unchanged, an identical
-/// error retried, or *different* commands all failing — instead of burning the
-/// whole iteration budget and ending on a generic cap error. The tinyagents path
-/// kept only the model/tool call caps, so this reinstates the guard as a graph
-/// middleware. Three halt conditions, checked per failure (any success resets
-/// every counter — progress was made):
+/// `after_tool`: drive the no-progress escalation ladder (issues #4249, #4089).
+/// The legacy tool loop's progress guard surfaced a root-cause halt summary — a
+/// security/approval denial re-issued unchanged, an identical error retried, or
+/// *different* commands all failing — instead of burning the whole iteration
+/// budget and ending on a generic cap error. The tinyagents path kept only the
+/// model/tool call caps, so this reinstates the guard as a graph middleware.
 ///
-/// 1. **Hard policy rejection** (`[policy-blocked]`) repeated `HARD_REJECT_REPEAT_THRESHOLD`
-///    times with an identical signature — "blocked by the security policy … re-issued".
-/// 2. **Identical** error signature repeated `identical_threshold` times —
-///    "retried N times with identical arguments".
-/// 3. **Any** failure `NO_PROGRESS_FAILURE_THRESHOLD` times in a row (even with
-///    varied errors) — "N tool calls in a row failed".
+/// The detection + escalation live in the reusable
+/// [`NoProgressTracker`](super::no_progress::NoProgressTracker); this middleware
+/// is the thin driver that feeds each tool outcome in and translates the
+/// returned [`NoProgress`](super::no_progress::NoProgress) verdict onto the
+/// shared steering handle:
 ///
-/// On trip it records a root-cause summary into the shared [`HaltSummarySlot`]
-/// (the turn overrides its final text with it) and pauses the run via the shared
-/// steering handle (same mechanism as the stop-hook / cap pausers).
+/// - [`NoProgress::Nudge`] — a repeated identical failure below the retry cap:
+///   inject a structured "no progress since step X" corrective as a user turn so
+///   the model changes strategy on its next step (#4089). The loop continues.
+/// - [`NoProgress::Halt`] — same-strategy retries exhausted (or the any-failure
+///   backstop tripped): record the root-cause summary into the shared
+///   [`HaltSummarySlot`] (the turn overrides its final text with it) and pause
+///   the run via the steering handle (same mechanism as the stop-hook / cap
+///   pausers).
 pub struct RepeatedToolFailureMiddleware {
     handle: SteeringHandle,
-    identical_threshold: usize,
     halt_summary: super::HaltSummarySlot,
-    state: std::sync::Mutex<FailureState>,
+    tracker: super::no_progress::NoProgressTracker,
     /// call_id → argument fingerprint, captured in `before_tool` (the tool result
     /// carries no arguments). Folded into the identical-repeat signature so the
-    /// "identical arguments" halt only trips on the *same* args — two different
+    /// "identical arguments" ladder only trips on the *same* args — two different
     /// argument sets that happen to share a first error line don't count as a
     /// repeat and can't pre-empt the generic no-progress backstop.
     arg_sigs: std::sync::Mutex<std::collections::HashMap<String, String>>,
 }
 
-#[derive(Default)]
-struct FailureState {
-    last_sig: Option<String>,
-    same_count: usize,
-    consecutive: usize,
-}
-
 impl RepeatedToolFailureMiddleware {
-    /// Build the breaker. `identical_threshold` (the identical-signature retry
-    /// ceiling) is clamped to at least 2 — a single failure is never a loop.
+    /// Build the breaker. `identical_threshold` is the same-strategy retry cap
+    /// (identical tool + args + error), clamped by the tracker so a nudge always
+    /// precedes the halt.
     pub fn new(
         handle: SteeringHandle,
         identical_threshold: usize,
@@ -1110,9 +1096,8 @@ impl RepeatedToolFailureMiddleware {
     ) -> Self {
         Self {
             handle,
-            identical_threshold: identical_threshold.max(2),
             halt_summary,
-            state: std::sync::Mutex::new(FailureState::default()),
+            tracker: super::no_progress::NoProgressTracker::new(identical_threshold),
             arg_sigs: std::sync::Mutex::new(std::collections::HashMap::new()),
         }
     }
@@ -1120,20 +1105,13 @@ impl RepeatedToolFailureMiddleware {
 
 /// A stable, bounded fingerprint of a tool call's arguments for the identical-
 /// repeat signature (hashed so a huge payload doesn't bloat the map/comparison).
-fn args_fingerprint(arguments: &serde_json::Value) -> String {
+pub(super) fn args_fingerprint(arguments: &serde_json::Value) -> String {
     use std::hash::{Hash, Hasher};
     let mut hasher = std::collections::hash_map::DefaultHasher::new();
     arguments.to_string().hash(&mut hasher);
     format!("{:x}", hasher.finish())
 }
 
-/// Trim a tool error for inclusion in a halt summary (keep it bounded but retain
-/// the deterministic leading detail the model/user needs).
-fn truncate_for_halt(text: &str) -> String {
-    const MAX: usize = 600;
-    crate::openhuman::util::truncate_with_ellipsis(text, MAX)
-}
-
 #[async_trait]
 impl Middleware<()> for RepeatedToolFailureMiddleware {
     fn name(&self) -> &str {
@@ -1156,102 +1134,67 @@ impl Middleware<()> for RepeatedToolFailureMiddleware {
 
     async fn after_tool(
         &self,
-        _ctx: &mut RunContext<()>,
+        ctx: &mut RunContext<()>,
         _state: &(),
         result: &mut TaToolResult,
     ) -> TaResult<()> {
-        let mut state = self.state.lock().unwrap();
         let arg_fp = self
             .arg_sigs
             .lock()
             .ok()
             .and_then(|mut sigs| sigs.remove(&result.call_id))
             .unwrap_or_default();
-        let Some(err) = result.error.as_deref() else {
-            // Success → progress was made; reset every counter.
-            *state = FailureState::default();
-            return Ok(());
-        };
-
-        // Signature: tool name + argument fingerprint + first error line (the
-        // deterministic parts; a huge payload tail must not dominate the
-        // identical-repeat comparison). Including the args means the "identical
-        // arguments" halt only fires when the args truly repeat.
-        let err_line = err.lines().next().unwrap_or(err);
-        let sig = format!("{}\u{1f}{arg_fp}\u{1f}{err_line}", result.name);
-        // The unknown-tool recovery is a failure the model can correct (it got the
-        // "unknown tool" feedback), so it must NOT feed the generic *any*-failure
-        // no-progress counter — else a turn that recovers from one bad tool name
-        // and then legitimately exhausts its iteration budget would trip the
-        // backstop instead of hitting the cap. It still feeds the *identical*-repeat
-        // counter, so re-issuing the SAME unavailable tool halts with a root cause.
-        if result.name != UNKNOWN_TOOL_SENTINEL {
-            state.consecutive += 1;
-        }
-        let same_count = match &state.last_sig {
-            Some(prev) if *prev == sig => {
-                state.same_count += 1;
-                state.same_count
-            }
-            _ => {
-                state.last_sig = Some(sig);
-                state.same_count = 1;
-                1
-            }
-        };
 
         // A hard policy rejection is marked in the tool output; it can never
-        // succeed when re-issued unchanged, so it trips faster.
-        let is_hard_reject = result
+        // succeed when re-issued unchanged, so the ladder trips it faster.
+        let hard_reject = result
             .content
             .contains(crate::openhuman::security::POLICY_BLOCKED_MARKER)
-            || err.contains(crate::openhuman::security::POLICY_BLOCKED_MARKER);
-
-        let summary = if is_hard_reject && same_count >= HARD_REJECT_REPEAT_THRESHOLD {
-            Some(format!(
-                "Stopping: the `{}` call is blocked by the security policy and was re-issued with \
-                 identical arguments — it can never succeed this way. Reason:\n{}\n\nDo not repeat \
-                 this call; use an allowed alternative or report that it can't be done here.",
-                result.name,
-                truncate_for_halt(err),
-            ))
-        } else if same_count >= self.identical_threshold {
-            Some(format!(
-                "Stopping: the `{}` call was retried {same_count} times with identical arguments \
-                 and kept failing — repeating it will not help. Last error:\n{}\n\nThis looks \
-                 unrecoverable in the current environment. Report this back instead of retrying.",
-                result.name,
-                truncate_for_halt(err),
-            ))
-        } else if state.consecutive >= NO_PROGRESS_FAILURE_THRESHOLD {
-            Some(format!(
-                "Stopping: {} tool calls in a row failed with no progress. Last error (from \
-                 `{}`):\n{}\n\nDifferent commands are all failing — the goal looks unreachable in \
-                 this environment. Report this back instead of retrying.",
-                state.consecutive,
-                result.name,
-                truncate_for_halt(err),
-            ))
-        } else {
-            None
+            || result
+                .error
+                .as_deref()
+                .is_some_and(|e| e.contains(crate::openhuman::security::POLICY_BLOCKED_MARKER));
+
+        let attempt = super::no_progress::ToolAttempt {
+            tool: &result.name,
+            arg_fingerprint: &arg_fp,
+            error: result.error.as_deref(),
+            hard_reject,
+            recoverable_miss: result.name == UNKNOWN_TOOL_SENTINEL,
         };
+        // The model-call count doubles as the loop "step" for the no-progress
+        // wording ("no progress since step X").
+        let step = ctx.limits.model_calls();
 
-        if let Some(summary) = summary {
-            tracing::warn!(
-                tool = %result.name,
-                consecutive = state.consecutive,
-                same_count,
-                is_hard_reject,
-                "[tinyagents::mw] repeated tool failure — halting run so the root cause surfaces"
-            );
-            if let Ok(mut slot) = self.halt_summary.lock() {
-                *slot = Some(summary);
+        match self.tracker.record(step, &attempt) {
+            super::no_progress::NoProgress::Continue => {}
+            super::no_progress::NoProgress::Nudge(signal) => {
+                tracing::info!(
+                    tool = %result.name,
+                    step,
+                    "[tinyagents::mw] no progress — nudging the model to change strategy"
+                );
+                // Feed the corrective back into the loop as a user turn; the
+                // harness applies it at the next steering checkpoint (before the
+                // next model call), so the model sees it before it acts again.
+                self.handle
+                    .send(SteeringCommand::InjectMessage(TaMessage::user(signal)));
+            }
+            super::no_progress::NoProgress::Halt(summary) => {
+                tracing::warn!(
+                    tool = %result.name,
+                    step,
+                    hard_reject,
+                    "[tinyagents::mw] repeated tool failure — halting run so the root cause surfaces"
+                );
+                if let Ok(mut slot) = self.halt_summary.lock() {
+                    *slot = Some(summary);
+                }
+                // Pause at the top of the next iteration (before the next model
+                // call), matching the stop-hook / cap pause path. The tracker
+                // already reset its state on the halt.
+                self.handle.send(SteeringCommand::Pause);
             }
-            // Pause at the top of the next iteration (before the next model call),
-            // matching the stop-hook / cap pause path. Reset so a resumed run does
-            // not immediately re-pause on the same latched state.
-            self.handle.send(SteeringCommand::Pause);
-            *state = FailureState::default();
         }
         Ok(())
     }
@@ -1596,50 +1539,67 @@ mod tests {
         r
     }
 
+    /// Kinds of the steering commands currently queued on `handle`, drained.
+    fn drained_kinds(handle: &SteeringHandle) -> Vec<tinyagents::harness::steering::SteeringCommandKind> {
+        handle.drain().iter().map(|c| c.kind()).collect()
+    }
+
     #[tokio::test]
-    async fn repeated_tool_failure_pauses_only_after_the_threshold() {
+    async fn repeated_identical_failure_nudges_then_pauses() {
+        use tinyagents::harness::steering::SteeringCommandKind;
         let handle = SteeringHandle::allow_all();
         let mw = RepeatedToolFailureMiddleware::new(
             handle.clone(),
             3,
             std::sync::Arc::new(std::sync::Mutex::new(None)),
         );
-        // Two identical failures: below the threshold, no pause.
-        for _ in 0..2 {
-            let mut r = failing_result("flaky", "boom");
-            mw.after_tool(&mut ctx(), &(), &mut r).await.unwrap();
-        }
-        assert_eq!(handle.pending(), 0, "no pause before the threshold");
-        // Third identical failure trips the breaker.
+        // First identical failure: nothing yet.
         let mut r = failing_result("flaky", "boom");
         mw.after_tool(&mut ctx(), &(), &mut r).await.unwrap();
-        assert!(
-            handle.pending() >= 1,
+        assert_eq!(drained_kinds(&handle), Vec::new(), "no signal on the first failure");
+        // Second: a "no progress" nudge feeds back into the loop — not a pause.
+        let mut r = failing_result("flaky", "boom");
+        mw.after_tool(&mut ctx(), &(), &mut r).await.unwrap();
+        assert_eq!(
+            drained_kinds(&handle),
+            vec![SteeringCommandKind::InjectMessage],
+            "the second identical failure should nudge, not pause"
+        );
+        // Third: same-strategy retries exhausted → pause the run.
+        let mut r = failing_result("flaky", "boom");
+        mw.after_tool(&mut ctx(), &(), &mut r).await.unwrap();
+        assert_eq!(
+            drained_kinds(&handle),
+            vec![SteeringCommandKind::Pause],
             "the third identical failure should pause the run"
         );
     }
 
     #[tokio::test]
     async fn repeated_tool_failure_resets_on_a_success() {
+        use tinyagents::harness::steering::SteeringCommandKind;
         let handle = SteeringHandle::allow_all();
         let mw = RepeatedToolFailureMiddleware::new(
             handle.clone(),
             3,
             std::sync::Arc::new(std::sync::Mutex::new(None)),
         );
-        // Two failures, then a success clears the counter.
+        // Two failures (the second nudges), then a success clears the counter.
         for _ in 0..2 {
             let mut r = failing_result("t", "boom");
             mw.after_tool(&mut ctx(), &(), &mut r).await.unwrap();
         }
         let mut ok = tool_result("t", "fine"); // error = None
         mw.after_tool(&mut ctx(), &(), &mut ok).await.unwrap();
-        // Two more failures — still below the threshold because the counter reset.
+        // Two more failures — the counter reset, so no Pause fires (only a nudge).
         for _ in 0..2 {
             let mut r = failing_result("t", "boom");
             mw.after_tool(&mut ctx(), &(), &mut r).await.unwrap();
         }
-        assert_eq!(handle.pending(), 0, "a success should reset the breaker");
+        assert!(
+            !drained_kinds(&handle).contains(&SteeringCommandKind::Pause),
+            "a success should reset the breaker so it never halts"
+        );
     }
 
     #[tokio::test]
@@ -1651,7 +1611,8 @@ mod tests {
             std::sync::Arc::new(std::sync::Mutex::new(None)),
         );
         // Three *different* errors never trip the breaker — only an identical,
-        // deterministic failure loop does.
+        // deterministic failure loop does (and the varied-failure nudge only
+        // fires at four in a row).
         for err in ["e1", "e2", "e3"] {
             let mut r = failing_result("t", err);
             mw.after_tool(&mut ctx(), &(), &mut r).await.unwrap();
@@ -1659,7 +1620,7 @@ mod tests {
         assert_eq!(
             handle.pending(),
             0,
-            "distinct errors must not trip the breaker"
+            "distinct errors must not trip the breaker this early"
         );
     }
 
diff --git a/src/openhuman/tinyagents/mod.rs b/src/openhuman/tinyagents/mod.rs
index a41937ff8d..7c2f056449 100644
--- a/src/openhuman/tinyagents/mod.rs
+++ b/src/openhuman/tinyagents/mod.rs
@@ -23,6 +23,7 @@ mod convert;
 pub mod delegation;
 pub mod middleware;
 mod model;
+pub mod no_progress;
 pub mod observability;
 pub mod orchestration;
 pub mod stop_hooks;
@@ -121,8 +122,10 @@ fn run_policy_for(max_iterations: usize) -> RunPolicy {
 
 /// Consecutive identical tool failures that trip the repeated-failure circuit
 /// breaker (see `middleware::RepeatedToolFailureMiddleware`). Three matches the
-/// legacy progress-guard's tolerance before it halted a stuck loop.
-const REPEATED_TOOL_FAILURE_THRESHOLD: usize = 3;
+/// legacy progress-guard's tolerance before it halted a stuck loop; the breaker
+/// nudges the model to change strategy one step earlier (see
+/// [`no_progress::NoProgressTracker`]).
+const REPEATED_TOOL_FAILURE_THRESHOLD: usize = no_progress::DEFAULT_IDENTICAL_HALT_THRESHOLD;
 
 /// Legacy default model-call cap used when a caller passes `max_iterations == 0`
 /// to request "unset" (native-bus / test callers relied on the old loop treating
@@ -450,10 +453,14 @@ pub async fn run_turn_via_tinyagents_shared(
     // handle is a no-op — the loop just drains an empty steering channel.
     let handle = Some(SteeringHandle::allow_all());
 
-    // Repeated-failure circuit breaker: pause the run when a tool returns the same
-    // error `REPEATED_TOOL_FAILURE_THRESHOLD` times in a row, so a deterministic
-    // security/approval denial or terminal tool error surfaces its root cause
-    // instead of burning the whole iteration budget (legacy ProgressGuard parity).
+    // Repeated-failure circuit breaker (issue #4089): on a repeated identical
+    // failure it first injects a structured "no progress since step X" nudge so
+    // the model changes strategy, and only pauses the run once same-strategy
+    // retries are exhausted (`REPEATED_TOOL_FAILURE_THRESHOLD`) — so a
+    // deterministic security/approval denial or terminal tool error surfaces its
+    // root cause instead of burning the whole iteration budget (legacy
+    // ProgressGuard parity). The escalation ladder lives in
+    // `no_progress::NoProgressTracker`.
     let halt_summary: HaltSummarySlot = std::sync::Arc::new(std::sync::Mutex::new(None));
     if let Some(handle) = &handle {
         harness.push_middleware(Arc::new(middleware::RepeatedToolFailureMiddleware::new(
diff --git a/src/openhuman/tinyagents/no_progress.rs b/src/openhuman/tinyagents/no_progress.rs
new file mode 100644
index 0000000000..6b50a19088
--- /dev/null
+++ b/src/openhuman/tinyagents/no_progress.rs
@@ -0,0 +1,381 @@
+//! Foundational no-progress primitive (issue #4089).
+//!
+//! Agents that hit an unproductive tool result tend to *retry the same
+//! strategy* — the identical tool with the identical arguments — instead of
+//! adapting. This module is the reusable detector that breaks that pattern: it
+//! tracks recent `(tool, args) → outcome` across a turn and, on each failure,
+//! decides whether the loop is still making progress, should be **nudged** to
+//! change approach, or has exhausted its same-strategy retries and must
+//! **halt**.
+//!
+//! The escalation ladder caps same-strategy retries *before* giving up: a
+//! repeated identical failure first feeds a structured "no progress since step
+//! X" signal back into the loop (a nudge) so the model picks a *different* next
+//! action, and only halts if it keeps re-issuing the same failing call.
+//!
+//! It is deliberately free of `tinyagents` harness types so it can be unit
+//! tested in isolation and reused by the reliability work layering on top of it
+//! (self-suspend poll loops #4090, escalate-after-N-iterations #4092). The
+//! [`RepeatedToolFailureMiddleware`](super::middleware::RepeatedToolFailureMiddleware)
+//! is its first driver: it feeds each tool outcome in and turns the returned
+//! [`NoProgress`] verdict into a steering `InjectMessage` (nudge) or `Pause`
+//! (halt).
+
+use std::sync::Mutex;
+
+/// Consecutive **identical** (tool + args + error) failures tolerated before the
+/// ladder halts the run — a call re-issued unchanged that keeps failing can
+/// never succeed. Matches the legacy progress-guard tolerance.
+pub const DEFAULT_IDENTICAL_HALT_THRESHOLD: usize = 3;
+/// Identical repeats that trigger the **nudge** — one below the halt threshold,
+/// so the model gets exactly one corrective chance to change strategy before the
+/// same-strategy retry cap trips.
+const IDENTICAL_NUDGE_THRESHOLD: usize = 2;
+/// Consecutive **any**-failure no-progress backstop: different commands all
+/// failing means the goal is unreachable here. Legacy `NO_PROGRESS_FAILURE_THRESHOLD`.
+const NO_PROGRESS_HALT_THRESHOLD: usize = 6;
+/// Consecutive varied failures that trigger the **nudge** before the any-failure
+/// backstop halts.
+const NO_PROGRESS_NUDGE_THRESHOLD: usize = 4;
+/// Consecutive identical **hard policy rejections** before halting — a blocked
+/// call re-issued unchanged can never succeed. Legacy `HARD_REJECT_REPEAT_THRESHOLD`.
+const HARD_REJECT_HALT_THRESHOLD: usize = 2;
+
+/// One tool call's outcome, reduced to the deterministic parts the ladder
+/// compares. Built by the driver from a tool result plus the argument
+/// fingerprint captured before execution.
+pub struct ToolAttempt<'a> {
+    /// Tool name.
+    pub tool: &'a str,
+    /// Stable fingerprint of the call arguments (see
+    /// [`args_fingerprint`](super::middleware::args_fingerprint)). Folded into
+    /// the identical-repeat signature so the "identical arguments" ladder only
+    /// trips when the args truly repeat.
+    pub arg_fingerprint: &'a str,
+    /// `None` on success; otherwise the tool's error text.
+    pub error: Option<&'a str>,
+    /// `true` when the result is a hard security/approval rejection that can
+    /// never succeed re-issued unchanged.
+    pub hard_reject: bool,
+    /// `true` for the unknown-tool recovery sentinel — a correctable miss that
+    /// must not feed the generic any-failure backstop (it still feeds the
+    /// identical-repeat counter, so re-issuing the *same* unavailable tool
+    /// halts).
+    pub recoverable_miss: bool,
+}
+
+/// The ladder's verdict for one recorded attempt.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum NoProgress {
+    /// Progress was made, or not enough repetition yet — carry on.
+    Continue,
+    /// Same-strategy repetition detected below the retry cap: feed this
+    /// structured "no progress since step X" corrective back into the loop so
+    /// the model picks a *different* next action.
+    Nudge(String),
+    /// Same-strategy retries exhausted (or the any-failure backstop tripped):
+    /// halt with this root-cause summary.
+    Halt(String),
+}
+
+#[derive(Default)]
+struct LadderState {
+    /// Signature of the previous failing call (tool + args + first error line).
+    last_sig: Option<String>,
+    /// Consecutive repeats of `last_sig`.
+    same_count: usize,
+    /// Consecutive failures of any kind (reset by any success).
+    consecutive: usize,
+    /// Signature we have already nudged on, so a nudge fires at most once per
+    /// distinct failing `(tool, args, error)` before escalating to a halt.
+    nudged_sig: Option<String>,
+    /// `true` once the varied-failure nudge fired for the current streak.
+    nudged_streak: bool,
+}
+
+/// Tracks recent tool outcomes and drives the no-progress escalation ladder.
+///
+/// Cheap to construct and interior-mutable, so a middleware can hold one behind
+/// a shared reference for the whole turn. `identical_halt_threshold` is the
+/// same-strategy retry cap; it is clamped so a nudge always precedes a halt.
+pub struct NoProgressTracker {
+    identical_halt_threshold: usize,
+    state: Mutex<LadderState>,
+}
+
+impl NoProgressTracker {
+    /// Build a tracker whose identical-repeat halt threshold is
+    /// `identical_halt_threshold`, clamped up so it always sits above the nudge
+    /// threshold (a single failure is never a loop, and the nudge must land
+    /// before the halt).
+    pub fn new(identical_halt_threshold: usize) -> Self {
+        Self {
+            identical_halt_threshold: identical_halt_threshold.max(IDENTICAL_NUDGE_THRESHOLD + 1),
+            state: Mutex::new(LadderState::default()),
+        }
+    }
+
+    /// Clear every counter. Called after a halt so a resumed run does not
+    /// immediately re-trip on the same latched state.
+    pub fn reset(&self) {
+        *self.state.lock().unwrap() = LadderState::default();
+    }
+
+    /// Record one tool outcome observed at loop `step` (the current model-call
+    /// count, used only for the "no progress since step X" wording) and return
+    /// the ladder's verdict. On a [`NoProgress::Halt`] the internal state is
+    /// reset for the caller.
+    pub fn record(&self, step: usize, attempt: &ToolAttempt) -> NoProgress {
+        let mut state = self.state.lock().unwrap();
+
+        let Some(err) = attempt.error else {
+            // Success → progress was made; clear every counter.
+            *state = LadderState::default();
+            return NoProgress::Continue;
+        };
+
+        // Signature: tool name + argument fingerprint + first error line (the
+        // deterministic parts; a huge payload tail must not dominate the
+        // identical-repeat comparison).
+        let err_line = err.lines().next().unwrap_or(err);
+        let sig = format!(
+            "{}\u{1f}{}\u{1f}{err_line}",
+            attempt.tool, attempt.arg_fingerprint
+        );
+
+        // The unknown-tool recovery is correctable feedback the model already
+        // received, so it must NOT feed the generic any-failure backstop — else a
+        // turn that recovers from one bad tool name and then legitimately
+        // exhausts its budget would trip the backstop instead of hitting the cap.
+        // It still feeds the identical-repeat counter below.
+        if !attempt.recoverable_miss {
+            state.consecutive += 1;
+        }
+
+        let same_count = match &state.last_sig {
+            Some(prev) if *prev == sig => {
+                state.same_count += 1;
+                state.same_count
+            }
+            _ => {
+                state.last_sig = Some(sig.clone());
+                state.same_count = 1;
+                // A fresh signature is eligible for its own nudge again.
+                state.nudged_sig = None;
+                1
+            }
+        };
+
+        // ── Halt: same-strategy retries exhausted ───────────────────────────
+        // A hard policy rejection can never succeed re-issued unchanged, so it
+        // trips fastest.
+        if attempt.hard_reject && same_count >= HARD_REJECT_HALT_THRESHOLD {
+            let summary = format!(
+                "Stopping: the `{}` call is blocked by the security policy and was re-issued with \
+                 identical arguments — it can never succeed this way. Reason:\n{}\n\nDo not repeat \
+                 this call; use an allowed alternative or report that it can't be done here.",
+                attempt.tool,
+                truncate_for_halt(err),
+            );
+            *state = LadderState::default();
+            return NoProgress::Halt(summary);
+        }
+        if same_count >= self.identical_halt_threshold {
+            let summary = format!(
+                "Stopping: the `{}` call was retried {same_count} times with identical arguments \
+                 and kept failing — repeating it will not help. Last error:\n{}\n\nThis looks \
+                 unrecoverable in the current environment. Report this back instead of retrying.",
+                attempt.tool,
+                truncate_for_halt(err),
+            );
+            *state = LadderState::default();
+            return NoProgress::Halt(summary);
+        }
+        if state.consecutive >= NO_PROGRESS_HALT_THRESHOLD {
+            let summary = format!(
+                "Stopping: {} tool calls in a row failed with no progress. Last error (from \
+                 `{}`):\n{}\n\nDifferent commands are all failing — the goal looks unreachable in \
+                 this environment. Report this back instead of retrying.",
+                state.consecutive,
+                attempt.tool,
+                truncate_for_halt(err),
+            );
+            *state = LadderState::default();
+            return NoProgress::Halt(summary);
+        }
+
+        // ── Nudge: cap retries *before* forcing an alternative ──────────────
+        // Same tool + same args + same error just repeated: give the model one
+        // corrective chance to change strategy before the halt threshold.
+        if same_count == IDENTICAL_NUDGE_THRESHOLD && state.nudged_sig.as_deref() != Some(&sig) {
+            state.nudged_sig = Some(sig);
+            return NoProgress::Nudge(identical_nudge(step, attempt.tool, same_count, err));
+        }
+        // Varied failures piling up with no success: step back before the
+        // any-failure backstop halts.
+        if !attempt.recoverable_miss
+            && state.consecutive == NO_PROGRESS_NUDGE_THRESHOLD
+            && !state.nudged_streak
+        {
+            state.nudged_streak = true;
+            return NoProgress::Nudge(varied_nudge(step, attempt.tool, state.consecutive, err));
+        }
+
+        NoProgress::Continue
+    }
+}
+
+/// The structured "no progress since step X" corrective for an identical
+/// repeated failure — the core #4089 case (same tool, same args, same error).
+fn identical_nudge(step: usize, tool: &str, count: usize, err: &str) -> String {
+    format!(
+        "[no progress since step {step}] The `{tool}` call has now failed {count} times with the \
+         same arguments and the same error — you are retrying an identical action that cannot \
+         succeed as-is. Do NOT repeat it. Change strategy on your next step: use a different tool \
+         or different arguments (for a missing path, enumerate the directory first; for a failing \
+         query, correct or broaden it), or report back that it can't be done here. Last error:\n{}",
+        truncate_for_halt(err),
+    )
+}
+
+/// The structured "no progress since step X" corrective for a run of varied
+/// failures — different commands all failing without progress.
+fn varied_nudge(step: usize, tool: &str, count: usize, err: &str) -> String {
+    format!(
+        "[no progress since step {step}] {count} tool calls in a row have failed without making \
+         progress. Stop cycling through variations of the same approach — step back and try a \
+         different strategy (enumerate/inspect before acting, pick a different tool, or narrow the \
+         goal). Last error (from `{tool}`):\n{}",
+        truncate_for_halt(err),
+    )
+}
+
+/// Trim a tool error for inclusion in a nudge/halt summary (keep it bounded but
+/// retain the deterministic leading detail the model/user needs).
+pub(super) fn truncate_for_halt(text: &str) -> String {
+    const MAX: usize = 600;
+    crate::openhuman::util::truncate_with_ellipsis(text, MAX)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn fail<'a>(tool: &'a str, fp: &'a str, err: &'a str) -> ToolAttempt<'a> {
+        ToolAttempt {
+            tool,
+            arg_fingerprint: fp,
+            error: Some(err),
+            hard_reject: false,
+            recoverable_miss: false,
+        }
+    }
+
+    fn ok<'a>(tool: &'a str, fp: &'a str) -> ToolAttempt<'a> {
+        ToolAttempt {
+            tool,
+            arg_fingerprint: fp,
+            error: None,
+            hard_reject: false,
+            recoverable_miss: false,
+        }
+    }
+
+    #[test]
+    fn identical_failure_nudges_then_halts() {
+        let t = NoProgressTracker::new(DEFAULT_IDENTICAL_HALT_THRESHOLD);
+        // First identical failure: not enough repetition yet.
+        assert_eq!(t.record(1, &fail("read_file", "a", "not found")), NoProgress::Continue);
+        // Second: nudge the model to change strategy before the retry cap.
+        match t.record(2, &fail("read_file", "a", "not found")) {
+            NoProgress::Nudge(msg) => {
+                assert!(msg.contains("no progress since step 2"));
+                assert!(msg.contains("read_file"));
+                assert!(msg.contains("Change strategy"));
+            }
+            other => panic!("expected a nudge on the second identical failure, got {other:?}"),
+        }
+        // Third: same-strategy retries exhausted → halt.
+        match t.record(3, &fail("read_file", "a", "not found")) {
+            NoProgress::Halt(msg) => assert!(msg.contains("retried 3 times")),
+            other => panic!("expected a halt on the third identical failure, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn a_success_resets_the_ladder() {
+        let t = NoProgressTracker::new(DEFAULT_IDENTICAL_HALT_THRESHOLD);
+        let _ = t.record(1, &fail("t", "a", "boom"));
+        let _ = t.record(2, &fail("t", "a", "boom")); // nudge
+        assert_eq!(t.record(3, &ok("t", "a")), NoProgress::Continue);
+        // After the reset, two more identical failures only re-nudge — no halt.
+        assert_eq!(t.record(4, &fail("t", "a", "boom")), NoProgress::Continue);
+        assert!(matches!(
+            t.record(5, &fail("t", "a", "boom")),
+            NoProgress::Nudge(_)
+        ));
+    }
+
+    #[test]
+    fn changed_arguments_clear_the_identical_streak() {
+        let t = NoProgressTracker::new(DEFAULT_IDENTICAL_HALT_THRESHOLD);
+        let _ = t.record(1, &fail("read_file", "a", "not found"));
+        // The model heeded the nudge and changed args: a new signature, so the
+        // identical-repeat counter starts over and never reaches the halt.
+        assert_eq!(t.record(2, &fail("read_file", "b", "not found")), NoProgress::Continue);
+        assert_eq!(t.record(3, &fail("list_dir", "c", "denied")), NoProgress::Continue);
+    }
+
+    #[test]
+    fn a_hard_rejection_halts_on_the_second_identical_repeat() {
+        let t = NoProgressTracker::new(DEFAULT_IDENTICAL_HALT_THRESHOLD);
+        let mut a = fail("send_email", "a", "[policy-blocked] denied");
+        a.hard_reject = true;
+        assert_eq!(t.record(1, &a), NoProgress::Continue);
+        let mut b = fail("send_email", "a", "[policy-blocked] denied");
+        b.hard_reject = true;
+        assert!(matches!(t.record(2, &b), NoProgress::Halt(msg) if msg.contains("security policy")));
+    }
+
+    #[test]
+    fn varied_failures_nudge_then_hit_the_backstop() {
+        let t = NoProgressTracker::new(DEFAULT_IDENTICAL_HALT_THRESHOLD);
+        // Distinct signatures each time: the identical counter never climbs, but
+        // the any-failure streak does.
+        for i in 1..=3 {
+            let (fp, err) = (format!("fp{i}"), format!("err{i}"));
+            assert_eq!(t.record(i, &fail("t", &fp, &err)), NoProgress::Continue);
+        }
+        // Fourth consecutive varied failure → nudge to step back.
+        assert!(matches!(t.record(4, &fail("t", "fp4", "err4")), NoProgress::Nudge(_)));
+        assert_eq!(t.record(5, &fail("t", "fp5", "err5")), NoProgress::Continue);
+        // Sixth → the no-progress backstop halts.
+        assert!(matches!(
+            t.record(6, &fail("t", "fp6", "err6")),
+            NoProgress::Halt(msg) if msg.contains("in a row failed")
+        ));
+    }
+
+    #[test]
+    fn unknown_tool_recovery_does_not_feed_the_backstop() {
+        let t = NoProgressTracker::new(DEFAULT_IDENTICAL_HALT_THRESHOLD);
+        // Six correctable misses with *distinct* signatures must not trip the
+        // any-failure backstop (they don't count toward `consecutive`).
+        for i in 0..6 {
+            let (fp, err) = (format!("fp{i}"), format!("unknown tool foo{i}"));
+            let mut a = fail("__unknown_tool__", &fp, &err);
+            a.recoverable_miss = true;
+            assert_eq!(t.record(i, &a), NoProgress::Continue);
+        }
+    }
+
+    #[test]
+    fn identical_halt_threshold_is_clamped_above_the_nudge() {
+        // A caller asking for a halt threshold of 1 or 2 still gets a nudge
+        // before the halt (the nudge lands at 2, so the halt must be >= 3).
+        let t = NoProgressTracker::new(1);
+        assert_eq!(t.record(1, &fail("t", "a", "boom")), NoProgress::Continue);
+        assert!(matches!(t.record(2, &fail("t", "a", "boom")), NoProgress::Nudge(_)));
+        assert!(matches!(t.record(3, &fail("t", "a", "boom")), NoProgress::Halt(_)));
+    }
+}

From 3f3cdf4f1bb5741eaeaef5b82513265e3309b6bd Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Wed, 1 Jul 2026 23:49:02 +0530
Subject: [PATCH 2/3] style: rustfmt the #4089 no-progress tests

---
 src/openhuman/tinyagents/middleware.rs  | 10 ++++++--
 src/openhuman/tinyagents/no_progress.rs | 34 ++++++++++++++++++++-----
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/src/openhuman/tinyagents/middleware.rs b/src/openhuman/tinyagents/middleware.rs
index 6a9fc402f1..eb0d3a6987 100644
--- a/src/openhuman/tinyagents/middleware.rs
+++ b/src/openhuman/tinyagents/middleware.rs
@@ -1540,7 +1540,9 @@ mod tests {
     }
 
     /// Kinds of the steering commands currently queued on `handle`, drained.
-    fn drained_kinds(handle: &SteeringHandle) -> Vec<tinyagents::harness::steering::SteeringCommandKind> {
+    fn drained_kinds(
+        handle: &SteeringHandle,
+    ) -> Vec<tinyagents::harness::steering::SteeringCommandKind> {
         handle.drain().iter().map(|c| c.kind()).collect()
     }
 
@@ -1556,7 +1558,11 @@ mod tests {
         // First identical failure: nothing yet.
         let mut r = failing_result("flaky", "boom");
         mw.after_tool(&mut ctx(), &(), &mut r).await.unwrap();
-        assert_eq!(drained_kinds(&handle), Vec::new(), "no signal on the first failure");
+        assert_eq!(
+            drained_kinds(&handle),
+            Vec::new(),
+            "no signal on the first failure"
+        );
         // Second: a "no progress" nudge feeds back into the loop — not a pause.
         let mut r = failing_result("flaky", "boom");
         mw.after_tool(&mut ctx(), &(), &mut r).await.unwrap();
diff --git a/src/openhuman/tinyagents/no_progress.rs b/src/openhuman/tinyagents/no_progress.rs
index 6b50a19088..5bfce4d5cc 100644
--- a/src/openhuman/tinyagents/no_progress.rs
+++ b/src/openhuman/tinyagents/no_progress.rs
@@ -285,7 +285,10 @@ mod tests {
     fn identical_failure_nudges_then_halts() {
         let t = NoProgressTracker::new(DEFAULT_IDENTICAL_HALT_THRESHOLD);
         // First identical failure: not enough repetition yet.
-        assert_eq!(t.record(1, &fail("read_file", "a", "not found")), NoProgress::Continue);
+        assert_eq!(
+            t.record(1, &fail("read_file", "a", "not found")),
+            NoProgress::Continue
+        );
         // Second: nudge the model to change strategy before the retry cap.
         match t.record(2, &fail("read_file", "a", "not found")) {
             NoProgress::Nudge(msg) => {
@@ -322,8 +325,14 @@ mod tests {
         let _ = t.record(1, &fail("read_file", "a", "not found"));
         // The model heeded the nudge and changed args: a new signature, so the
         // identical-repeat counter starts over and never reaches the halt.
-        assert_eq!(t.record(2, &fail("read_file", "b", "not found")), NoProgress::Continue);
-        assert_eq!(t.record(3, &fail("list_dir", "c", "denied")), NoProgress::Continue);
+        assert_eq!(
+            t.record(2, &fail("read_file", "b", "not found")),
+            NoProgress::Continue
+        );
+        assert_eq!(
+            t.record(3, &fail("list_dir", "c", "denied")),
+            NoProgress::Continue
+        );
     }
 
     #[test]
@@ -334,7 +343,9 @@ mod tests {
         assert_eq!(t.record(1, &a), NoProgress::Continue);
         let mut b = fail("send_email", "a", "[policy-blocked] denied");
         b.hard_reject = true;
-        assert!(matches!(t.record(2, &b), NoProgress::Halt(msg) if msg.contains("security policy")));
+        assert!(
+            matches!(t.record(2, &b), NoProgress::Halt(msg) if msg.contains("security policy"))
+        );
     }
 
     #[test]
@@ -347,7 +358,10 @@ mod tests {
             assert_eq!(t.record(i, &fail("t", &fp, &err)), NoProgress::Continue);
         }
         // Fourth consecutive varied failure → nudge to step back.
-        assert!(matches!(t.record(4, &fail("t", "fp4", "err4")), NoProgress::Nudge(_)));
+        assert!(matches!(
+            t.record(4, &fail("t", "fp4", "err4")),
+            NoProgress::Nudge(_)
+        ));
         assert_eq!(t.record(5, &fail("t", "fp5", "err5")), NoProgress::Continue);
         // Sixth → the no-progress backstop halts.
         assert!(matches!(
@@ -375,7 +389,13 @@ mod tests {
         // before the halt (the nudge lands at 2, so the halt must be >= 3).
         let t = NoProgressTracker::new(1);
         assert_eq!(t.record(1, &fail("t", "a", "boom")), NoProgress::Continue);
-        assert!(matches!(t.record(2, &fail("t", "a", "boom")), NoProgress::Nudge(_)));
-        assert!(matches!(t.record(3, &fail("t", "a", "boom")), NoProgress::Halt(_)));
+        assert!(matches!(
+            t.record(2, &fail("t", "a", "boom")),
+            NoProgress::Nudge(_)
+        ));
+        assert!(matches!(
+            t.record(3, &fail("t", "a", "boom")),
+            NoProgress::Halt(_)
+        ));
     }
 }

From 0acdc8f08684ae3b836b7eac147cc9bce30a1091 Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Thu, 2 Jul 2026 00:13:56 +0530
Subject: [PATCH 3/3] fix(harness): inject the no-progress nudge as a system
 message (#4089)

The nudge fed a User-role message into the loop. Turn-outcome extraction
slices the persisted suffix after the last user message
(convert::messages_since_last_user), so on a run that nudges and then
recovers instead of halting, every assistant/tool cycle before the nudge
was dropped from the persisted conversation and the post-turn tool records.

Inject the corrective as a system (scaffold) message instead: it still
reaches the model at the next steering checkpoint, but the original user
turn stays the persistence boundary so the full turn suffix is preserved.
Add a regression test asserting the nudge is a system message.
---
 src/openhuman/tinyagents/middleware.rs | 47 +++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/src/openhuman/tinyagents/middleware.rs b/src/openhuman/tinyagents/middleware.rs
index eb0d3a6987..f28f701ffa 100644
--- a/src/openhuman/tinyagents/middleware.rs
+++ b/src/openhuman/tinyagents/middleware.rs
@@ -1174,11 +1174,18 @@ impl Middleware<()> for RepeatedToolFailureMiddleware {
                     step,
                     "[tinyagents::mw] no progress — nudging the model to change strategy"
                 );
-                // Feed the corrective back into the loop as a user turn; the
-                // harness applies it at the next steering checkpoint (before the
-                // next model call), so the model sees it before it acts again.
+                // Feed the corrective back into the loop as a *system* (scaffold)
+                // message — not a user turn. The harness applies it at the next
+                // steering checkpoint (before the next model call), so the model
+                // sees it before it acts again. A system role keeps the original
+                // user message as the turn's persistence boundary: `outcome`
+                // extraction slices the turn suffix after the last *user* message
+                // (`convert::messages_since_last_user`), so injecting a user turn
+                // here would drop every assistant/tool cycle before the nudge from
+                // the persisted conversation + post-turn tool records on a run that
+                // nudges and then recovers instead of halting.
                 self.handle
-                    .send(SteeringCommand::InjectMessage(TaMessage::user(signal)));
+                    .send(SteeringCommand::InjectMessage(TaMessage::system(signal)));
             }
             super::no_progress::NoProgress::Halt(summary) => {
                 tracing::warn!(
@@ -1581,6 +1588,38 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn the_nudge_is_a_system_message_so_it_does_not_move_the_user_boundary() {
+        // The nudge must inject a *system* (scaffold) message, not a user turn:
+        // `convert::messages_since_last_user` slices the persisted turn suffix
+        // after the last user message, so a user-role nudge would drop every
+        // assistant/tool cycle before it on a run that nudges then recovers.
+        let handle = SteeringHandle::allow_all();
+        let mw = RepeatedToolFailureMiddleware::new(
+            handle.clone(),
+            3,
+            std::sync::Arc::new(std::sync::Mutex::new(None)),
+        );
+        for _ in 0..2 {
+            let mut r = failing_result("read_file", "not found");
+            mw.after_tool(&mut ctx(), &(), &mut r).await.unwrap();
+        }
+        let cmds = handle.drain();
+        match cmds.as_slice() {
+            [SteeringCommand::InjectMessage(msg)] => {
+                assert!(
+                    matches!(msg, TaMessage::System(_)),
+                    "the nudge must be a system (scaffold) message, not a user turn"
+                );
+                assert!(
+                    msg.text().contains("no progress since step"),
+                    "the nudge should carry the structured no-progress corrective"
+                );
+            }
+            other => panic!("expected a single injected system nudge, got {other:?}"),
+        }
+    }
+
     #[tokio::test]
     async fn repeated_tool_failure_resets_on_a_success() {
         use tinyagents::harness::steering::SteeringCommandKind;